In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

import warnings 
# NOTE(review): this suppresses every warning for the rest of the session, so
# deprecation notices raised by the libraries used in later cells are never shown.
warnings.filterwarnings('ignore')

%matplotlib inline

### <b>Read the data</b>

In [2]:
# Load the processed CSV into a DataFrame and preview its first rows.
# NOTE(review): "canomical" in the file name looks like a misspelling of
# "canonical" — confirm against the file actually on disk before renaming.
df = pd.read_csv('../../data/processed/canomical_data.csv')
df.head()

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_discount_rate
0,7,155,659.0,702,528000,49
1,84,198,974.0,12844,799000,46
2,84,192,1372.0,938,209000,0
3,7,155,593.0,10359,473000,50
4,15,145,529.0,2338,106000,16


### <b>Train test split</b>

In [3]:
# Target is the sold quantity; every other column is used as a predictor.
# NOTE: testLabels is an alias of the full target series, not only the test part.
y = testLabels = df['p_sold_quantity']
X = df.drop(columns='p_sold_quantity')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

In [4]:
# Frequency of each sold-quantity value within the training split.
y_train.value_counts()

0        65
1        28
2        19
3        12
4        12
         ..
2535      1
482       1
23009     1
480       1
2033      1
Name: p_sold_quantity, Length: 727, dtype: int64

In [5]:
# Frequency of each sold-quantity value across the full target series.
y.value_counts()

0       85
1       35
2       21
4       18
3       17
        ..
539      1
538      1
2585     1
2580     1
6134     1
Name: p_sold_quantity, Length: 833, dtype: int64

In [6]:
# NOTE(review): ad-hoc ratio; the notebook does not say what 104 and 89 stand
# for — TODO document the source of these numbers or remove the cell.
104/89

1.1685393258426966

In [7]:
# Frequency of each sold-quantity value within the held-out test split.
y_test.value_counts()

0       20
5        9
1        7
4        6
3        5
        ..
1700     1
21       1
681      1
1041     1
596      1
Name: p_sold_quantity, Length: 259, dtype: int64

In [8]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((1423, 5), (356, 5))

In [9]:
# Preview the first ten rows of the training features.
X_train.head(10)

Unnamed: 0,p_categories,p_brand,p_day_created,p_original_price,p_discount_rate
566,151,192,74.0,299000,0
772,18,159,195.0,748000,41
1668,120,192,42.0,12000,0
1497,104,180,212.0,58000,0
449,84,73,1225.0,235000,39
1660,84,173,708.0,95000,0
184,84,182,758.0,1000000,55
1652,97,135,747.0,500000,47
1556,7,1,310.0,150000,0
910,127,65,421.0,1260000,17


In [10]:
# Number of predictor columns in the training features.
X_train.shape[1]

5

### <b>Use the model `LogisticRegression` to predict the target variable</b>

In [11]:
def confusion_matrix_plot(y_test, y_pred):
    """Draw the confusion matrix of a set of predictions as an annotated heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Sorted union of the labels seen on either side. Passing it explicitly pins
    # the row/column order so the very same values can serve as tick labels.
    labels = np.union1d(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(10, 7))
    # Without explicit tick labels the heatmap is annotated with positional
    # indices 0..k-1, which is misleading whenever the class labels differ from them.
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        xticklabels=labels,
        yticklabels=labels,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Truth')
    plt.show()

In [12]:
def log_reg(X_train, X_test, y_train, y_test, weights=-1, plot_cm=False):
    """Fit a logistic regression, print its test metrics and return the model.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching the corresponding feature matrices.
    weights : sequence of two numbers, None or -1, default -1
        ``weights[0]`` and ``weights[1]`` become the class weights of the
        labels 0 and 1. ``-1`` (the legacy sentinel) or ``None`` leaves the
        classes unweighted.
    plot_cm : bool, default False
        When true, also draw the confusion matrix with ``confusion_matrix_plot``.

    Returns
    -------
    LogisticRegression
        The fitted model.
    """
    # Compare against the sentinel only for scalars: ``array == -1`` is an
    # element-wise array whose truth value is ambiguous and raises ValueError.
    unweighted = weights is None or (np.isscalar(weights) and weights == -1)
    if unweighted:
        model = LogisticRegression()
    else:
        # The weighted path only names the labels 0 and 1.
        model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
    model.fit(X_train, y_train)

    # Predict once and reuse the result for every metric instead of letting
    # ``model.score`` run a second prediction pass internally.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ', accuracy)
    if plot_cm:
        confusion_matrix_plot(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    return model

In [13]:
# Fit an ordinary least-squares model on the training split and display it.
linear_reg = LinearRegression().fit(X_train, y_train)
linear_reg

LinearRegression()

In [14]:
from sklearn.feature_selection import RFE

# Recursive feature elimination driven by the linear model.
# RFE cannot keep more features than exist, so cap the request at the number
# of columns in X_train.
# NOTE(review): when the cap equals the column count nothing is eliminated and
# every feature ends up with rank 1 — lower the 10 to obtain an actual ranking.
n_features_to_select = min(10, X_train.shape[1])

# n_features_to_select is passed by keyword: recent scikit-learn releases no
# longer accept it positionally.
rfe = RFE(estimator=linear_reg, n_features_to_select=n_features_to_select)
rfe = rfe.fit(X_train, y_train)

In [15]:
# Pair every feature name with its RFE selection flag and its rank.
[
    (feature, selected, rank)
    for feature, selected, rank in zip(X_train.columns, rfe.support_, rfe.ranking_)
]

[('p_categories', True, 1),
 ('p_brand', True, 1),
 ('p_day_created', True, 1),
 ('p_original_price', True, 1),
 ('p_discount_rate', True, 1)]

In [16]:
# model = log_reg(X_train, X_test, y_train, y_test) 
# model