In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

### <b>Read the data</b>

In [15]:
# Load the processed dataset into `df` and preview its first five rows.
# NOTE(review): "canomical" may be a misspelling of "canonical"; the path is
# left untouched because it has to match the file name on disk — TODO confirm.
df = pd.read_csv(filepath_or_buffer='../../data/processed/canomical_data.csv')
df.head(n=5)

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_current_price,p_discount_rate
0,0.046053,0.518395,659.0,702,528000,269000,49
1,0.552632,0.662207,974.0,12844,799000,429000,46
2,0.552632,0.64214,1372.0,938,209000,209000,0
3,0.046053,0.518395,593.0,10359,473000,235000,50
4,0.098684,0.48495,529.0,2338,106000,89000,16


### <b>Train test split</b>

In [16]:
# Target: the `p_sold_quantity` column. `testLabels` is a second name bound to
# the same Series as `y`.
y = testLabels = df['p_sold_quantity']
# Features: every remaining column.
X = df.drop(columns='p_sold_quantity')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=15,
)

In [17]:
# Count how often each distinct target value occurs in the training split.
y_train.value_counts()

0      65
1      28
2      19
4      12
3      12
       ..
299     1
718     1
496     1
289     1
208     1
Name: p_sold_quantity, Length: 727, dtype: int64

In [18]:
# Count how often each distinct target value occurs in the full dataset.
y.value_counts()

0       85
1       35
2       21
4       18
3       17
        ..
1141     1
847      1
1798     1
980      1
706      1
Name: p_sold_quantity, Length: 833, dtype: int64

In [19]:
# NOTE(review): ad-hoc ratio of two hard-coded numbers; where 104 and 89 come
# from is not shown in this notebook — TODO derive them from the data or remove.
104/89

1.1685393258426966

In [20]:
# Count how often each distinct target value occurs in the test split.
y_test.value_counts()

0       20
5        9
1        7
4        6
3        5
        ..
1154     1
25       1
1005     1
690      1
177      1
Name: p_sold_quantity, Length: 259, dtype: int64

In [21]:
# (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

((1423, 6), (356, 6))

In [22]:
# Preview the first ten rows of the training features.
X_train.head(10)

Unnamed: 0,p_categories,p_brand,p_day_created,p_original_price,p_current_price,p_discount_rate
566,0.993421,0.64214,74.0,299000,299000,0
772,0.118421,0.531773,195.0,748000,439000,41
1668,0.789474,0.64214,42.0,12000,12000,0
1497,0.684211,0.602007,212.0,58000,58000,0
449,0.552632,0.244147,1225.0,235000,144000,39
1660,0.552632,0.578595,708.0,95000,95000,0
184,0.552632,0.608696,758.0,1000000,455000,55
1652,0.638158,0.451505,747.0,500000,265000,47
1556,0.046053,0.003344,310.0,150000,150000,0
910,0.835526,0.217391,421.0,1260000,1045000,17


In [23]:
# Number of feature columns in the training frame.
X_train.shape[1]

6

### <b>Use the model `LogisticRegression` to predict the target variable</b>

In [24]:
def confusion_matrix_plot(y_test, y_pred):
    """Show the confusion matrix of `y_test` vs. `y_pred` as an annotated heatmap.

    Parameters
    ----------
    y_test
        True labels, passed as the first argument to `confusion_matrix`.
    y_pred
        Predicted labels, passed as the second argument to `confusion_matrix`.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # Build the matrix before touching matplotlib so a failure here leaves no
    # empty figure behind.
    counts = confusion_matrix(y_test, y_pred)

    _, ax = plt.subplots(figsize=(10, 7))
    # fmt='d' renders each cell annotation as an integer.
    sns.heatmap(counts, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Truth')
    plt.show()

In [25]:
def log_reg(X_train, X_test, y_train, y_test, weights=-1):
    """Fit a `LogisticRegression`, print its test metrics, and return the model.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to `model.fit`.
    X_test, y_test
        Held-out features and labels used for the printed accuracy and
        classification report.
    weights : optional
        Class weighting for the model:

        * ``-1`` (default) or ``None`` -- plain ``LogisticRegression()``.
        * dict or str -- forwarded unchanged as ``class_weight``.
        * any other indexable (list, tuple, NumPy array) -- its first two
          items become the weights of classes ``0`` and ``1``.

    Returns
    -------
    The fitted model.
    """
    # Only compare scalars against the -1 sentinel: `array == -1` produces an
    # array whose truth value is ambiguous and would raise inside `if`.
    use_default = weights is None or (np.isscalar(weights) and weights == -1)

    if use_default:
        model = LogisticRegression()
    elif isinstance(weights, (dict, str)):
        model = LogisticRegression(class_weight=weights)
    else:
        model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})

    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy)
    # The confusion-matrix plot is not drawn here; call
    # confusion_matrix_plot(y_test, y_pred) separately if it is needed.
    print(classification_report(y_test, y_pred))
    return model

In [26]:
# Fit the default (unweighted) model on the split above; `log_reg` prints the
# test accuracy and classification report, and the trailing `model` echoes the
# fitted estimator.
# NOTE(review): the value_counts output for `y` above reports 833 distinct
# values, so every distinct sold quantity is treated as its own class here,
# which likely explains the very low accuracy — TODO confirm whether the target
# should be binned/binarized or modelled as a regression instead.
model = log_reg(X_train, X_test, y_train, y_test) 
model

Accuracy:  0.056179775280898875
              precision    recall  f1-score   support

           0       0.07      1.00      0.13        20
           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         6
           5       0.00      0.00      0.00         9
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         3
           9       0.00      0.00      0.00         2
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         1
          17       0.00      0.00      0.00      