In [270]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

### <b>Read the data</b>

In [271]:
df = pd.read_csv('../../data/processed/canomical_data.csv')
df.head() 

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_discount_rate
0,7,155,659.0,702,528000,49
1,84,198,974.0,12844,799000,46
2,84,192,1372.0,938,209000,0
3,7,155,593.0,10359,473000,50
4,15,145,529.0,2338,106000,16


### <b>Create columns trending</b>
1. The products have mean 

In [272]:
col_trending = df['p_sold_quantity'] / df['p_day_created']
df['trending'] = col_trending.apply(lambda x: 1 if x > col_trending.mean() else 0)


### <b>Train test split</b>

In [273]:
X = df.drop('trending', axis='columns')
y = testLabels = df.trending

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

In [274]:
y_train.value_counts()

0    1087
1     336
Name: trending, dtype: int64

In [275]:
y.value_counts()

0    1377
1     402
Name: trending, dtype: int64

In [276]:
104/89

1.1685393258426966

In [277]:
y_test.value_counts()

0    290
1     66
Name: trending, dtype: int64

In [278]:
X_train.shape, X_test.shape

((1423, 6), (356, 6))

In [279]:
X_train[:10]

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_discount_rate
566,151,192,74.0,57,299000,0
772,18,159,195.0,160,748000,41
1668,120,192,42.0,104,12000,0
1497,104,180,212.0,106,58000,0
449,84,73,1225.0,1798,235000,39
1660,84,173,708.0,330,95000,0
184,84,182,758.0,1497,1000000,55
1652,97,135,747.0,610,500000,47
1556,7,1,310.0,390,150000,0
910,127,65,421.0,435,1260000,17


In [280]:
len(X_train.columns)

6

### <b>Use the model `LogisticRegression` to predict the target variable</b>

In [281]:
def confusion_matrix_plot(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d')
    plt.xlabel('Predicted')
    plt.ylabel('Truth')
    plt.show()

In [282]:
def log_reg(X_train, X_test, y_train, y_test, weights=-1):
    if weights == -1:
        model = LogisticRegression()
    else:
        model = LogisticRegression(class_weight={0: weights[0], 1: weights[1]})
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    print('Accuracy: ', accuracy)
    #  confusion_matrix_plot(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    return model

In [283]:
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [284]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
lr_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

print('LogisticRegression: %.3f (%.3f)' % (np.mean(lr_scores), np.std(lr_scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression: 0.998 (0.004)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [285]:
from sklearn.model_selection import RandomizedSearchCV

clf = LogisticRegression()

sgd = make_pipeline(StandardScaler(), LogisticRegression(penalty='l2'))

fit_intercept = [True, False]
penalty = ['l2', 'l1', 'elasticnet']
param = dict(penalty=penalty, fit_intercept = fit_intercept)
rnd_search = RandomizedSearchCV(clf, param_distributions=param, n_iter=10, cv=5, scoring='accuracy', random_state=15)
rnd_search.fit(X_train, y_train)
rnd_search.best_params_


{'penalty': 'l2', 'fit_intercept': False}

In [286]:
from sklearn.metrics import mean_squared_error
clf = LogisticRegression(penalty='l2', fit_intercept=False)
clf.fit(X_train, y_train)
y_pred = model.predict(X_test)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
lr_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [287]:
print('LogisticRegression: %.3f (%.3f)' % (np.mean(lr_scores), np.std(lr_scores)))
print(accuracy_score(y_test, y_pred))


LogisticRegression: 0.997 (0.004)
0.9943820224719101
