# Production modelling

In [111]:
# Libraries and parameters
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from imblearn.over_sampling import SMOTE

In [112]:
#Loading X_train, y_train, X_test, y_test from .csv creating in preprocessing

X_train = pd.read_csv('../preproc_data/X_train_preproc.csv', index_col=0)
y_train = pd.read_csv('../preproc_data/y_train.csv', index_col=0)["y_target"]

X_test = pd.read_csv('../preproc_data/X_test_preproc.csv', index_col=0)
y_test = pd.read_csv('../preproc_data/y_test.csv', index_col=0)["y_target"]

In [113]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [114]:
y_pred = model.predict(X_test)

accuracy_score(y_test, y_pred)

0.6821231392829687

In [115]:
recall_score(y_test, y_pred)

0.00010991426687183997

In [116]:
TEST = pd.DataFrame(y_pred)

In [117]:
TEST.value_counts()

0    28617
1        1
Name: count, dtype: int64

In [118]:
#Utilizing the SMOTE method to resample X_train, y_train to balance the data set\n",
X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train, y_train)

In [119]:
model2 = LogisticRegression()
model2.fit(X_train_resampled, y_train_resampled)

In [120]:
y2_pred = model2.predict(X_test)

In [121]:
accuracy_score(y_test, y2_pred)

0.5285135229575791

In [122]:
recall_score(y_test, y2_pred)

0.5535282479665861

In [123]:
precision_score(y_test, y2_pred)

0.3481025782816064

In [124]:
clf = GradientBoostingClassifier()

In [125]:
clf.fit(X_train_resampled, y_train_resampled)

In [126]:
y_pred = clf.predict(X_test)

In [127]:
print("Accuracy =", accuracy_score(y_test, y_pred))
print("Recall =", recall_score(y_test, y_pred))
print("Precision =", precision_score(y_test, y_pred))

Accuracy = 0.681354392340485
Recall = 0.016597054297647833
Precision = 0.4674922600619195


In [128]:
recall_score(y_test, y_pred)
precision_score(y_test, y_pred)

0.4674922600619195

In [129]:
precision_score(y_test, y_pred)

0.4674922600619195

In [130]:
rdc = RandomForestClassifier()
rdc.fit(X_train_resampled, y_train_resampled)

In [131]:
y_pred = rdc.predict(X_test)

In [132]:
print("Accuracy =", accuracy_score(y_test, y_pred))
print("Recall =", recall_score(y_test, y_pred))
print("Precision =", precision_score(y_test, y_pred))

Accuracy = 0.6562303445384025
Recall = 0.12717080677071885
Precision = 0.3788474132285527


In [133]:
pd.DataFrame(y_pred).value_counts()

0    25564
1     3054
Name: count, dtype: int64

In [134]:
sgdc = SGDClassifier()
sgdc.fit(X_train_resampled, y_train_resampled)

In [135]:
y_pred = sgdc.predict(X_test)

In [136]:
print("Accuracy =", accuracy_score(y_test, y_pred))
print("Recall =", recall_score(y_test, y_pred))
print("Precision =", precision_score(y_test, y_pred))

Accuracy = 0.5543364316164652
Recall = 0.4619696636623434
Precision = 0.34844967667053556
