In [16]:
# Installing dask[dataframe]
!pip install dask[dataframe]



In [32]:
# Imports
import pandas as pd
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

In [33]:
# Load the preprocessed train/test splits from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file should only
# ever come from a trusted source.
with open('train_test_splits.pkl', mode='rb') as splits_file:
    splits = pickle.load(splits_file)

# The pickle holds four objects, in this order.
X_train, X_test, y_train, y_test = splits

In [34]:
# Logistic regression with default settings, wrapped in a single-step
# pipeline whose only step is named 'classifier'.
model_pipeline_lr = Pipeline([('classifier', LogisticRegression())])

# fit() returns the fitted pipeline, so training and test-set prediction can
# be chained; y_pred is read by the evaluation cell that follows.
y_pred = model_pipeline_lr.fit(X_train, y_train).predict(X_test)

In [35]:
# Score the logistic regression predictions (y_pred) against the test labels.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)

# Print the metrics under a model header, followed by a separator line.
print("Logistic Regression Classifier:")
print("Accuracy:", lr_accuracy)
print("Classification Report:\n", lr_report)
print("Confusion Matrix:\n", lr_confusion)
print("=" * 60)

Logistic Regression Classifier:
Accuracy: 0.850733024691358
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.84      0.85     12771
           1       0.85      0.86      0.85     13149

    accuracy                           0.85     25920
   macro avg       0.85      0.85      0.85     25920
weighted avg       0.85      0.85      0.85     25920

Confusion Matrix:
 [[10785  1986]
 [ 1883 11266]]


In [36]:
# Random forest wrapped in a single-step pipeline.
# random_state is fixed because the forest bootstraps rows and samples
# candidate features at random; without a seed the reported metrics change on
# every re-run of the notebook.
model_pipeline_rf = Pipeline(
    steps=[('classifier', RandomForestClassifier(random_state=42))])

# Train on the training split
model_pipeline_rf.fit(X_train, y_train)

# Predict labels for the test split; y_pred is read by the evaluation cell
y_pred = model_pipeline_rf.predict(X_test)

In [37]:
# Report test-set metrics for the random forest predictions held in y_pred.
print("Random Forest Classifier:")

# Each heading is printed next to the metric computed on (y_test, y_pred).
for heading, metric in (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix)):
    print(heading, metric(y_test, y_pred))

print("=" * 60)

Random Forest Classifier:
Accuracy: 0.8997685185185185
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.90     12771
           1       0.89      0.92      0.90     13149

    accuracy                           0.90     25920
   macro avg       0.90      0.90      0.90     25920
weighted avg       0.90      0.90      0.90     25920

Confusion Matrix:
 [[11245  1526]
 [ 1072 12077]]


In [38]:
# Gradient boosting wrapped in a single-step pipeline.
# random_state seeds the individual trees and the per-split feature
# permutation, so repeated runs of the notebook give the same metrics.
model_pipeline_gb = Pipeline(
    steps=[('classifier', GradientBoostingClassifier(random_state=42))])

# Train on the training split
model_pipeline_gb.fit(X_train, y_train)

# Predict labels for the test split; y_pred is read by the evaluation cell
y_pred = model_pipeline_gb.predict(X_test)

In [39]:
# Displaying & evaluating model - GradientBoostingClassifier
# The header names the gradient boosting model; it previously repeated the
# random forest label, which mislabelled these results in the output.
print("Gradient Boosting Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("="*60)

Random Forest Classifier:
Accuracy: 0.8976851851851851
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.85      0.89     12771
           1       0.87      0.94      0.90     13149

    accuracy                           0.90     25920
   macro avg       0.90      0.90      0.90     25920
weighted avg       0.90      0.90      0.90     25920

Confusion Matrix:
 [[10899  1872]
 [  780 12369]]


In [40]:
# XGBoost with default hyperparameters, trained on the training split.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

# Label the test split, then compute every metric before printing.
xgb_predictions = xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_report = classification_report(y_test, xgb_predictions)
xgb_confusion = confusion_matrix(y_test, xgb_predictions)

# Print the metrics under a model header, followed by a separator line.
print("XGBoost Classifier:")
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", xgb_report)
print("Confusion Matrix:\n", xgb_confusion)
print("=" * 60)

XGBoost Classifier:
Accuracy: 0.8991898148148149
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.86      0.89     12771
           1       0.87      0.94      0.90     13149

    accuracy                           0.90     25920
   macro avg       0.90      0.90      0.90     25920
weighted avg       0.90      0.90      0.90     25920

Confusion Matrix:
 [[10965  1806]
 [  807 12342]]


In [41]:
# LightGBM with default hyperparameters, trained on the training split.
# Training happens before any print so LightGBM's own log lines stay first.
lgbm_model = LGBMClassifier()
lgbm_model.fit(X_train, y_train)

# Label the test split.
lgbm_predictions = lgbm_model.predict(X_test)

# Each heading is printed next to the metric computed on the predictions.
print("LightGBM Classifier:")
for heading, metric in (
        ("Accuracy:", accuracy_score),
        ("Classification Report:\n", classification_report),
        ("Confusion Matrix:\n", confusion_matrix)):
    print(heading, metric(y_test, lgbm_predictions))
print("=" * 60)

[LightGBM] [Info] Number of positive: 51650, number of negative: 52028
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002667 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 103678, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498177 -> initscore=-0.007292
[LightGBM] [Info] Start training from score -0.007292
LightGBM Classifier:
Accuracy: 0.8981095679012345
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.85      0.89     12771
           1       0.87      0.94      0.90     13149

    accuracy                           0.90     25920
   macro avg       0.90      0.90      0.90     25920
weighted avg       0.90      0.90      0.90     25920

Confusion Matrix:
 [[10904  1867]
 [  774 12375]]


In [42]:
# Support vector classifier with default settings; fit() returns the fitted
# estimator, so construction and training are written as one expression.
svc_model = SVC().fit(X_train, y_train)

# Label the test split.
svc_predictions = svc_model.predict(X_test)

# Compute the metrics first, then print them under a model header.
svc_accuracy = accuracy_score(y_test, svc_predictions)
svc_report = classification_report(y_test, svc_predictions)
svc_confusion = confusion_matrix(y_test, svc_predictions)

print("SVC Classifier:")
print("Accuracy:", svc_accuracy)
print("Classification Report:\n", svc_report)
print("Confusion Matrix:\n", svc_confusion)
print("=" * 60)

SVC Classifier:
Accuracy: 0.8933256172839507
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.86      0.89     12771
           1       0.88      0.92      0.90     13149

    accuracy                           0.89     25920
   macro avg       0.89      0.89      0.89     25920
weighted avg       0.89      0.89      0.89     25920

Confusion Matrix:
 [[11043  1728]
 [ 1037 12112]]


In [43]:
# Decision Tree Classifier
# random_state is fixed because scikit-learn permutes the candidate features
# at random at every split; without a seed, tied splits can resolve
# differently and the metrics drift between runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on the test set
dt_predictions = dt_model.predict(X_test)

# Evaluation: accuracy, per-class report and confusion matrix on the test split
print("Decision Tree Classifier:")
print("Accuracy:", accuracy_score(y_test, dt_predictions))
print("Classification Report:\n", classification_report(y_test, dt_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, dt_predictions))
print("="*60)

Decision Tree Classifier:
Accuracy: 0.8734953703703704
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.87     12771
           1       0.88      0.87      0.87     13149

    accuracy                           0.87     25920
   macro avg       0.87      0.87      0.87     25920
weighted avg       0.87      0.87      0.87     25920

Confusion Matrix:
 [[11220  1551]
 [ 1728 11421]]


In [44]:
# Extra Trees Classifier
# random_state is fixed because extra-trees draws its split thresholds and
# candidate features at random; without a seed every re-run of the notebook
# reports different metrics.
et_model = ExtraTreesClassifier(random_state=42)
et_model.fit(X_train, y_train)

# Predict on the test set
et_predictions = et_model.predict(X_test)

# Evaluation: accuracy, per-class report and confusion matrix on the test split
print("Extra Trees Classifier:")
print("Accuracy:", accuracy_score(y_test, et_predictions))
print("Classification Report:\n", classification_report(y_test, et_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, et_predictions))
print("="*60)

Extra Trees Classifier:
Accuracy: 0.9039737654320987
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.89      0.90     12771
           1       0.89      0.92      0.91     13149

    accuracy                           0.90     25920
   macro avg       0.90      0.90      0.90     25920
weighted avg       0.90      0.90      0.90     25920

Confusion Matrix:
 [[11335  1436]
 [ 1053 12096]]
