In [41]:
# import libraries
import joblib
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Restore the five objects stored together in the preprocessing artifact:
# the train/test feature sets, the train/test targets and the preprocessor.
# NOTE: joblib.load unpickles the file, so only load artifacts you trust.
preprocessed_path = "../models/preprocessed.pkl"
(
    X_train,
    X_test,
    y_train,
    y_test,
    preprocessor,
) = joblib.load(preprocessed_path)


In [43]:
# Logistic regression: run the loaded preprocessor, then the classifier.
logreg_steps = [
    ("preprocessor", preprocessor),
    ("logreg", LogisticRegression(max_iter=1000)),
]
log_reg_model = Pipeline(logreg_steps)

# Fit on the training split (fit returns the pipeline itself), then predict
# labels for the held-out test split.
y_pred_logreg = log_reg_model.fit(X_train, y_train).predict(X_test)

In [44]:
# random forest model
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting it does not refit the instance held by the other model pipelines.
# The classifier step is named "rf" so named_steps / parameter keys
# (e.g. rf__n_estimators) refer to the estimator that is actually in the step.
rf_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("rf", RandomForestClassifier(n_estimators=100, random_state=42))
])

# train the model with data
rf_model.fit(X_train, y_train)

# predict the value with test data
y_pred_rf = rf_model.predict(X_test)

In [45]:
# decision tree model
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting it does not refit the instance held by the other model pipelines.
# The classifier step is named "dt" so named_steps / parameter keys
# (e.g. dt__max_depth) refer to the estimator that is actually in the step.
# random_state is fixed (same seed as the random forest) so the fitted tree,
# and the metrics computed from it, are reproducible from run to run.
dt_model = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("dt", DecisionTreeClassifier(random_state=42))
])

# train the model with data
dt_model.fit(X_train, y_train)

# predict the value with test data
y_pred_dt = dt_model.predict(X_test)

In [46]:
# Score the logistic regression predictions against the test labels.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# compute every metric first, then print them in a fixed order
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_report = classification_report(y_test, y_pred_logreg)
logreg_confusion = confusion_matrix(y_test, y_pred_logreg)

print("Logistic regression result: \n")
print("Accuracy: ", logreg_accuracy)
print(logreg_report)
print("Confusion Matrix: \n", logreg_confusion)


Logistic regression result: 

Accuracy:  0.8048261178140526
              precision    recall  f1-score   support

           0       0.85      0.90      0.87      1035
           1       0.66      0.55      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.74      1409
weighted avg       0.80      0.80      0.80      1409

Confusion Matrix: 
 [[927 108]
 [167 207]]


In [47]:
# Score the random forest predictions against the test labels.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# compute every metric first, then print them in a fixed order
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
rf_confusion = confusion_matrix(y_test, y_pred_rf)

print("Random Forest result: \n")
print("Accuracy: ", rf_accuracy)
print(rf_report)
print("Confusion Matrix: \n", rf_confusion)


Random Forest result: 

Accuracy:  0.7970191625266146
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      1035
           1       0.65      0.51      0.57       374

    accuracy                           0.80      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409

Confusion Matrix: 
 [[931 104]
 [182 192]]


In [48]:
# Score the decision tree predictions against the test labels.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# compute every metric first, then print them in a fixed order
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
dt_confusion = confusion_matrix(y_test, y_pred_dt)

print("Decision Tree  result: \n")
print("Accuracy: ", dt_accuracy)
print(dt_report)
print("Confusion Matrix: \n", dt_confusion)


Decision Tree  result: 

Accuracy:  0.7374024130589071
              precision    recall  f1-score   support

           0       0.82      0.83      0.82      1035
           1       0.51      0.49      0.50       374

    accuracy                           0.74      1409
   macro avg       0.66      0.66      0.66      1409
weighted avg       0.73      0.74      0.74      1409

Confusion Matrix: 
 [[856 179]
 [191 183]]


In [49]:
# Persist the fitted logistic regression pipeline (preprocessor + classifier)
# as the final model artifact.
final_model_path = "../models/final_model.pkl"
joblib.dump(log_reg_model, final_model_path)

print("Best model saved as final_model.pkl")

Best model saved as final_model.pkl
