## Model training 

In [47]:
# model packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, roc_auc_score
from xgboost import XGBClassifier

In [48]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence three warning categories for the remainder of the session:
# FutureWarning, DeprecationWarning and scikit-learn's ConvergenceWarning.
# NOTE(review): ignoring ConvergenceWarning also hides solvers that stop before
# converging -- confirm that is intended for the models fitted below.
for _ignored_category in (FutureWarning, DeprecationWarning, ConvergenceWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

In [49]:
# Baseline: fit one random forest on the training split as-is and look at its
# performance on the test split in detail.

# Cast the column labels of both splits to str before fitting / predicting.
xtrain.columns = xtrain.columns.astype(str)
xtest.columns = xtest.columns.astype(str)

# Fixed random_state so the forest is reproducible across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xtrain, ytrain)

# Hard class predictions, plus the second predict_proba column as the score
# used for ROC AUC.
rfc_ypred = rfc.predict(xtest)
rfc_yprobs = rfc.predict_proba(xtest)[:, 1]

# Compute every test-set metric once, up front.
conf_matrix = confusion_matrix(ytest, rfc_ypred)
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)
roc_auc = roc_auc_score(ytest, rfc_yprobs)

# Headline accuracy followed by the per-class report.
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Confusion matrix (rows: true labels, columns: predicted labels).
print("Confusion Matrix:", conf_matrix, sep="\n")

# Scalar metrics, one per line.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("ROC AUC Score:", roc_auc),
):
    print(metric_label, metric_value)

Random Forest Classifier:
Accuracy: 0.9815436241610739

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3423
           1       0.94      0.61      0.74       153

    accuracy                           0.98      3576
   macro avg       0.96      0.80      0.86      3576
weighted avg       0.98      0.98      0.98      3576

Confusion Matrix:
[[3417    6]
 [  60   93]]
Accuracy: 0.9815436241610739
Precision: 0.9393939393939394
Recall: 0.6078431372549019
F1 Score: 0.738095238095238
ROC AUC Score: 0.9685604303070923


In [53]:
# import the models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Seed NumPy's global RNG for reproducibility; the estimators below are created
# without an explicit random_state.
np.random.seed(42)

# Candidate classifiers, all with default hyper-parameters except the SVC
# (probability=True is required so that predict_proba is available).
list_of_models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    SVC(kernel='rbf', probability=True),
    AdaBoostClassifier(),
    XGBClassifier(),
]

# (title, features, true labels) for every split each fitted model is scored on.
evaluation_splits = [('Train', xtrain, ytrain), ('Test', xtest, ytest)]

# Collect one record per (model, split) and build the table once at the end,
# instead of growing a DataFrame with pd.concat inside the loop.
comparison_labels = []
comparison_records = []
for model in list_of_models:
    model = model.fit(xtrain, ytrain)
    model_name = type(model).__name__
    for title, split_features, split_labels in evaluation_splits:
        split_pred = model.predict(split_features)
        split_probs = model.predict_proba(split_features)[:, 1]
        comparison_labels.append('{} _ {} Details'.format(model_name, title))
        # Every metric is reported as a percentage rounded to the nearest integer.
        comparison_records.append({
            'Accuracy': round(accuracy_score(split_labels, split_pred) * 100),
            'F1_score': round(f1_score(split_labels, split_pred) * 100),
            'Precision': round(precision_score(split_labels, split_pred) * 100),
            'Recall': round(recall_score(split_labels, split_pred) * 100),
            'Roc Auc': round(roc_auc_score(split_labels, split_probs) * 100),
        })

# The table gets its own name: binding it to `classification_report` would
# shadow the sklearn.metrics function of the same name and break later calls.
model_comparison = pd.DataFrame(comparison_records, index=comparison_labels)

pd.options.display.max_rows = None
model_comparison

Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,97,64,86,51,97.0
LogisticRegression _ Test Details,97,57,76,46,95.0
KNeighborsClassifier _ Train Details,98,80,87,74,99.0
KNeighborsClassifier _ Test Details,98,71,78,65,94.0
DecisionTreeClassifier _ Train Details,100,96,98,95,100.0
DecisionTreeClassifier _ Test Details,98,70,75,66,86.0
ExtraTreeClassifier _ Train Details,100,96,98,95,100.0
ExtraTreeClassifier _ Test Details,97,65,68,63,85.0
RandomForestClassifier _ Train Details,100,96,98,95,100.0
RandomForestClassifier _ Test Details,98,74,94,61,97.0


## Use SMOTE for oversampling

In [54]:
# Count how many rows carry each target label, to gauge class imbalance before resampling.
target_var.value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [55]:
from imblearn.over_sampling import SMOTENC # SMOTENC is the variant that can handle categorical variables; NOTE(review): it is imported but not used below
from imblearn.over_sampling import SMOTE

# Oversample the training split only (xtest / ytest are not touched here), with a
# fixed random_state for reproducibility.
# NOTE(review): plain SMOTE is used, not SMOTENC -- confirm xtrain holds no
# categorical columns that should be resampled with SMOTENC instead.
smote = SMOTE(random_state=42)
xtrain_resampled, ytrain_resampled = smote.fit_resample(xtrain, ytrain)

In [57]:
# Detailed look at a random forest trained on the SMOTE-oversampled training
# data and evaluated on the untouched test split.

# Import the metric functions in this cell so it still works if an earlier cell
# has rebound one of these names (e.g. `classification_report`) to another object.
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, classification_report, roc_auc_score

# Cast column labels to str on the frames actually used below: the resampled
# training frame (the one being fitted, rather than `xtrain`) and the test frame.
xtrain_resampled.columns = xtrain_resampled.columns.astype(str)
xtest.columns = xtest.columns.astype(str)

# Train on the oversampled data; fixed random_state keeps the forest reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(xtrain_resampled, ytrain_resampled)

# Hard class predictions, plus the second predict_proba column as the score
# used for ROC AUC.
rfc_ypred = rfc.predict(xtest)
rfc_yprobs = rfc.predict_proba(xtest)[:, 1]

# Test-set metrics, each computed once.
conf_matrix = confusion_matrix(ytest, rfc_ypred)
accuracy = accuracy_score(ytest, rfc_ypred)
precision = precision_score(ytest, rfc_ypred)
recall = recall_score(ytest, rfc_ypred)
f1 = f1_score(ytest, rfc_ypred)
roc_auc = roc_auc_score(ytest, rfc_yprobs)

# Headline accuracy followed by the per-class report.
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(ytest, rfc_ypred))

# Confusion matrix (rows: true labels, columns: predicted labels) and scalar metrics.
print("Confusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Random Forest Classifier:
Accuracy: 0.9765100671140939

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      3423
           1       0.70      0.78      0.74       153

    accuracy                           0.98      3576
   macro avg       0.85      0.88      0.86      3576
weighted avg       0.98      0.98      0.98      3576

Confusion Matrix:
[[3372   51]
 [  33  120]]
Accuracy: 0.9765100671140939
Precision: 0.7017543859649122
Recall: 0.7843137254901961
F1 Score: 0.7407407407407408
ROC AUC Score: 0.960473078120137


In [58]:
# Seed NumPy's global RNG for reproducibility; the estimators below are created
# without an explicit random_state.
np.random.seed(42)

# Same candidate classifiers as the non-resampled comparison, all with default
# hyper-parameters except the SVC (probability=True enables predict_proba).
list_of_models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    SVC(kernel='rbf', probability=True),
    AdaBoostClassifier(),
    XGBClassifier(),
]

# Models are fitted and "Train"-scored on the oversampled data, while the
# "Test" rows use the original, non-resampled test split.
evaluation_splits = [
    ('Train', xtrain_resampled, ytrain_resampled),
    ('Test', xtest, ytest),
]

# Collect one record per (model, split) and build the table once at the end,
# instead of growing a DataFrame with pd.concat inside the loop.
comparison_labels = []
comparison_records = []
for model in list_of_models:
    model = model.fit(xtrain_resampled, ytrain_resampled)
    model_name = type(model).__name__
    for title, split_features, split_labels in evaluation_splits:
        split_pred = model.predict(split_features)
        split_probs = model.predict_proba(split_features)[:, 1]
        comparison_labels.append('{} _ {} Details'.format(model_name, title))
        # Every metric is reported as a percentage rounded to the nearest integer.
        comparison_records.append({
            'Accuracy': round(accuracy_score(split_labels, split_pred) * 100),
            'F1_score': round(f1_score(split_labels, split_pred) * 100),
            'Precision': round(precision_score(split_labels, split_pred) * 100),
            'Recall': round(recall_score(split_labels, split_pred) * 100),
            'Roc Auc': round(roc_auc_score(split_labels, split_probs) * 100),
        })

# The table gets its own name: binding it to `classification_report` would
# shadow the sklearn.metrics function of the same name and break later calls.
model_comparison_smote = pd.DataFrame(comparison_records, index=comparison_labels)

pd.options.display.max_rows = None
model_comparison_smote

Unnamed: 0,Accuracy,F1_score,Precision,Recall,Roc Auc
LogisticRegression _ Train Details,96,96,96,97,99.0
LogisticRegression _ Test Details,94,52,40,75,94.0
KNeighborsClassifier _ Train Details,94,95,90,100,100.0
KNeighborsClassifier _ Test Details,85,36,22,95,94.0
DecisionTreeClassifier _ Train Details,100,100,100,100,100.0
DecisionTreeClassifier _ Test Details,96,60,50,75,86.0
ExtraTreeClassifier _ Train Details,100,100,100,100,100.0
ExtraTreeClassifier _ Test Details,95,57,45,76,86.0
RandomForestClassifier _ Train Details,100,100,100,100,100.0
RandomForestClassifier _ Test Details,98,74,70,78,96.0
