## Import Required Libraries

In [2]:
import numpy as np
import os
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn import svm
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_score, f1_score

## KNN

In [None]:
def train_knn(X_train, y_train, X_test, y_test):
    print("----------KNN-----------")
    
    param_grid = {'n_neighbors': np.arange(1, 20), 'weights': ['uniform', 'distance'], 'metric':["euclidean", "manhattan", "chebyshev", "minkowski"]}
    
    clf = make_pipeline(
        preprocessor,
        SMOTE(sampling_strategy='minority', random_state=5),
         KNeighborsClassifier()
    )
    
    knn_gscv = GridSearchCV(clf, param_grid, cv=5)
    knn_gscv.fit(X_train, y_train.values.ravel())
    print(knn_gscv.best_params_, knn_gscv.best_score_)
    y_pred = knn_gscv.predict(X_test)
    print(f1_score(y_test, y_pred, average='weighted'))
#     print(classification_report(y_test, y_pred))
    y_train_pred = knn_gscv.predict(X_train)
#     print(classification_report(y_train, y_train_pred))
    return {
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
        "params" : knn_gscv.best_params_,
        "train_score" : knn_gscv.best_score_,
        "test_score" : accuracy_score(y_test, y_pred),
        "model" : knn_gscv
    }
#     plot_confusion_matrix(knn_gscv, X_test, y_test)

## Random Forest

In [None]:
def train_rf(X_train, y_train, X_test, y_test):
    print("----------RANDOM FOREST-----------")
    
    parameters = {'n_estimators': [10,20,30,40,50,60,70,80,90,100], 'max_depth': [4,5,6,7,8], 'criterion': ['gini', 'entropy'], 'max_features': ['sqrt', 'log2']}

        # Create a pipeline with preprocessing, SMOTE, and the Random Forest classifier
    clf = make_pipeline(
        preprocessor,
        SMOTE(sampling_strategy='minority', random_state=5),
        RandomForestClassifier(random_state=5)
    )
    
    cv_combined = GridSearchCV(estimator=clf, param_grid=parameters, cv=5)
    cv_combined.fit(X_train, y_train)

    print('best params: ', cv_combined.best_params_)

    y_pred = cv_combined.predict(X_test)
    print(f1_score(y_test, y_pred, average='weighted'))
#     print(classification_report(y_test, y_pred))
#     print(classification_report(y_train, cv_combined.predict(X_train)))
    return {
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
        "params" : cv_combined.best_params_,
        "train_score" : cv_combined.best_score_,
        "test_score" : accuracy_score(y_test, y_pred),
        "model" : cv_combined
    }
#     plot_confusion_matrix(cv_combined, X_test, y_test)

## Decision Tree

In [54]:
def train_dt(X_train, y_train, X_test, y_test, preprocessor):
    print("----------DECISION TREE-----------")

    # Define the parameter grid for grid search
    parameters = {
        'decisiontreeclassifier__max_depth':  [4,5,6,7,8],
        'decisiontreeclassifier__criterion': ['gini', 'entropy'],
        'decisiontreeclassifier__max_features': ['sqrt', 'log2'],
        'decisiontreeclassifier__min_samples_split': [2,6,10,14,18,22]
    }
    

    # Create a pipeline with preprocessing, SMOTE, and the decision tree classifier
    clf = make_pipeline(
        preprocessor,
        SMOTE(sampling_strategy='minority', random_state=5),
        DecisionTreeClassifier(random_state=5)
    )

    # Initialize GridSearchCV with the pipeline and parameter grid
    cv_combined = GridSearchCV(estimator=clf, param_grid=parameters, cv=5, refit=True, scoring=make_scorer(accuracy_score))
    cv_combined.fit(X_train, y_train)

    print('Best parameters: ', cv_combined.best_params_)

    # Make predictions on the test set
    y_pred = cv_combined.predict(X_test)
    
    # Calculate and print the F1 score
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f1)

    # Return a dictionary with relevant metrics and the best model
    return {
        "f1_score": f1,
        "params": cv_combined.best_params_,
        "train_score": cv_combined.best_score_,
        "test_score": accuracy_score(y_test, y_pred),
        "model": cv_combined
    }

## SVM

In [None]:
def train_svm(X_train, y_train, X_test, y_test):
    print("----------SVM-----------")
    tuned_parameters = {'kernel': ['linear', 'rbf'], 'C': [1, 10, 100, 1000], 'gamma': [1e-3, 1e-4]}
    
    # Create a pipeline with preprocessing, SMOTE, and SVM
    clf = make_pipeline(
        preprocessor,
        SMOTE(sampling_strategy='minority', random_state=5),
        svm.SVC(random_state=10)
    )


    clf = GridSearchCV(clf, param_grid=tuned_parameters, cv=5)

    clf.fit(X_train, y_train)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(f1_score(y_test, y_pred, average='weighted'))
#     print(classification_report(y_test, y_pred))
#     print(classification_report(y_train, clf.predict(X_train)))
    return {
        "f1_score": f1_score(y_test, y_pred, average='weighted'),
        "params" : clf.best_params_,
        "train_score" : clf.best_score_,
        "test_score" : accuracy_score(y_test, y_pred)
    }
#     plot_confusion_matrix(clf, X_test, y_test)

In [46]:
def create_log_dict(classifier, result):
    row = {
           "classifier": classifier,
           "params": result["params"],
           "f1_score": result["f1_score"],
           "train_score": result["train_score"],
           "test_score": result["test_score"]
          }
    return row

In [43]:
def preprocess_data():
    numerical_features = ['amt','trans_month_sin', 'trans_month_cos', 'trans_hour_sin', 'trans_hour_cos','age','distance']
    categorical_features = ['merchant', 'category','gender','city','state', 'job','trans_num']
    numerical_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer(
        transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])
    return preprocessor

In [4]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# features_file = os.path.join(parent_dir, 'data', 'final_features.csv')
# dataset = pd.read_csv(csv_file)

output_list = []
OUTPUT_FOLDER = os.path.join(parent_dir, 'output', 'dt_grid_search_result/')
print(dt_grid_search_result)



df = pd.read_csv(features_file)

X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(X,y,stratify=y,test_size = 0.2,random_state=42,shuffle=True)
X_TRAIN.reset_index(drop=True,inplace=True)
X_TEST.reset_index(drop=True,inplace=True)
Y_TRAIN.reset_index(drop=True,inplace=True)
Y_TEST.reset_index(drop=True,inplace=True)

print("Distribution of y_train = {}".format(Y_TRAIN.value_counts()))
print("Distribution of y_test = {}".format(Y_TEST.value_counts()))

preprocessor = preprocess_data()

dt_result = train_dt(X_TRAIN, Y_TRAIN, X_TEST, Y_TEST, preprocessor)
# svm_result = train_svm(X_TRAIN, Y_TRAIN, X_TEST, preprocessor)
# knn_result = train_knn(X_TRAIN, Y_TRAIN, X_TEST, preprocessor)
# rf_result = train_rf(X_TRAIN, Y_TRAIN, X_TEST, preprocessor)


dt_row = create_log_dict("decision_tree", dt_result)
# svm_row = create_log_dict( "svm", svm_result)
# knn_row = create_log_dict("knn", knn_result)
# rf_row = create_log_dict("random_forest", rf_result)

output_list.append(dt_row)
# output_list.append(svm_row)
# output_list.append(knn_row)
# output_list.append(rf_row)

output_df = output_df.append(output_list)

if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)
with open(OUTPUT_FOLDER + f'results_first_exp_cv5.csv', 'a') as f:
    output_df.to_csv(f, header=f.tell()==0, index=False)
    joblib.dump(dt_result["model"], OUTPUT_FOLDER + f'gridsearch_exp_dt_model.joblib')
    


NameError: name 'dt_grid_search_result' is not defined

## Using Best Models

In [57]:
clf = dt_result["model"]

In [58]:
print(classification_report(Y_TEST, clf.predict(X_TEST)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99    257834
           1       0.01      0.02      0.01      1501

    accuracy                           0.99    259335
   macro avg       0.50      0.51      0.50    259335
weighted avg       0.99      0.99      0.99    259335



In [59]:
plot_confusion_matrix(clf, X_TEST, Y_TEST)

NameError: name 'plot_confusion_matrix' is not defined

In [60]:
for col, imp in zip(cols, clf.best_estimator_.feature_importances_):
    print(col, imp)

NameError: name 'cols' is not defined