In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, make_scorer, recall_score, f1_score
from scipy.stats import randint
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

import joblib
import random
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 16, 25"

In [4]:
def calculate_metrics(y_valid, y_pred):
    """
    Calculates the 4 metrics: accuracy, precision, recall, and false positive rate
    Also calculates the Macro avg. Accuracy, Precision, and Recall
    and the f1-scores

    Parameters:
        y_valid (list): list of ACTUAL label values
        y_pred (list): List of PREDICTED label values
    """
    # Calculate Macro Avg and f1-score of both classes
    report = classification_report(y_valid, y_pred)
    print(report)
    
    # Calculate precision and recall directly
    cm = confusion_matrix(y_valid, y_pred) 
    total = len(y_valid)
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / total
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fpr = fp / (fp + tn)
    # fnr = fn / (fn + tp)
    return accuracy, precision, recall, fpr, fnr

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == "cuda":
    print('GPU available')
else:
    print('GPU not available')
device

GPU available


device(type='cuda')

In [6]:
# Setup seed for Reproduciblity
seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [1]:
task1_dev_set = "610_ps4_training/trainingT1FD/cct_train.csv"

In [2]:
# CONVERT DEV FILE INTO A DATAFRAME
task1_df = pd.read_csv(task1_dev_set)

# DO SOME PREPROCESSING: turn ssn from string into number
task1_df['ssn'] = task1_df['ssn'].str.replace('-', '')
task1_df['ssn'] = pd.to_numeric(task1_df['ssn'])

print(type(task1_df))
print(len(task1_df))
print(task1_df.columns)

NameError: name 'pd' is not defined

In [7]:
# Everything except the last element are features
column_names = task1_df.columns
label = column_names[-1]
X = task1_df.drop(columns=[label])
y = task1_df[label]

In [8]:
# Convert each categorical column to numerical if needed
for col in X.select_dtypes(include=['object']).columns:
    # Label encode each string column
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# print(X.dtypes)  # Check data types of each column

In [9]:
# Define and fit the model
# def determine_important_features():
#     num_of_trees = [30, 40, 50, 75, 80, 85, 90, 100, 150]
#     for i in num_of_trees:
#         model = RandomForestClassifier(n_estimators = i, random_state=seed)
#         model.fit(X, y)
        
#         feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['importance']).sort_values('importance', ascending=False)
#         print(feature_importances)
#         print("\n\n")

# determine_important_features()

In [10]:
important_features = ['amt', 'trans_time', 'unix_time', 'trans_date', 'category', 'dob', 'profile', 'merch_long', 'trans_num', 'merch_lat', 'merchant', 'city_pop']
print(len(important_features))

12


In [11]:
# Get the important feature columns only
# print(X.keys())
X_important_features = X[important_features]
# print(X_important_features)

In [12]:
# change the data structure from pandas to numpy 
X_numpy_arr = X_important_features.to_numpy()
y_numpy_arr = y.to_numpy()

In [13]:
# # Define parameter grid for n_estimators
# # num_of_trees = [i for i in range(10, 90)]
# # num_of_trees = [10, 20, 50, 75]
# # num_of_tress = [30, 40, 50, 75, 80, 85, 90]
# num_of_trees = [75, 80, 85, 90]
# param_grid = {'n_estimators': num_of_trees}

# # Initialize RandomForestClassifier
# rf = RandomForestClassifier(random_state=seed)

# # Set up StratifiedKFold
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# # Create a custom scoring function to maximize recall
# scorer = make_scorer(recall_score, average='binary')

# # Set up GridSearchCV with recall scoring and StratifiedKFold cross-validation
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, scoring=scorer, cv=skf)

# # Run grid search
# grid_search.fit(X_numpy_arr, y_numpy_arr)

# # Get the best model and parameters
# best_n_estimators = grid_search.best_params_['n_estimators']
# best_recall = grid_search.best_score_

# print(f"Best n_estimators: {best_n_estimators}")
# print(f"Highest recall across splits: {best_recall}")
# # this best_recall should match the recall below

# # gridsearchCV is only finding the best number of features 
# # once you have the best number of features
# # retrain on each train/valid split using the SAME seed 
# # to find the best train/valid split, accuracy, and macro average f1 score


In [14]:
def do_stratified_k_fold_RFECV(rf_model):
    # Use stratified K-fold, k = number of splits
    k = 10
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    models = []
    # PERFORM Stratified K-fold training and testing
    for train_index, valid_index in skf.split(X_numpy_arr, y_numpy_arr):
        # train_index, valid_index = next(iter(skf.split(X_numpy_arr, y_numpy_arr)))
        
        # SPLIT DEV SET INTO TRAINING AND VALIDATION
        X_train, X_valid = X_numpy_arr[train_index], X_numpy_arr[valid_index]
        y_train, y_valid = y_numpy_arr[train_index], y_numpy_arr[valid_index]
        
        # Use RFECV to select optimal number of features with cross-validation
        rfecv = RFECV(estimator=rf_model, step=5, cv=StratifiedKFold(5), scoring='recall_macro')
        
        # Fit RFECV to the data
        rfecv.fit(X_train, y_train)
        
        # IDENTIFY the Optimal number of features
        optimal_num_of_features = rfecv.n_features_
        print("Optimal number of features: %d" % optimal_num_of_features)
        
        # TRAIN your model with those features
        X_train_optimal = X_train[:, rfecv.support_]  # Select the optimal features
        rf_model.fit(X_train_optimal, y_train)
        
        # VALIDATION
        X_valid_optimal = X_valid[:, rfecv.support_]  # Select the optimal features
        y_pred = rf_model.predict(X_valid_optimal)
        
        # Compute multi-class recall
        macro_avg_recall = recall_score(y_valid, y_pred, average='macro')  # Macro-average recall
        print(f"Macro-average Recall: {macro_avg_recall}")
        
        # Compute accuracy and f1 score
        accuracy = accuracy_score(y_valid, y_pred)
        macro_avg_f1_score = f1_score(y_valid, y_pred, average='macro')
        
        # Save metrics
        temp_list = [train_index, valid_index, macro_avg_recall, accuracy, macro_avg_f1_score]
        models.append(temp_list)
    return models 

In [15]:
# num_of_tress = best_n_estimators
num_of_trees = 15
# default n_estimators, which controls the number of trees in the forest, is 100
optimal_rf = RandomForestClassifier(random_state=seed, n_estimators = num_of_trees)

# TRAIN MODEL
models = do_stratified_k_fold_RFECV(optimal_rf)

Optimal number of features: 7
Macro-average Recall: 0.9287566072511839
Optimal number of features: 7
Macro-average Recall: 0.9287566072511839
Optimal number of features: 7
Macro-average Recall: 0.9301030566526743
Optimal number of features: 7
Macro-average Recall: 0.9315212184015966
Optimal number of features: 7
Macro-average Recall: 0.9371364955193404
Optimal number of features: 7
Macro-average Recall: 0.9301532552958767
Optimal number of features: 7
Macro-average Recall: 0.9245308069433897
Optimal number of features: 7
Macro-average Recall: 0.9439417348626986
Optimal number of features: 7
Macro-average Recall: 0.9385044580079323
Optimal number of features: 7
Macro-average Recall: 0.9468915176136865


In [16]:
# temp_list = [train_index, valid_index, macro_avg_recall, accuracy, macro_avg_f1_score]

recalls = []
for i, list_of_metrics in enumerate(models):
    r = list_of_metrics[2]
    recalls.append(r)
    
index_max_recall = np.argmax(recalls)
print("index with the greatest RECALL", index_max_recall)

index with the greatest RECALL 9


In [24]:
# Get the best indices for training
# temp_list = [train_index, valid_index, macro_avg_recall, accuracy, macro_avg_f1_score]

# best_metrics = models[index_min_FNR]
best_metrics = models[index_max_recall]
train_indices = best_metrics[0]
valid_indices = best_metrics[1]
recall = best_metrics[2]
accuracy = best_metrics[3]
calculated_f1_score = best_metrics[4]
precision = best_metrics[3]
# fpr = best_metrics[5]
# fnr = best_metrics[6]

# print(train_indices)
print("recall:", recall)
print("accuracy:", accuracy)

my_f1_score = (2 * precision * recall) / (precision + recall)
print("my_f1_score:", my_f1_score)
print("check f1 score:", calculated_f1_score)

recall: 0.9468915176136865
accuracy: 0.9993864155251142
my_f1_score: 0.9724310218663827
check f1 score: 0.9683670825894657


In [25]:
# Retrain to get the model I guess
# SPLIT DEV SET INTO TRAINING AND VALIDATION
X_train, X_valid = X_numpy_arr[train_indices], X_numpy_arr[valid_indices]
y_train, y_valid = y_numpy_arr[train_indices], y_numpy_arr[valid_indices]

# Use RFECV to select optimal number of features with cross-validation
rfecv = RFECV(estimator=optimal_rf, step=5, cv=StratifiedKFold(5), scoring='recall_macro')

# Fit RFECV to the data
rfecv.fit(X_train, y_train)

# IDENTIFY the Optimal number of features
optimal_num_of_features = rfecv.n_features_
print("Optimal number of features: %d" % optimal_num_of_features)

# TRAIN your model with those features
X_train_optimal = X_train[:, rfecv.support_]  # Select the optimal features
optimal_rf.fit(X_train_optimal, y_train)

# Save the model to a file
joblib.dump(optimal_rf, 'task1_important_features_random_forest_rfecv_model.joblib')

Optimal number of features: 7


['task1_important_features_random_forest_rfecv_model.joblib']

In [26]:
# Use rfecv.support_ to get the mask of selected features
# print(rfecv.support_)
# selected_feature_names = [feature for feature, selected in zip(important_features, rfecv.support_) if selected]
# print("Selected features:", selected_feature_names)

In [30]:
# Load the saved model
rf_loaded = joblib.load('task1_important_features_random_forest_rfecv_model.joblib')

# VALIDATION
X_valid_optimal = X_valid[:, rfecv.support_]  # Select the optimal features
y_pred = rf_loaded.predict(X_valid_optimal)

accuracy, precision, recall, fpr, fnr = calculate_metrics(y_valid, y_pred)
print("recall:", recall)
print("accuracy:", accuracy)

# CALCULATE Macro-average recall
macro_avg_recall = recall_score(y_valid, y_pred, average='macro')

# CALCULATE accuracy and f1 score
model_accuracy = accuracy_score(y_valid, y_pred)
macro_avg_f1_score = f1_score(y_valid, y_pred, average='macro')
f1_score_for_fraud = f1_score(y_valid, y_pred, pos_label=1, average='binary')

print(f"Macro-average Recall: {macro_avg_recall}")
# Macro-average Recall: 0.9468915176136865

print("Accuracy:", model_accuracy)
# accuracy: 0.9468915176136865

print("Macro average F1 score:", macro_avg_f1_score)

print("f1_score_for_fraud:", f1_score_for_fraud)
# my_f1_score: 0.9836322573014133

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     69722
           1       0.98      0.89      0.94       358

    accuracy                           1.00     70080
   macro avg       0.99      0.95      0.97     70080
weighted avg       1.00      1.00      1.00     70080

recall: 0.8938547486033519
accuracy: 0.9993864155251142
Macro-average Recall: 0.9468915176136865
Accuracy: 0.9993864155251142
Macro average F1 score: 0.9683670825894657
f1_score_for_fraud: 0.9370424597364568
