In [10]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder
import joblib
import csv
import os
# Restrict which GPU indices CUDA exposes to this process.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# as long as CUDA has not been initialised yet -- confirm.
# NOTE(review): the value has spaces after the commas; verify the CUDA runtime
# parses "1, 16, 25" the same way as "1,16,25".
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 16, 25"

In [2]:
# Prefer the GPU whenever torch can see one; otherwise fall back to the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda') if has_gpu else torch.device('cpu')

# Report which backend was picked.
print('GPU available' if device.type == "cuda" else 'GPU not available')

# Bare expression so the notebook displays the chosen device.
device

GPU available


device(type='cuda')

In [3]:
# Path to the Task 1 test CSV; it is passed to pd.read_csv in the next cell.
task1_test_set = "610_ps4_test/testingT1FD/cct_test.csv"

# Alternative location (same file name under a "610_ps4_testing" folder):
# task1_test_set = "610_ps4_testing/testingT1FD/cct_test.csv"

In [4]:
# LOAD THE TEST FILE INTO A DATAFRAME
task1_df = pd.read_csv(task1_test_set)

# The last column is treated as the label; every other column is a feature.
column_names = task1_df.columns
label = column_names[-1]
X = task1_df.drop(columns=[label])
y = task1_df[label]

# Hard coded important features.
# A longer candidate list that was tried previously:
# important_features = ['amt', 'trans_time', 'unix_time', 'trans_date', 'category', 'dob', 'profile', 'merch_long', 'trans_num', 'merch_lat', 'merchant', 'city_pop']
important_features = ['amt', 'trans_time', 'unix_time', 'trans_date', 'category', 'dob', 'profile']

# Fail early, with a readable message, if the CSV lacks a column the model needs.
missing_features = [col for col in important_features if col not in X.columns]
if missing_features:
    raise KeyError(f"Test set is missing expected feature columns: {missing_features}")

# Keep only the columns that are fed to the model. The explicit copy makes the
# encoding below operate on an independent frame, so X keeps the raw values.
X_important_features = X[important_features].copy()

# PREPROCESSING
# Label encode each string column that is actually used. Columns are encoded
# independently of one another, so skipping the unused columns does not change
# the codes produced for the selected ones; it only avoids wasted work.
# NOTE(review): the encoders are fit on this test file, so the integer codes
# match the ones used at training time only if both fits saw the same set of
# values -- confirm, or persist and reuse the training encoders.
for col in X_important_features.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_important_features[col] = le.fit_transform(X_important_features[col])

# Change the data structure from pandas to numpy.
X_numpy_arr = X_important_features.to_numpy()
y_test_numpy_arr = y.to_numpy()

In [5]:
# Load the saved model
# Load the previously saved model from disk (a random forest, judging by the
# file name -- TODO confirm).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code, so
# only load artifacts from a trusted source.
# Alternative artifact that was used before:
# model_name = 'task1_random_forest_model.joblib'
model_name = 'task1_important_features_random_forest_rfecv_model.joblib'
rf_loaded = joblib.load(model_name)

In [6]:
 # PREDICT
y_pred = rf_loaded.predict(X_numpy_arr)

# Score the predictions: overall accuracy, macro-averaged F1, and the F1 of
# the positive class (label 1).
model_accuracy = accuracy_score(y_true=y_test_numpy_arr, y_pred=y_pred)
macro_avg_f1_score = f1_score(y_true=y_test_numpy_arr, y_pred=y_pred, average='macro')
f1_score_for_fraud = f1_score(
    y_true=y_test_numpy_arr,
    y_pred=y_pred,
    average='binary',
    pos_label=1,
)

# Print one "<name>: <value>" line per metric.
for metric_label, metric_value in (
    ("Accuracy:", model_accuracy),
    ("Macro average F1 score:", macro_avg_f1_score),
    ("f1_score_for_fraud:", f1_score_for_fraud),
):
    print(metric_label, metric_value)

Accuracy: 0.9987271834796406
Macro average F1 score: 0.9346656535479205
f1_score_for_fraud: 0.8699708454810495


In [17]:
# accuracy = accuracy_score(y, y_pred)
# precision = precision_score(y, y_pred)
# recall = recall_score(y, y_pred)
# f1 = f1_score(y, y_pred)

# print(f'Accuracy: {accuracy}')
# print(f'Precision: {precision}')
# print(f'Recall: {recall}')
# print(f'F1 Score: {f1}')

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test_numpy_arr, y_pred=y_pred)
print(cm)

# Per-class precision / recall / F1 together with the averaged summaries.
report_text = classification_report(y_true=y_test_numpy_arr, y_pred=y_pred)
print(report_text)

[[174233     26]
 [   197    746]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    174259
           1       0.97      0.79      0.87       943

    accuracy                           1.00    175202
   macro avg       0.98      0.90      0.93    175202
weighted avg       1.00      1.00      1.00    175202



In [18]:
# Replace the label column with the model's predictions (stored under
# "is_fraud") and write the result to CSV without the index column.
output_df = task1_df.drop(columns=[label]).assign(is_fraud=y_pred)
output_df.to_csv('task1_test_output.csv', index=False)