# 0.) Import the Credit Card Fraud Data From CCLE

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [2]:
df = pd.read_csv("fraudTest.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
df_select = df[["trans_date_trans_time", "category", "amt", "city_pop", "is_fraud"]]

df_select["trans_date_trans_time"] = pd.to_datetime(df_select["trans_date_trans_time"])
df_select["time_var"] = [i.second for i in df_select["trans_date_trans_time"]]

X = pd.get_dummies(df_select, ["category"]).drop(["trans_date_trans_time", "is_fraud"], axis = 1)
y = df["is_fraud"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["trans_date_trans_time"] = pd.to_datetime(df_select["trans_date_trans_time"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["time_var"] = [i.second for i in df_select["trans_date_trans_time"]]


# 1.) Use scikit learn preprocessing to split the data into 70/30 in out of sample

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

In [7]:
X_test, X_holdout, y_test, y_holdout = train_test_split(X_test, y_test, test_size = .5)

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_holdout = scaler.transform(X_holdout)

# 2.) Make three sets of training data (Oversample, Undersample and SMOTE)

In [9]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [10]:
ros = RandomOverSampler()
over_X, over_y = ros.fit_resample(X_train, y_train)

rus = RandomUnderSampler()
under_X, under_y = rus.fit_resample(X_train, y_train)

smote = SMOTE()
smote_X, smote_y = smote.fit_resample(X_train, y_train)

In [11]:
len(y_train)

389003

In [12]:
sum((y_train == 1))

1492

In [13]:
sum((y_train == 0))

387511

In [14]:
len(under_y)

2984

In [15]:
len(smote_y)

775022

# 3.) Train three logistic regression models

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
over_log = LogisticRegression().fit(over_X, over_y)

under_log = LogisticRegression().fit(under_X, under_y)

smote_log = LogisticRegression().fit(smote_X, smote_y)

# 4.) Test the three models

In [18]:
over_log.score(X_test, y_test)

0.9215072338587778

In [19]:
under_log.score(X_test, y_test)

0.8964226588929677

In [20]:
smote_log.score(X_test, y_test)

0.9198277309916265

In [21]:
# We see SMOTE performing with higher accuracy but is ACCURACY really the best measure?

# 5.) Which performed best in Out of Sample metrics?

In [22]:
# Sensitivity here in credit fraud is more important as seen from last class

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
y_true = y_test

In [25]:
y_pred = over_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[76569,  6453],
       [   90,   246]])

In [26]:
print("Over Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

Over Sample Sensitivity :  0.7321428571428571


In [27]:
y_pred = under_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[74474,  8548],
       [   86,   250]])

In [28]:
print("Under Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

Under Sample Sensitivity :  0.7440476190476191


In [29]:
y_pred = smote_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[76429,  6593],
       [   90,   246]])

In [30]:
print("SMOTE Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

SMOTE Sample Sensitivity :  0.7321428571428571


# 6.) Pick two features and plot the two classes before and after SMOTE.

In [31]:
raw_temp = pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)], axis =1)

In [32]:
raw_temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,is_fraud
0,-0.303480,-0.281110,-0.549598,-0.279334,-0.275843,-0.336512,-0.190271,-0.322679,-0.266036,-0.322572,3.221906,-0.22724,-0.256998,-0.276199,-0.285078,-0.312967,-0.180136,0.0
1,0.019246,-0.292664,-1.300761,-0.279334,-0.275843,-0.336512,-0.190271,-0.322679,3.758888,-0.322572,-0.310375,-0.22724,-0.256998,-0.276199,-0.285078,-0.312967,-0.180136,
2,-0.211562,-0.277999,-1.589669,-0.279334,-0.275843,-0.336512,-0.190271,-0.322679,-0.266036,-0.322572,3.221906,-0.22724,-0.256998,-0.276199,-0.285078,-0.312967,-0.180136,0.0
3,-0.413020,-0.232956,-0.202908,3.579943,-0.275843,-0.336512,-0.190271,-0.322679,-0.266036,-0.322572,-0.310375,-0.22724,-0.256998,-0.276199,-0.285078,-0.312967,-0.180136,0.0
4,-0.016950,0.094562,0.837163,-0.279334,-0.275843,-0.336512,-0.190271,-0.322679,-0.266036,-0.322572,3.221906,-0.22724,-0.256998,-0.276199,-0.285078,-0.312967,-0.180136,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398797,,,,,,,,,,,,,,,,,,0.0
489225,,,,,,,,,,,,,,,,,,0.0
481760,,,,,,,,,,,,,,,,,,0.0
446750,,,,,,,,,,,,,,,,,,0.0


In [33]:
#plt.scatter(raw_temp[raw_temp["is_fraud"] == 0]["amt"], raw_temp[raw_temp["is_fraud"] == 0]["city_pop"])

plt.scatter(raw_temp[raw_temp["is_fraud"] == 1]["amt"], raw_temp[raw_temp["is_fraud"] == 1]["city_pop"])
plt.legend(["Fraud", "Not Fraud"])
plt.xlabel("Amount")
plt.ylabel("Population")

plt.show()

KeyError: 'amt'

In [None]:

raw_temp = pd.concat([smote_X, smote_y], axis =1)


In [53]:
#plt.scatter(raw_temp[raw_temp["is_fraud"] == 0]["amt"], raw_temp[raw_temp["is_fraud"] == 0]["city_pop"])

plt.scatter(raw_temp[raw_temp["is_fraud"] == 1]["amt"], raw_temp[raw_temp["is_fraud"] == 1]["city_pop"])
plt.legend([ "Not Fraud", "Fraud"])
plt.xlabel("Amount")
plt.ylabel("Population")

plt.show()

KeyError: 'amt'

# 7.) We want to compare oversampling, Undersampling and SMOTE across our 3 models (Logistic Regression, Logistic Regression Lasso and Decision Trees).

# Make a dataframe that has a dual index and 9 Rows.
# Calculate: Sensitivity, Specificity, Precision, Recall and F1 score. for out of sample data.
# Notice any patterns across perfomance for this model. Does one totally out perform the others IE. over/under/smote or does a model perform better DT, Lasso, LR?
# Choose what you think is the best model and why. test on Holdout

In [35]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pandas as pd

In [40]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [54]:
resampling_methods = {
    "over" : RandomOverSampler(),
    "under" : RandomUnderSampler(),
    "smote" : SMOTE()
}

model_configs = {
    "LOG" : LogisticRegression(),
    "LASSO" : LogisticRegression(penalty = "l1", 
                                 C = 2., solver = "liblinear"),
    "DecisionTree" : DecisionTreeClassifier()
}

In [70]:
def calc_perf_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()  
    sensitiveity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    return(sensitiveity, specificity, precision, recall, f1)

In [71]:
trained_models = {}
results = []

In [72]:
# Assuming resampling_methods is a dictionary where keys are names of resampling strategies (e.g., "SMOTE", "UnderSample") 
# and values are instances of those strategies.
# Assuming model_configs is a dictionary where keys are model names (e.g., "LogisticRegression", "RandomForest") 
# and values are instances of those models.
# Ensure X and y are defined before this block, where X is your feature matrix and y is your target vector.

# trained_models = {}  # Initialize an empty dictionary to store trained models.

for resample_key, resampler in resampling_methods.items():
    # Corrected: Added missing arguments X, y to fit_resample(). These should be your dataset's features and labels.
    resample_X, resample_y = resampler.fit_resample(X_train, y_train)
    
    for model_key, model in model_configs.items():
        combined_key = f"{resample_key}_{model_key}"  # Combine the keys to create a unique identifier for each model.

        m = model.fit(resample_X, resample_y)
        # Corrected: Used parentheses for method calls and assignment.
        #trained_models[combined_key] = model.fit(resample_X, resample_y)
        trained_models[combined_key] = m
        y_pred = m.predict(X_test)
        sensitiveity, specificity, precision, recall, f1 = calc_perf_metrics(y_true, y_pred)
        results.append({"Model" : combined_key,
                        "Sensitiveity" : sensitiveity,
                        "Specificity" : specificity,
                        "Precision" : precision,
                        "Recall" : recall,
                        "F1" : f1})


In [73]:
result_df = pd.DataFrame(results)

In [59]:
trained_models

{'key1': 'value1',
 'over_LOG': LogisticRegression(),
 'over_LASSO': LogisticRegression(C=2.0, penalty='l1', solver='liblinear'),
 'over_DecisionTree': DecisionTreeClassifier(),
 'under_LOG': LogisticRegression(),
 'under_LASSO': LogisticRegression(C=2.0, penalty='l1', solver='liblinear'),
 'under_DecisionTree': DecisionTreeClassifier(),
 'smote_LOG': LogisticRegression(),
 'smote_LASSO': LogisticRegression(C=2.0, penalty='l1', solver='liblinear'),
 'smote_DecisionTree': DecisionTreeClassifier()}

In [74]:
result_df

Unnamed: 0,Model,Sensitiveity,Specificity,Precision,Recall,F1
0,over_LOG,0.732143,0.922201,0.036689,0.732143,0.069876
1,over_LASSO,0.732143,0.92225,0.036711,0.732143,0.069916
2,over_DecisionTree,0.535714,0.998398,0.57508,0.535714,0.5547
3,under_LOG,0.732143,0.931199,0.041289,0.732143,0.07817
4,under_LASSO,0.732143,0.93103,0.041192,0.732143,0.077996
5,under_DecisionTree,0.958333,0.946749,0.06789,0.958333,0.126797
6,smote_LOG,0.732143,0.921888,0.036547,0.732143,0.069619
7,smote_LASSO,0.732143,0.921864,0.036536,0.732143,0.0696
8,smote_DecisionTree,0.714286,0.993568,0.310078,0.714286,0.432432


Decision Tree with SMOTE and oversampling showed the most balanced score across 5 kinds of ratio and were the best two models shown in the table.