In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
# from category_encoders import TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

import datetime

import joblib


# from cap_modules import *

pd.set_option('display.max_columns', None)

In [None]:
df = pd.read_csv('data/fraudTrain.csv', sep = ',')

In [None]:
df.head(5)

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.isna().sum()

In [None]:
(df['is_fraud'].value_counts()[1] / df['is_fraud'].shape[0]) * 100

Only 0.5789 percent of the training set is fraud, so the dataset is very imbalanced.

In [None]:
# Distribution of transaction amounts, restricted to rows flagged as fraud.
px.histogram(df[df['is_fraud'] == 1], x="amt", text_auto=True)

In [None]:
# Enable autoreload so edits to the local cap_modules helpers are picked up
# without restarting the kernel, then import the plotting helpers.
%load_ext autoreload
%autoreload 2
from cap_modules import count_plot
from cap_modules import count_plot_multi

In [None]:
# count_plot(df[df['is_fraud'] == 0], 'category', 'Number of non fradulent transactions by Category')

In [None]:
# Fraud-only rows by purchase category. count_plot is a project helper from
# cap_modules; its implementation is not visible in this notebook.
count_plot(df[df['is_fraud'] == 1], 'category', 'Number of fradulent transactions by Category')

In [None]:
# Category histogram over all rows, coloured by is_fraud, with bar heights
# normalised to percentages by plotly (histnorm='percent').
px.histogram(df, x="category", color="is_fraud", histnorm='percent')

In [None]:
# dataset_graph = [
#     {
#         'data': df[df['is_fraud'] == 0],
#         'x': 'gender',
#         'title': 'Number of non fradulent transactions by Gender'
#     },
#     {
#         'data': df[df['is_fraud'] == 1],
#         'x': 'gender',
#         'title': 'Number of fradulent transactions by Gender'
#     }
# ]
# count_plot_multi(dataset_graph, '', 2, 1)



In [None]:
# count_plot(df[df['is_fraud'] == 0], 'gender', 'Number of non fradulent transactions by Gender', False)

In [None]:
# Fraud-only rows by gender. The meaning of the trailing False argument is
# defined by cap_modules.count_plot (not visible here) -- TODO confirm.
count_plot(df[df['is_fraud'] == 1], 'gender', 'Number of fradulent transactions by Gender', False)

No significant differences in the proportion of fraud victims across genders. However, transaction volume differs notably between men and women. Women are involved in more transactions than men.

In [None]:
df['trans_year'] = pd.Series(pd.to_datetime(df['trans_date_trans_time'])).dt.year

In [None]:
df['dob'] = pd.Series(pd.to_datetime(df['dob'])).dt.year

In [None]:
age = pd.Series(df['trans_year'] - df['dob'])
df["age"] = age

In [None]:
df.head()

In [None]:
bins = [10, 18, 35, 60, 100]
labels = ["14-18", "18-35", "35-60", "60+"]
df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=True)

In [None]:
df.drop(axis=1, columns=["age", "dob", "trans_year", "trans_date_trans_time"], inplace=True)

In [None]:
# Fraud-only rows by age group (count_plot is the cap_modules helper).
count_plot(df[df['is_fraud'] == 1], 'age_group', 'Number of fradulent transactions by Age', False)

In [None]:
# count_plot(df[df['is_fraud'] == 0], 'age_group', 'Number of non fradulent transactions by Age', False)

In [None]:
# Within each is_fraud class: share of rows that fall in each age group
# (per-group counts divided by the class's non-null age_group count).
df.groupby('is_fraud')['age_group'].value_counts() / df.groupby('is_fraud')['age_group'].count()

In [None]:
# Each (is_fraud, age_group) count as a fraction of ALL rows in df.
df.groupby('is_fraud')['age_group'].value_counts() / df['is_fraud'].shape[0]

In [None]:
# Within each age group: share of fraud vs. non-fraud rows, i.e. the fraud
# rate per age group.
df.groupby('age_group')['is_fraud'].value_counts() / df.groupby('age_group')['is_fraud'].count()

The analysis indicates that teenagers are the least likely demographic to fall victim to credit card fraud. This trend is likely attributed to their limited access to credit cards compared to older age groups.

Individuals in middle-age groups show a higher susceptibility to credit card fraud, as illustrated by the accompanying bar chart. This vulnerability may be explained by their increased frequency of credit card usage, which correlates with their financial activity and purchasing habits.

Notably, the percentage of fraudulent transactions is highest among individuals aged 60 or older, at 0.75%. This elevated rate may stem from lower levels of digital literacy within this demographic, making them more vulnerable to fraudulent schemes.

In [None]:
# Number of rows with a non-null age_group, per is_fraud class.
df.groupby('is_fraud')['age_group'].count()

In [None]:
# (df['is_fraud'].value_counts()[1] / df['is_fraud'].shape[0]) * 100

In [None]:
# Region -> member states, written once per region and then inverted into the
# state -> region lookup used by the notebook. Only the 50 states are listed;
# any other code (e.g. "DC") is absent and therefore has no region.
_states_by_region = {
    "Northeast": ["CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA"],
    "Midwest": ["IL", "IN", "IA", "KS", "MI", "MN", "MO", "NE", "ND", "OH", "SD", "WI"],
    "South": ["AL", "AR", "DE", "FL", "GA", "KY", "LA", "MD",
              "MS", "NC", "OK", "SC", "TN", "TX", "VA", "WV"],
    "West": ["AK", "AZ", "CA", "CO", "HI", "ID", "MT", "NV", "NM", "OR", "UT", "WA", "WY"],
}
state_to_region = {
    state: region
    for region, states in _states_by_region.items()
    for state in states
}

In [None]:
# Look up each row's region from its state code; state codes that are not keys
# of state_to_region become NaN.
df["region"] = df['state'].map(state_to_region)

In [None]:
df.head()

In [None]:
# count_plot(df[df['is_fraud'] == 0], 'region', 'Number of non fradulent transactions by Region', True)

In [None]:
# Fraud-only rows by region. The meaning of the trailing True argument is
# defined by cap_modules.count_plot (not visible here) -- TODO confirm.
count_plot(df[df['is_fraud'] == 1], 'region', 'Number of fradulent transactions by Region', True)

In [None]:
# Each (is_fraud, region) count as a fraction of ALL rows in df.
df.groupby('is_fraud')['region'].value_counts() / df['is_fraud'].shape[0]

In [None]:
# Within each region: share of fraud vs. non-fraud rows (fraud rate per region).
df.groupby('region')['is_fraud'].value_counts() / df.groupby('region')['is_fraud'].count()

In [None]:
cols_to_drop = ["Unnamed: 0", "cc_num", "zip", "lat", "long", "city_pop", "unix_time", "merch_lat", "merch_long", "trans_num", "street", "first", "last"]
df.drop(axis=1, columns=cols_to_drop, inplace=True)


In [None]:
df.head()

In [None]:
len(df['job'].unique())

In [None]:
len(df['merchant'].unique())

In [None]:
df.drop(axis=1, columns=["job", "merchant"], inplace=True)

In [None]:
df.drop(axis=1, columns=["city", "state"], inplace=True)

In [None]:
df.head()

In [None]:
df['region'].unique()

In [None]:
# Columns to one-hot encode, and the columns carried over unchanged
# (is_fraud is the target and travels alongside amt for now).
ohe_cols = ['category', 'gender', 'age_group', 'region']
num_features = ['amt', 'is_fraud']

# Dense output so the encoded array can be wrapped directly in a DataFrame.
enc = OneHotEncoder(sparse_output=False)
temp = enc.fit_transform(df[ohe_cols])

# Column names come from the fitted encoder; the index is a fresh 0..n-1 range.
ohc_df = pd.DataFrame(temp, columns=enc.get_feature_names_out()).reset_index(drop=True)
ohc_df

In [None]:
# Put amt/is_fraud next to the one-hot columns, aligned on a fresh 0..n-1 index,
# then discard the indicator column for rows whose region is missing.
numeric_part = df[num_features].reset_index(drop=True)
enc_df = pd.concat([numeric_part, ohc_df], axis=1).drop(columns=['region_nan'])
enc_df

In [None]:
plt.figure(figsize = (20, 20))
sns.heatmap(enc_df.corr(), annot=True, cmap='coolwarm')

In [None]:
# Calculate the correlation matrix
corr_matrix = enc_df.corr()

# Convert the correlation matrix to a list of tuples
corr_list = corr_matrix.unstack().sort_values(ascending=False).drop_duplicates()

# Remove the diagonal elements (correlation of a variable with itself)
corr_list = corr_list[corr_list != 1]

print(corr_list)

In [None]:
# enc_df.to_csv('enc_df.csv', encoding='utf-8', index=False)

Logistic Regression

In [None]:
# Target is the is_fraud column; the features are every other encoded column.
y_train = enc_df['is_fraud']
X_train = enc_df.drop(columns='is_fraud')

In [None]:
# scaler = StandardScaler()

In [None]:
# X_train = scaler.fit_transform(X_train)

In [None]:
# Display the training feature matrix.
X_train

In [None]:
# log_reg = LogisticRegression(penalty="l2", fit_intercept=False, random_state=42)
# log_reg.fit(X_train, y_train)

In [None]:
# joblib.dump(log_reg, "log_reg.pkl")

In [None]:
# The model is loaded from disk rather than re-fitted (the fit/dump cells above
# are commented out). Note the commented dump writes "log_reg.pkl" to the
# working directory while this reads from "pkls/". joblib.load unpickles the
# file, so only load model files from a trusted source.
log_reg = joblib.load("pkls/log_reg.pkl")

In [None]:
test_df = pd.read_csv("data/fraudTest.csv")

In [None]:
test_df.columns

In [None]:
test_df.drop(axis=1, columns=["Unnamed: 0", "cc_num", "merchant", "first", "last", "street", "city", "zip", "lat", "long", "city_pop", "job", "trans_num", "unix_time", "merch_lat", "merch_long"], inplace=True)

In [None]:
test_df.head()

In [None]:
test_df["region"] = test_df.state.map(state_to_region)

In [None]:
test_df["trans_year"] = pd.Series(pd.to_datetime(test_df.trans_date_trans_time)).dt.year
test_df["dob"] = pd.Series(pd.to_datetime(test_df.dob)).dt.year

age = pd.Series(test_df.trans_year - test_df.dob)
test_df["age"] = age

bins = [10, 18, 35, 60, 100]
labels = ["14-18", "18-35", "35-60", "60+"]
test_df["age_group"] = pd.cut(test_df['age'], bins=bins, labels=labels, right=True)

In [None]:
test_df

In [None]:
test_df.drop(axis=1, columns=["dob", "trans_year", "trans_date_trans_time", "age", "state"], inplace=True)

In [None]:
test_df

In [None]:
# Encode the test set with the encoder that was already fitted on the training
# data (`enc`). Fitting a fresh encoder here would derive the one-hot columns
# from the test set's own categories, so their number or order could silently
# differ from the features the models were trained on.
ohe_cols = ['category', 'gender', 'age_group', 'region']
num_features = ['amt', 'is_fraud']

# The name enc_test is kept for any later use; it is the training encoder.
enc_test = enc

# transform() (not fit_transform) reuses the training categories. With the
# encoder's default settings, a category never seen in training raises a
# ValueError instead of producing misaligned columns.
temp = enc_test.transform(test_df[ohe_cols])
ohc_df_test = pd.DataFrame(temp, columns=enc_test.get_feature_names_out())
ohc_df_test.reset_index(drop=True, inplace=True)
ohc_df_test

In [None]:
# Put amt/is_fraud next to the one-hot columns, aligned on a fresh 0..n-1 index,
# then discard the indicator column for rows whose region is missing.
numeric_part_test = test_df[num_features].reset_index(drop=True)
enc_df_test = pd.concat([numeric_part_test, ohc_df_test], axis=1).drop(columns=['region_nan'])
enc_df_test

In [None]:
# enc_df_test.to_csv('enc_df_test.csv', encoding='utf-8', index=False)

In [None]:
X_test = enc_df_test.drop(columns=['is_fraud'], inplace=False)
y_test = enc_df_test['is_fraud']

In [None]:
y_test.value_counts()

In [None]:
# scaler = StandardScaler()
# X_test = scaler.fit_transform(X_test)

In [None]:
y_pred = log_reg.predict(X_test)
y_pred

In [None]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

In [None]:
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

y_pred = log_reg.predict(X_test)

# classification_report takes (y_true, y_pred) in that order. Passing the
# predictions first swaps the roles of truth and prediction, so the reported
# precision and recall are exchanged and support counts the predicted classes.
print(classification_report(y_test, y_pred))

In [None]:
# Column 1 of predict_proba is the probability of the model's second class.
y_pred_prob = log_reg.predict_proba(X_test)[:, 1]

# ROC curve coordinates and the area under the curve for those scores.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)

# One record per model, in the shape show_roc_plot expects.
result_array_log = [dict(fpr=fpr, tpr=tpr, roc_auc=roc_auc, model='Log Reg')]

In [None]:
def show_roc_plot(array):
    """Draw the ROC curve of every model in ``array`` on one figure.

    Parameters
    ----------
    array : iterable of dict
        One dict per model with the keys ``'fpr'`` and ``'tpr'`` (the curve
        coordinates), ``'roc_auc'`` (a number, shown with two decimals in the
        legend) and ``'model'`` (the name shown in the legend).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.figure()
    for ar in array:
        # Legend entry reads e.g. "Log Reg (area = 0.85)"; the parentheses are
        # balanced (the label previously ended with a stray ")").
        plt.plot(ar['fpr'], ar['tpr'], label=f'{ar["model"]} (area = {ar["roc_auc"]:.2f})')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='right')
    plt.show()

show_roc_plot(result_array_log)

In [None]:
# Confusion matrix as an annotated heatmap: rows are actual classes,
# columns are predicted classes.
cm_ax = sns.heatmap(conf_matrix, cmap='coolwarm', annot=True, fmt='d')
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')

In [None]:
# log_reg2 = LogisticRegression(penalty="l2", fit_intercept=False, random_state=42, class_weight="balanced")
# log_reg2.fit(X_train, y_train)

In [None]:
# joblib.dump(log_reg2, "log_reg2.pkl")

In [None]:
log_reg2 = joblib.load("pkls/log_reg2.pkl")

In [None]:
y_pred2 = log_reg2.predict(X_test)
y_pred2

In [None]:
accuracy = accuracy_score(y_test, y_pred2)
conf_matrix = confusion_matrix(y_test, y_pred2)
class_report = classification_report(y_test, y_pred2)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

In [None]:
y_pred_prob = log_reg2.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
result_array_log = [{'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'model':'Log Reg'}]

In [None]:
show_roc_plot(result_array_log)

In [None]:
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
plt.xlabel('Predicted')
plt.ylabel('Actual')

Although the model is now less biased towards wrongly classifying fraudulent transactions as non-fraudulent, the model has an underwhelming accuracy as demonstrated by the many false negatives and false positives.

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# rf = RandomForestClassifier(class_weight="balanced", random_state=42)
# rf.fit(X_train, y_train)

In [None]:
import joblib

In [None]:

# # save
# joblib.dump(rf, "randomForest.pkl")

In [None]:
rf = joblib.load("pkls/randomForest.pkl")

In [None]:
y_pred_random_forest = rf.predict(X_test)
y_pred_random_forest

In [None]:
accuracy = accuracy_score(y_test, y_pred_random_forest)
conf_matrix = confusion_matrix(y_test, y_pred_random_forest)
class_report = classification_report(y_test, y_pred_random_forest)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

In [None]:
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
plt.xlabel('Predicted')
plt.ylabel('Actual')

In [None]:
y_pred_prob = rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = roc_auc_score(y_test, y_pred_prob)
result_array_random_forest = [{'fpr': fpr, 'tpr': tpr, 'roc_auc': roc_auc, 'model':'Random Forest'}]

In [None]:
combine_array = result_array_log + result_array_random_forest

In [None]:
show_roc_plot(combine_array)

Other classifiers

In [None]:
logreg_coefs = log_reg2.coef_
logreg_coefs[0]

In [None]:
train_mse = mean_squared_error(y_train, log_reg2.predict(X_train))
test_mse = mean_squared_error(y_test, log_reg2.predict(X_test))
print(train_mse)
print(test_mse)

In [None]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge

In [None]:
pipe = Pipeline([('polyfeatures', PolynomialFeatures(degree = 1, include_bias = False)),
                      ('scaler', StandardScaler()),
                     ('log_reg', LogisticRegression())])
pipe.fit(X_train, y_train)
logreg_coefs = pipe.named_steps['log_reg'].coef_
logreg_coefs[0]

In [None]:
feature_names = pipe.named_steps['polyfeatures'].get_feature_names_out()
reg_df = pd.DataFrame({'feature': feature_names, 'coef': logreg_coefs[0]})
reg_df.sort_values('coef', key=abs, ascending=False)