# **Importing Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns

# Global seaborn look for every figure in the notebook.
sns.set_style('whitegrid')
sns.set_palette('Set2')

# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation warnings from pandas/seaborn — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

# **Loading Data**

In [None]:
# Read the train and test CSVs; the 'Unnamed: 0' column is used as the
# DataFrame index instead of being kept as a regular column.
train = pd.read_csv('/kaggle/input/fraud-detection/fraudTrain.csv',index_col='Unnamed: 0')
test = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv',index_col='Unnamed: 0')

In [None]:
# Preview the first two rows of the training data.
train.head(2)

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# Column names available in the training data.
train.columns

In [None]:
# Columns removed from both splits before any analysis.
columns_drop = ["unix_time","merchant","first","street"]

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
train = train.drop(columns=columns_drop, errors="ignore")
test = test.drop(columns=columns_drop, errors="ignore")

**Dropping columns that are not useful for modelling**


# **EDA**

In [None]:
# (total number of missing cells, number of fully duplicated rows)
train.isna().sum().sum(),train.duplicated().sum()

**No null or duplicate values**

In [None]:
# Summary statistics, transposed so that each described column is one row.
train.describe().T

In [None]:
# Column dtypes, non-null counts and memory usage.
train.info()

**There are a lot of categorical features! So we need to encode them**

In [None]:
#Target Distribution

# Count each class and order by class value (0, then 1). Working on the
# Series directly avoids depending on the column name that
# value_counts().reset_index() produces, which differs between pandas versions.
class_counts = train["is_fraud"].value_counts().sort_index()

# Derive legend text from the class values instead of assuming their order.
class_names = {0: "Not Fraud", 1: "Fraud"}
labels = [class_names.get(value, str(value)) for value in class_counts.index]
explode = [0] * len(class_counts)

fig, ax = plt.subplots(dpi=120, figsize=(8, 4))
wedges, *_ = ax.pie(class_counts.tolist(),
                    labels=class_counts.tolist(),  # raw counts next to each wedge
                    autopct="%1.1f%%",
                    pctdistance=0.8,
                    explode=explode)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0.0, 0.0), 0.5, fc='white')
ax.add_artist(centre_circle)

ax.set_title("Target Distribution")
ax.legend(wedges, labels, loc="center", frameon=False)
plt.show();

**Highly imbalanced classes** Because of this, Oversampling/Undersampling can be used in the model.

In [None]:
# Kernel density of the transaction amount, one curve per value of is_fraud.
sns.kdeplot(
    data=train,
    x="amt",
    hue="is_fraud",
);

In [None]:
# 99th percentile of the amount; rows above it are left out of the plot so the
# long right tail does not squash the histogram. `pct99` is reused further down.
pct99 = train["amt"].quantile(0.99)
amt_below_pct99 = train[train["amt"] <= pct99]

# common_norm=False normalises each class on its own.
sns.histplot(
    data=amt_below_pct99,
    x="amt",
    hue="is_fraud",
    bins=30,
    stat="probability",
    common_norm=False,
);

**The probability distribution of the transaction amount differs between the two values of "is_fraud".**

In [None]:
import math

# Readable class names. The hue levels are labelled in the data itself so that
# seaborn builds the legend; overriding legend text by position afterwards can
# pair labels with the wrong colours and mislabels panels with a single class.
fraud_labels = {0: 'Not Fraud', 1: 'Fraud'}
hue_order = ['Not Fraud', 'Fraud']

categories = train['category'].unique()

# Near-square grid: floor(sqrt(n)) rows, as many columns as needed.
num_plots = len(categories)
num_rows = math.isqrt(num_plots)
num_cols = math.ceil(num_plots / num_rows)

# squeeze=False always returns a 2-D array of axes, so .ravel() also works
# when the grid is 1x1.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(
    5*num_cols, 5*num_rows), sharex=True, squeeze=False)

for ax, category in zip(axes.ravel(), categories):

    # Rows of this category, clipped at the 99th percentile of the amount
    # (pct99 is computed in an earlier cell).
    data_category = train[(train['category'] == category) & (train['amt'] <= pct99)]

    sns.histplot(x='amt', data=data_category,
                 hue=data_category['is_fraud'].map(fraud_labels).rename('Type'),
                 hue_order=hue_order, stat='probability',
                 common_norm=False, bins=30, ax=ax)

    ax.set_ylabel('Percentage in Each Type')
    ax.set_xlabel('Transaction Amount in USD')
    ax.set_title(f'{category}')

plt.tight_layout()
plt.show();

**The amount distributions differ by category, so the category should be included as a feature for modelling.**

In [None]:
import math

# Readable class names. The hue levels are labelled in the data itself so that
# seaborn builds the legend; overriding legend text by position afterwards can
# pair labels with the wrong colours and mislabels panels with a single class.
fraud_labels = {0: 'Not Fraud', 1: 'Fraud'}
hue_order = ['Not Fraud', 'Fraud']

# The 12 most frequent last names.
lastnames = train['last'].value_counts().index[:12]

# Near-square grid: floor(sqrt(n)) rows, as many columns as needed.
num_plots = len(lastnames)
num_rows = math.isqrt(num_plots)
num_cols = math.ceil(num_plots / num_rows)

# squeeze=False always returns a 2-D array of axes, so .ravel() also works
# when the grid is 1x1.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(
    5*num_cols, 5*num_rows), sharex=True, squeeze=False)

for ax, lastname in zip(axes.ravel(), lastnames):

    # Rows of this last name, clipped at the 99th percentile of the amount
    # (pct99 is computed in an earlier cell).
    data_lastname = train[(train['last'] == lastname) & (train['amt'] <= pct99)]

    sns.histplot(x='amt', data=data_lastname,
                 hue=data_lastname['is_fraud'].map(fraud_labels).rename('Type'),
                 hue_order=hue_order, stat='probability',
                 common_norm=False, bins=30, ax=ax)

    ax.set_ylabel('Percentage in Each Type')
    ax.set_xlabel('Transaction Amount in USD')
    ax.set_title(f'{lastname}')

plt.tight_layout()
plt.show();

**The amount distributions also differ by last name, so the last name should be included as a feature for modelling.**

In [None]:
# One-element tuple holding the number of distinct states.
train["state"].value_counts().shape

In [None]:
# Number of rows per is_fraud value within each state.
train.groupby("state")["is_fraud"].value_counts()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig,axb = plt.subplots(ncols=2,nrows=1,figsize=(15, 8))

#Gender Distribution
# Left panel: share of all transactions per gender (counts rows, fraud or not).
explode = [0.1, 0.1]
gender_counts = train.groupby('gender')['is_fraud'].count()
gender_counts.plot.pie(explode=explode, autopct="%1.1f%%", ax=axb[0])
axb[0].set_ylabel("")  # the default y-label would be the misleading 'is_fraud'
axb[0].set_title("Share of Transactions by Gender")

# Right panel: transaction counts per gender, split by fraud status.
ax = sns.countplot(x="gender", hue="is_fraud", data=train,ax=axb[1])

# Add values on top of each bar. Labelling the bar containers (rather than
# every patch on the axes) annotates only real bars, and "%d" prints the
# counts as integers instead of floats.
for container in ax.containers:
    ax.bar_label(container, fmt="%d", padding=3)

# Set labels and title on the count plot explicitly.
ax.set_title("Distribution of Gender with Fraud Status")
ax.set_xlabel("Gender")
ax.set_ylabel("Count")

# Show the plot
plt.show()

**Females make more transactions, but transactions made by males are more likely to be fraudulent.**

In [None]:
# Age of the card holder in the year of the transaction. Using the transaction
# year instead of today's year keeps the feature reproducible: it no longer
# changes depending on when the notebook is run.
train['age'] = (pd.to_datetime(train['trans_date_trans_time']).dt.year
                - pd.to_datetime(train['dob']).dt.year)
test['age'] = (pd.to_datetime(test['trans_date_trans_time']).dt.year
               - pd.to_datetime(test['dob']).dt.year)

# Label the hue levels in the data so seaborn builds the legend itself, rather
# than overriding legend text by position and relying on the drawing order.
fraud_labels = {0: 'Not Fraud', 1: 'Fraud'}

ax = sns.kdeplot(x='age', data=train,
                 hue=train['is_fraud'].map(fraud_labels).rename('Type'),
                 hue_order=['Not Fraud', 'Fraud'], common_norm=False)
ax.set_xlabel('Credit Card Holder Age')
ax.set_ylabel('Density')
ax.set_xticks(np.arange(0, 110, 10))
ax.set_title('Age Distribution');

**Age has only a minor impact on the target.**

In [None]:
# Hour of day (0-23) at which each transaction happened.
train['hour'] = pd.to_datetime(train['trans_date_trans_time']).dt.hour
test['hour'] = pd.to_datetime(test['trans_date_trans_time']).dt.hour

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5), sharey=True)

# discrete=True gives one unit-wide bar per integer hour, centred on its tick.
# bins=24 over the range 0-23 produced bins of width 23/24 that drift away
# from the tick marks.
sns.histplot(x='hour', data=train[train["is_fraud"] == 0],
             stat="density", discrete=True, ax=ax1)

sns.histplot(x='hour', data=train[train["is_fraud"] == 1],
             stat="density", discrete=True, ax=ax2, color="orange")

ax1.set_title("Not Fraud")
ax2.set_title("Fraud")

ax1.set_xlabel("Hour of Day")
ax2.set_xlabel("Hour of Day")

ax1.set_xticks(np.arange(24))
ax2.set_xticks(np.arange(24));

**It is clear that fraud transactions mainly occur at midnight.**

# **Preparing Data For Modelling**

In [None]:
# Dtype of every column; object columns still need encoding before modelling.
train.dtypes

In [None]:
# Binary gender flag: 1 where the value is "M", 0 for anything else.
# NOTE: not idempotent — a second run sees only 0/1 and maps everything to 0.
for frame in (train, test):
    frame["gender"] = frame["gender"].eq("M").astype("int64")

In [None]:
import category_encoders as ce

def apply_woe(train, columns, target_col, test=None):
    """Add Weight-of-Evidence (WOE) encoded versions of categorical columns.

    For each column in ``columns`` a WOE encoder is fitted on ``train`` only
    and the encoded values are stored in a new ``<col>_WOE`` column. When
    ``test`` is given, it is encoded with the encoder fitted on ``train``, so
    the test target is never used and both splits share one mapping.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing ``columns`` and ``target_col``. Modified in
        place (new columns are added) and also returned.
    columns : list of str
        Names of the categorical columns to encode.
    target_col : str
        Name of the binary target column in ``train``.
    test : pandas.DataFrame, optional
        Hold-out data containing ``columns``. Modified in place when given.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        ``train`` when ``test`` is None (the original behaviour), otherwise
        ``(train, test)``.
    """
    for col in columns:
        new_col_name = f"{col}_WOE"

        # A fresh encoder per column, restricted to that column explicitly.
        woe = ce.WOEEncoder(cols=[col])
        train[new_col_name] = woe.fit_transform(train[[col]], train[target_col])[col]

        if test is not None:
            # transform() only: reuse the mapping learned from the training data.
            test[new_col_name] = woe.transform(test[[col]])[col]

    if test is None:
        return train
    return train, test


columns_to_encode = ["category", "state", "city", "job","last"]
target_column = "is_fraud"

# Fit on train, apply to test. Fitting a second encoder on the test target
# would leak the labels being predicted into the test features.
train, test = apply_woe(train, columns_to_encode, target_column, test=test)

In [None]:
# Everything that must not be fed to the model: the target itself, raw
# date/id columns, and the categorical columns that now have *_WOE versions.
col_drop = [
    "is_fraud",
    "trans_date_trans_time",
    "trans_num",
    "dob",
    "category",
    "state",
    "city",
    "job",
    "last",
]

# Targets first, then the feature matrices.
y_train = train["is_fraud"]
y_test = test["is_fraud"]
X_train = train.drop(columns=col_drop)
X_test = test.drop(columns=col_drop)

In [None]:
# Check the dtypes of the model features.
X_train.dtypes

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# The sampler is configured but intentionally not applied: the resampling call
# stays commented out and the "undersampled" names alias the full training data.
rus = RandomUnderSampler(random_state=23, sampling_strategy=0.1)
# X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)
X_undersampled = X_train
y_undersampled = y_train

**Undersampling was tried, but the model scores higher without it, so the resampling step is commented out.**

In [None]:
# y_train.value_counts(),y_undersampled.value_counts()

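**When undersampling is enabled, normal transactions are reduced from about 12 lakh (1.2 million) to about 75k. It is currently disabled, so the class counts are unchanged.**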

In [None]:
# Two random rows of the feature matrix (no seed set, so rows differ per run).
X_undersampled.sample(2)

In [None]:
# Pairwise correlations between all features and the target.
features_with_target = pd.concat([X_undersampled, y_undersampled], axis=1)
correlations = features_with_target.corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlations, annot=True, fmt='.2f', ax=ax)

**Amount has the highest correlation with fraud; the other columns have only a negligible correlation with it.**

In [None]:
# Candidate columns to drop; the experiment is disabled, so the list is empty.
# col_d = ["zip","lat","long","city_pop","merch_lat", "merch_long", "cc_num"]
col_d = []

# Final training target and feature matrices with the same columns removed
# from both splits.
y_train = y_undersampled
X_train, X_test = (frame.drop(columns=col_d) for frame in (X_undersampled, X_test))

**Dropping these columns was tried, but the score is higher when they are kept.**

In [None]:
# First two rows of the final (still unscaled) training features.
X_train.head(2)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training data only, then apply the
# same scaling to both splits. After this cell X_train and X_test are NumPy
# arrays, no longer DataFrames.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Sanity check: row counts must match between features and targets per split.
X_train.shape,y_train.shape,X_test.shape,y_test.shape

In [None]:
#classification report
from sklearn.metrics import classification_report,confusion_matrix,precision_score,accuracy_score

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed (the same value the undersampler uses) makes the forest, and
# every score reported below, reproducible across runs. n_jobs=-1 only
# parallelises tree building across all cores.
rf = RandomForestClassifier(random_state=23, n_jobs=-1)
rf.fit(X_train,y_train)

In [None]:
# Score of the fitted forest on the test set.
# NOTE(review): for classifiers this is accuracy, which says little on its
# own for a target this imbalanced — see the precision/recall report.
rf.score(X_test,y_test)

In [None]:
# Hard class predictions for the test set.
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

# Confusion-matrix cells are integer counts, so format them with "d"
# rather than as two-decimal floats.
cnf = confusion_matrix(y_test,y_pred)
ax = sns.heatmap(cnf,annot=True,fmt="d")

# Rows are the true classes, columns the predicted classes.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label");

**Additional Metrics, ROC Curve and Feature Importance**

In [None]:
# Headline metrics on the test set, printed one per line.
for metric_name, metric_fn in (("Accuracy", accuracy_score),
                               ("Precision", precision_score)):
    print(f"{metric_name}: ", metric_fn(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score, auc

# Predicted probability of the second class (column 1) for every test row.
y_prob = rf.predict_proba(X_test)[:, 1]

# ROC points over all thresholds, and the area under that curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Model curve against the diagonal baseline.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
        label='Random Guess')
ax.set_xlabel('False Positive Rate (FPR)')
ax.set_ylabel('True Positive Rate (TPR)')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()

In [None]:
importances = rf.feature_importances_

# X_train became a NumPy array when it was scaled, so the names come from the
# frame it was built from. The same col_d drop is applied so the names match
# the model's features even when col_d is not empty.
feature_names = X_undersampled.drop(columns=col_d).columns
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots(figsize=(12, 5))
forest_importances.sort_values(ascending=False).plot.bar(ax=ax)
ax.set_ylabel("Feature Importance")
fig.tight_layout();

# Conclusion

In this problem, it is crucial to look at the precision score in addition to the accuracy score, because the dataset is highly imbalanced.

Since our problem is about fraudulent transactions, a model with few false negatives (transactions that are predicted as non-fraud but are actually fraudulent), i.e. a model with high recall, is better.

In this problem, the precision and recall metrics are the most important for the model.