# Credit Card Transactions Fraud Detection

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Read the pre-split training and test transaction files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [None]:
# Show the raw column names of the training frame as a plain list.
print(train_data.columns.tolist())

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
# Preview the last rows of the training set.
train_data.tail()

In [None]:
# Preview the last rows of the test set.
test_data.tail()

In [None]:
# Summarise the training frame's columns (dtypes and non-null counts).
train_data.info()

In [None]:
# Summarise the test frame's columns (dtypes and non-null counts).
test_data.info()

In [None]:
# Count missing values per column in the raw training data.
train_data.isnull().sum()

In [None]:
# Count missing values per column in the raw test data.
test_data.isnull().sum()

In [None]:
# Count training rows per is_fraud label to see the class balance.
train_data['is_fraud'].value_counts()

In [None]:
# Count test rows per is_fraud label to see the class balance.
test_data['is_fraud'].value_counts()


0 - Normal transaction

1 - Fraudulent transaction


In [None]:
# Show the dtype of every training column.
train_data.dtypes

In [None]:
# Show the dtype of every test column.
test_data.dtypes

In [None]:
# Split the training rows by label so each class can be inspected on its own.
legit_train = train_data.loc[train_data["is_fraud"] == 0]
fraud_train = train_data.loc[train_data["is_fraud"] == 1]

In [None]:
# Row/column counts of the legitimate and fraudulent training subsets, one per line.
print(legit_train.shape, fraud_train.shape, sep="\n")

In [None]:
# Split the test rows by label, mirroring the split done on the training set.
legit_test = test_data.loc[test_data["is_fraud"] == 0]
fraud_test = test_data.loc[test_data["is_fraud"] == 1]

In [None]:
# Row/column counts of the legitimate and fraudulent test subsets, one per line.
print(legit_test.shape, fraud_test.shape, sep="\n")

### Feature Engineering

In [None]:
# List the training columns whose dtype is object, category, or datetime.
# Nothing is converted here; this only identifies the columns that still
# need a numeric encoding.
non_numeric_cols = train_data.select_dtypes(
    include=["object", "category", "datetime"],
).columns
print("Non-numeric columns:", non_numeric_cols)

In [None]:
# Same listing for the test frame. This rebinds non_numeric_cols, so the
# training result from the previous cell is no longer held in that name.
non_numeric_cols = test_data.select_dtypes(
    include=["object", "category", "datetime"],
).columns
print("Non-numeric columns:", non_numeric_cols)

In [None]:
# Re-read only the text/date columns of the training file so their
# cardinality can be inspected separately from train_data.
non_numeric_traindata = pd.read_csv(
    'fraudTrain.csv',
    usecols=[
        'trans_date_trans_time',
        'merchant',
        'category',
        'first',
        'last',
        'gender',
        'street',
        'city',
        'state',
        'job',
        'dob',
        'trans_num',
    ],
)
non_numeric_traindata.head()

In [None]:
# Re-read only the text/date columns of the test file so their
# cardinality can be inspected separately from test_data.
non_numeric_testdata = pd.read_csv(
    'fraudTest.csv',
    usecols=[
        'trans_date_trans_time',
        'merchant',
        'category',
        'first',
        'last',
        'gender',
        'street',
        'city',
        'state',
        'job',
        'dob',
        'trans_num',
    ],
)
non_numeric_testdata.head()

In [None]:
# Report how many distinct values (missing counted as one) each text/date
# column of the training file holds.
for col in non_numeric_traindata:
    print(col, ': ', non_numeric_traindata[col].nunique(dropna=False), 'labels')

In [None]:
# Report how many distinct values (missing counted as one) each text/date
# column of the test file holds.
for col in non_numeric_testdata:
    print(col, ': ', non_numeric_testdata[col].nunique(dropna=False), 'labels')

In [None]:
# Remove the columns that are not used as model features from the training frame.
train_data = train_data.drop(
    columns=[
        'trans_num',
        'unix_time',
        'Unnamed: 0',
        'first',
        'last',
        'street',
    ],
)


In [None]:
# Remove the same unused columns from the test frame.
test_data = test_data.drop(
    columns=[
        'trans_num',
        'unix_time',
        'Unnamed: 0',
        'first',
        'last',
        'street',
    ],
)


In [None]:
# Confirm which columns remain in the training frame after the drop.
print(train_data.columns.tolist())

In [None]:
# One-hot encode gender; drop_first=True drops the first level so only the
# remaining indicator column(s) are kept.
# NOTE(review): the test frame is encoded in a separate call, so the resulting
# columns line up only if both files contain the same gender values - confirm.
train_data = pd.get_dummies(train_data, columns=['gender'], drop_first=True)

In [None]:
# One-hot encode gender in the test frame with the same settings as training.
# NOTE(review): this is independent of the training encoding, so the columns
# match only if both files contain the same gender values - confirm.
test_data = pd.get_dummies(test_data, columns=['gender'], drop_first=True)

In [None]:
# Parse the transaction timestamp, then append hour / day-of-month / weekday /
# month columns derived from it (in that order).
train_data = train_data.assign(
    trans_date_trans_time=lambda df: pd.to_datetime(df['trans_date_trans_time']),
    hour=lambda df: df['trans_date_trans_time'].dt.hour,
    day=lambda df: df['trans_date_trans_time'].dt.day,
    weekday=lambda df: df['trans_date_trans_time'].dt.weekday,
    month=lambda df: df['trans_date_trans_time'].dt.month,
)


In [None]:
# Parse the transaction timestamp, then append hour / day-of-month / weekday /
# month columns derived from it (in that order), as done for the training frame.
test_data = test_data.assign(
    trans_date_trans_time=lambda df: pd.to_datetime(df['trans_date_trans_time']),
    hour=lambda df: df['trans_date_trans_time'].dt.hour,
    day=lambda df: df['trans_date_trans_time'].dt.day,
    weekday=lambda df: df['trans_date_trans_time'].dt.weekday,
    month=lambda df: df['trans_date_trans_time'].dt.month,
)


In [None]:
# Replace date of birth with an age feature. Age is the difference between the
# transaction year and the birth year only; month and day are not considered.
train_data['age'] = (
    train_data['trans_date_trans_time'].dt.year
    - pd.to_datetime(train_data.pop('dob')).dt.year
)


In [None]:
# Replace date of birth with an age feature. Age is the difference between the
# transaction year and the birth year only; month and day are not considered.
test_data['age'] = (
    test_data['trans_date_trans_time'].dt.year
    - pd.to_datetime(test_data.pop('dob')).dt.year
)


In [None]:
# The raw timestamp is no longer needed once the derived columns exist.
del train_data['trans_date_trans_time']

In [None]:
# The raw timestamp is no longer needed once the derived columns exist.
del test_data['trans_date_trans_time']

In [None]:
# Frequency-encode the high-cardinality text columns. The relative frequency of
# each value is computed from the training frame and kept in freq_maps so the
# same mapping can be applied to other data later.
freq_maps = {
    col: train_data[col].value_counts(normalize=True)
    for col in ['city', 'state', 'job', 'merchant', 'category']
}
for col, freq in freq_maps.items():
    # Append "<col>_encoded" and remove the original text column.
    train_data[col + '_encoded'] = train_data.pop(col).map(freq)

In [None]:
# Encode the test frame with the frequencies stored in freq_maps. A value with
# no entry in the stored mapping comes out as NaN.
for col in ['city', 'state', 'job', 'merchant', 'category']:
    test_data[col + '_encoded'] = test_data.pop(col).map(freq_maps[col])

In [None]:
# Preview the training frame after feature engineering.
train_data.head()

In [None]:
# Preview the test frame after feature engineering.
test_data.head()

In [None]:
# Count missing values per column in the engineered training frame.
train_data.isnull().sum()

In [None]:
# Count missing values per column in the engineered test frame.
test_data.isnull().sum()

In [None]:
# A test value with no entry in the stored frequency mapping is left as NaN by
# the encoding step. Treat it as frequency 0. Every "*_encoded" column is
# covered, not just city and job, so the cell stays correct if another column
# gains an unmapped value.
for col in [name for name in test_data.columns if name.endswith('_encoded')]:
    test_data[col] = test_data[col].fillna(0)

In [None]:
# Preview the test frame after filling the missing encoded values.
test_data.head()

In [None]:
# Re-check missing values per column in the test frame after the fill.
test_data.isnull().sum()

In [None]:
# Separate the training label from the feature columns.
y_train = train_data["is_fraud"]
X_train = train_data.drop(columns=["is_fraud"])

In [None]:
# Separate the test label from the feature columns.
y_test = test_data["is_fraud"]
X_test = test_data.drop(columns=["is_fraud"])

In [None]:
# Standardise the features. The scaler is fitted on the training features only
# and the fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Candidate classifiers keyed by the display name used in later output.
# All three are built with class_weight="balanced"; the two tree-based models
# also fix random_state=42.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced",
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced",
        n_estimators=100,
        random_state=42,
    ),
}

In [None]:
# Fit every candidate on the scaled training features and report progress.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    print(f"{name} trained.")

In [None]:
# Evaluate every fitted model on the scaled test features.
for name, model in models.items():
    # Hard class predictions feed the per-class precision/recall/F1 report.
    y_pred = model.predict(X_test)
    # Second column of predict_proba; presumably the probability of the
    # positive (fraud) class - confirm against model.classes_.
    y_proba = model.predict_proba(X_test)[:,1]
    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))
    # ROC-AUC is computed from the probability scores, not the hard predictions.
    print("ROC-AUC:", roc_auc_score(y_test, y_proba))


### Analysis:
**Logistic Regression** achieved the highest recall (77%) and the best ROC-AUC score (0.84), indicating strong ability to detect fraud cases. However, it suffers from extremely low precision (1%), meaning a high number of false positives.

**Decision Tree** provided a better balance between precision and recall, resulting in the highest F1-score (0.19) for the fraud class. It also showed a very high overall accuracy (98%), although accuracy is dominated by the majority class on data this imbalanced, and a strong ROC-AUC score (0.83).

**Random Forest**, while commonly effective, performed the worst in this scenario. Its recall (53%) and ROC-AUC (0.64) were significantly lower than the other two models, making it less suitable for this imbalanced fraud detection task.



In [None]:
import pickle as pk

In [None]:
# Serialize each trained model to its own pickle file. The file name is the
# model's display name lower-cased with spaces replaced by underscores, plus
# ".pkl" (for example "Random Forest" -> "random_forest.pkl").
for model_name, trained_model in models.items():
    file_name = model_name.lower().replace(" ", "_") + ".pkl"
    # The context manager closes the file even if pickling raises part-way.
    with open(file_name, "wb") as model_file:
        pk.dump(trained_model, model_file)


In [None]:
# Persist the fitted scaler so the same scaling can be reapplied to new data.
with open("scaler.pkl", "wb") as f:
    pk.dump(scaler, f)

In [None]:
# Persist the training-set frequency mappings so new data can be encoded the same way.
with open('freq_maps.pkl', 'wb') as f:
    pk.dump(freq_maps, f)

In [None]:
import numpy as np

In [None]:
# Smoke test: run every model on the first scaled test row. The row is wrapped
# in a list so predict() receives a single-sample batch.
test_input = [X_test[0]]
for name in models:
    model = models[name]
    prediction = model.predict(test_input)
    print(f"{name} prediction: {prediction}")