In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score ,confusion_matrix, roc_auc_score

from xgboost import XGBClassifier

In [2]:
df_transactions = pd.read_csv('/kaggle/input/credit-card-transactions-dataset/credit_card_transactions.csv',parse_dates=['trans_date_trans_time','dob'])

In [3]:
# Feature Engineering
# Extract features from 'trans_date_trans_time'
df_transactions['trans_date_trans_time'] = pd.to_datetime(df_transactions['trans_date_trans_time'])
df_transactions['transaction_hour'] = df_transactions['trans_date_trans_time'].dt.hour
df_transactions['transaction_day'] = df_transactions['trans_date_trans_time'].dt.day
df_transactions['transaction_month'] = df_transactions['trans_date_trans_time'].dt.month
df_transactions['transaction_day_of_week'] = df_transactions['trans_date_trans_time'].dt.dayofweek

#unix_time
df_transactions['unix_timestamp'] = pd.to_datetime(df_transactions['unix_time'], unit='s')
df_transactions['unix_transaction_hour'] = df_transactions['unix_timestamp'].dt.hour
df_transactions['unix_transaction_day'] = df_transactions['unix_timestamp'].dt.day
df_transactions['unix_transaction_month'] = df_transactions['unix_timestamp'].dt.month
df_transactions['unix_transaction_day_of_week'] = df_transactions['unix_timestamp'].dt.dayofweek

# Calculate the mean transaction amount per cardholder
df_transactions['mean_amt_per_cardholder'] = df_transactions.groupby('cc_num')['amt'].transform('mean')
# Calculate the relative transaction amount
df_transactions['amt_relative_to_mean'] = df_transactions['amt'] / df_transactions['mean_amt_per_cardholder']

# Identify categorical columns
categorical_features = ['merchant', 'category','gender','street','city','state','job']#,'merch_zipcode']

# Apply one-hot encoding to categorical columns
df_transactions = pd.get_dummies(df_transactions, columns=categorical_features)

# Split data into features and target variable
X = df_transactions.drop(['merch_zipcode','is_fraud'], axis=1)#'cc_num','merch_zipcode','unix_time'
y = df_transactions['is_fraud']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale numerical features (example: standardization)
#numerical_features = ['amt', 'zip', 'lat', 'long','city_pop','merch_lat','merch_long']#'cc_num',,'unix_time'
numerical_features = ['amt', 'zip', 'lat', 'long', 'city_pop', 'merch_lat', 'merch_long', 'transaction_hour', 
                      'transaction_day', 'transaction_month', 'transaction_day_of_week','unix_transaction_hour',
                      'unix_transaction_day','unix_transaction_month','unix_transaction_day_of_week','amt_relative_to_mean'
                     ]
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

# Exclude the 'timestamp' column from features
X_train = X_train.drop(['trans_date_trans_time','trans_num'], axis=1)
X_test = X_test.drop(['trans_date_trans_time','trans_num'], axis=1)

# Separate numeric and categorical columns
numeric_features_train = X_train.select_dtypes(include=['float64', 'int64']).columns
categorical_features_train = X_train.select_dtypes(include=['object']).columns

# Create transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[('numeric', StandardScaler())])  # You can replace StandardScaler with other scalers if needed
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Use ColumnTransformer to apply transformers to the respective columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features_train),
        ('cat', categorical_transformer, categorical_features_train)
    ])

In [4]:
# Append the classifier to the transformers in a pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))])

# Train the model (if not already trained)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Evaluate the model on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
classification_rep_test = classification_report(y_test, y_pred_test)

# Print the evaluation metrics for the test data
print(f"Test Data Accuracy: {accuracy_test:.2f}")
print("\nTest Data Confusion Matrix:\n", conf_matrix_test)
print("\nTest Data Classification Report:\n", classification_rep_test)

Test Data Accuracy: 1.00

Test Data Confusion Matrix:
 [[257765     50]
 [   587    933]]

Test Data Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257815
           1       0.95      0.61      0.75      1520

    accuracy                           1.00    259335
   macro avg       0.97      0.81      0.87    259335
weighted avg       1.00      1.00      1.00    259335



In [5]:
# Create the model with XGBClassifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]), random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_test = model.predict(X_test)

# Evaluate the model on the test data
accuracy_test = accuracy_score(y_test, y_pred_test)
conf_matrix_test = confusion_matrix(y_test, y_pred_test)
classification_rep_test = classification_report(y_test, y_pred_test)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

# Print the evaluation metrics for the test data
print(f"Test Data Accuracy: {accuracy_test:.2f}")
print("\nTest Data Confusion Matrix:\n", conf_matrix_test)
print("\nTest Data Classification Report:\n", classification_rep_test)
print(f"\nROC AUC Score: {roc_auc:.2f}")

Test Data Accuracy: 0.97

Test Data Confusion Matrix:
 [[249748   8067]
 [    72   1448]]

Test Data Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.97      0.98    257815
           1       0.15      0.95      0.26      1520

    accuracy                           0.97    259335
   macro avg       0.58      0.96      0.62    259335
weighted avg       0.99      0.97      0.98    259335


ROC AUC Score: 0.99
