<a href="https://colab.research.google.com/github/Sunayana921/Data-Science-credit-card-project/blob/main/Creditcard_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Load dataset
df = pd.read_csv("creditcard.csv")

# Simulate additional features
np.random.seed(42)
num_users = 5000
num_merchants = 1000
locations = ['NY', 'LA', 'CHI', 'SF', 'HOU', 'MIA', 'SEA', 'ATL']

df['UserID'] = np.random.choice([f"U{i}" for i in range(num_users)], size=len(df))
df['MerchantID'] = np.random.choice([f"M{i}" for i in range(num_merchants)], size=len(df))
df['TransactionLocation'] = np.random.choice(locations, size=len(df))
df['UserHomeLocation'] = np.random.choice(locations, size=len(df))
df['LocationMismatch'] = (df['TransactionLocation'] != df['UserHomeLocation']).astype(int)
df['UserTxCount'] = df.groupby('UserID').cumcount()
df['MerchantAvgAmtDiff'] = df['Amount'] - df.groupby('MerchantID')['Amount'].transform('mean')

# Feature engineering
df['Hour'] = (df['Time'] // 3600) % 24
df['TxCountPerHour'] = df.groupby('Hour')['Amount'].transform('count')
df['Amount_Z'] = (df['Amount'] - df['Amount'].mean()) / (df['Amount'].std() + 1e-9)

# Select features
feature_cols = ['Amount', 'Hour', 'TxCountPerHour', 'Amount_Z',
                'LocationMismatch', 'UserTxCount', 'MerchantAvgAmtDiff'] +                [col for col in df.columns if col.startswith('V')]

X = df[feature_cols]
y = df['Class']

# Balance classes via under-sampling
fraud_df = df[df['Class'] == 1]
normal_df = df[df['Class'] == 0].sample(n=len(fraud_df)*3, random_state=42)
balanced_df = pd.concat([fraud_df, normal_df]).sample(frac=1, random_state=42)

X_bal = balanced_df[feature_cols]
y_bal = balanced_df['Class']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.3, stratify=y_bal, random_state=42)

# Model training
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))

Confusion Matrix:
[[441   2]
 [ 23 125]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       443
           1       0.98      0.84      0.91       148

    accuracy                           0.96       591
   macro avg       0.97      0.92      0.94       591
weighted avg       0.96      0.96      0.96       591

ROC AUC Score: 0.9793713013238972
