In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the transactions CSV from the working directory into a DataFrame.
df=pd.read_csv("creditcard.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print column dtypes and non-null counts to spot missing values.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Bar chart of how many rows carry each value of the Class column.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="Class", palette="coolwarm", ax=ax)
ax.set_title("Class Distribution: Fraud vs. Genuine Transactions")
plt.show()

In [None]:
# Histogram (50 bins) of the Amount column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["Amount"], bins=50, kde=True, ax=ax)
ax.set_title("Transaction Amount Distribution")
plt.show()

In [None]:
# Pairwise correlation of every column, rendered as a heatmap.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Share of rows in each class, expressed as a percentage.
class_pct = df["Class"].value_counts(normalize=True).mul(100)
print(class_pct)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the Amount column and overwrite it in df.
# NOTE(review): the scaler is fitted on every row of df, before the train/test
# split performed later in this notebook, so held-out rows contribute to the
# fitted mean/variance. Consider fitting on the training split only.
scaler = StandardScaler()
df["Amount"] = scaler.fit_transform(df[["Amount"]])
# Display the rescaled column.
df["Amount"]

In [None]:
# Remove the Time column before modelling.
# Reassigning instead of dropping in place, together with errors="ignore",
# keeps this cell re-runnable: executing it a second time on the already
# trimmed frame no longer raises KeyError.
df = df.drop(columns=["Time"], errors="ignore")
df.head()


In [None]:
# Separate the target column from the remaining feature columns.
y = df["Class"]
X = df.drop("Class", axis=1)

# Report the dimensions of the feature matrix.
print("Dataset shape after preprocessing:", X.shape)


In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# Hold out 20% of the rows for evaluation; stratify=y keeps the class ratio
# the same in the training and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Oversample with SMOTE using the training split only; X_test / y_test are
# not passed in, so the evaluation data keeps its original class balance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [None]:
# Class counts of the training labels before and after resampling.
counts_before = y_train.value_counts()
counts_after = pd.Series(y_train_resampled).value_counts()
print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [None]:
# Fit logistic regression on the SMOTE-resampled training data.
# max_iter is raised from scikit-learn's default of 100: with this many
# resampled rows and features that are mostly unscaled, the default budget
# can stop the solver before it converges (ConvergenceWarning), leaving
# coefficients that are not the true optimum.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_resampled, y_train_resampled)


In [None]:
# Predict class labels for the held-out test features (not resampled).
lr_preds = lr_model.predict(X_test)
# Display the predicted label array.
lr_preds

In [None]:
# Classification report of the logistic-regression predictions on the test split.
print("Logistic Regression Performance:\n", classification_report(y_test, lr_preds))

In [None]:
# Train a 100-tree random forest on the resampled training data; the fixed
# seed makes the fitted forest reproducible.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X=X_train_resampled, y=y_train_resampled)

In [None]:
# Predict class labels for the held-out test features (not resampled).
rf_preds = rf_model.predict(X_test)
# Display the predicted label array.
rf_preds

In [None]:
# Classification report of the random-forest predictions on the test split.
print("Random Forest Performance:\n", classification_report(y_test, rf_preds))


In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.
        title: Text used as the figure title.

    The y-axis is labelled "Actual" and the x-axis "Predicted"; both axes use
    the tick labels "Genuine" and "Fraud", in that order. Each cell is
    annotated with its integer count.
    """
    matrix = confusion_matrix(y_true, y_pred)
    class_names = ["Genuine", "Fraud"]

    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="coolwarm",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(title)
    plt.show()

In [None]:
# Confusion matrices on the test split, one figure per model.
plot_confusion_matrix(y_test, lr_preds, "Logistic Regression - Confusion Matrix")
plot_confusion_matrix(y_test, rf_preds, "Random Forest - Confusion Matrix")

In [None]:
def plot_roc_curve(y_true, model, title, X=None):
    """Plot the ROC curve of a fitted classifier and show its AUC in the legend.

    Args:
        y_true: Ground-truth labels matching the rows of ``X``.
        model: Fitted estimator exposing ``predict_proba``.
        title: Text used as the figure title.
        X: Feature rows to score. When omitted, the notebook-level ``X_test``
            is used, which is what this function always scored before the
            parameter existed. Pass it explicitly when ``y_true`` is not
            ``y_test`` so the labels and scores cannot get out of step.
    """
    if X is None:
        # Backward-compatible default: fall back to the global test features.
        X = X_test
    y_probs = model.predict_proba(X)[:, 1]  # Get fraud probabilities
    fpr, tpr, _ = roc_curve(y_true, y_probs)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}", color="blue")
    # Diagonal reference line, labelled "Random Guess" in the legend.
    plt.plot([0,1], [0,1], "k--", label="Random Guess")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# ROC curves on the test split, one figure per model.
plot_roc_curve(y_test, lr_model, "Logistic Regression - ROC Curve")
plot_roc_curve(y_test, rf_model, "Random Forest - ROC Curve")