In [None]:
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [None]:
# Function for memory analysis and DataFrame information
def analyze_memory(df):
    """Print a memory and data-quality summary of a DataFrame to stdout.

    The report contains, in order: the total memory footprint in MB, the
    output of ``df.info()``, and the count of missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("\n=== Memory Analysis and DataFrame Information ===")
    # deep=True makes pandas measure the actual size of object columns
    # instead of only counting pointer sizes; bytes -> MB via 1024 ** 2.
    memory = df.memory_usage(deep=True).sum() / 1024 ** 2
    print(f"Total memory used by DataFrame: {memory:.2f} MB")
    print("\nDetailed DataFrame information:")
    # info() writes its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("\nMissing values per column:")
    print(df.isnull().sum())


In [None]:
# Read the transactions CSV into the working DataFrame
dataset_path = r"creditcard.csv"
df = pd.read_csv(dataset_path)
# Report memory footprint, dtypes and missing values for the loaded frame
analyze_memory(df)


In [None]:
# Exploratory Data Analysis (EDA): summary statistics, one row per column
print("\nStatistical description:")
summary_stats = df.describe().T
print(summary_stats)
# Side-by-side panels: class counts on the left, Amount histogram on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
class_ax, amount_ax = axes
# Left panel: how many rows fall into each value of the target column
sns.countplot(data=df, x='Class', ax=class_ax)
# Right panel: histogram of Amount with a KDE overlay
sns.histplot(df['Amount'], bins=50, kde=True, ax=amount_ax)
for panel, panel_title in zip((class_ax, amount_ax), ('Class Distribution', 'Amount Distribution')):
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between all columns, rendered as a heatmap
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=False,
    cmap='coolwarm',
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Normalization of the Amount variable (zero mean, unit variance), written
# back into df so every later cell sees the scaled column.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split performed later in the notebook, so test-set statistics
# leak into the scaling. Fitting on the training rows only would remove the
# leak but requires restructuring the split cell as well.
scaler = StandardScaler()
df['Amount'] = scaler.fit_transform(df[['Amount']])
# Separating independent and dependent variables. This is done after the
# scaling step: df.drop returns a new frame, so building X earlier would
# leave it holding the unscaled Amount values.
X = df.drop('Class', axis=1)
y = df['Class']
# Analysis of class imbalance (relative frequency of each class)
print("\nClass Distribution (Imbalanced):")
print(y.value_counts(normalize=True))


In [None]:
# Target is the 'Class' column; every other column is a feature
target_column = 'Class'
y = df[target_column]
X = df.drop(columns=[target_column])
# 70/30 split, stratified on the target so both parts keep the class ratio
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Oversample with SMOTE using the training split only; the test split is
# left untouched.
smote = SMOTE(random_state=42)
resampled_train = smote.fit_resample(X_train, y_train)
X_train_res, y_train_res = resampled_train


In [None]:
# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Cross-validate, fit and score a classifier, returning all metrics.

    The model is first scored with 5-fold cross-validation on the training
    data (using the estimator's default scorer), then fitted on the full
    training data and evaluated on the test data.

    Note: cross-validation runs on the training data exactly as passed in.
    If that data has already been oversampled, synthetic rows end up in the
    validation folds and the fold scores will be optimistic.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place, so the caller's instance is trained afterwards.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the final metrics.

    Returns
    -------
    dict
        Keys: ``"scores"`` (per-fold CV scores), ``"mean_scores"``,
        ``"report"`` (classification report), ``"confusion_matrix"``,
        ``"roc_auc"`` and ``"execution_time"`` (seconds for the whole call).
    """
    # perf_counter is a monotonic clock meant for measuring elapsed time.
    start = time.perf_counter()
    # Cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5)
    mean_scores = scores.mean()
    # Training
    model.fit(X_train, y_train)
    # Predictions and evaluation on the test set
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    # The local must not be called `confusion_matrix`: assigning to that
    # name would make it local to this function and the call on the
    # right-hand side would raise UnboundLocalError.
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Calculation of AUC-ROC from the second probability column
    positive_scores = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, positive_scores)
    execution_time = time.perf_counter() - start
    return {
        "scores": scores,
        "mean_scores": mean_scores,
        "report": report,
        "confusion_matrix": conf_matrix,
        "roc_auc": roc_auc,
        "execution_time": execution_time
    }
# Candidate classifiers, keyed by the label used in the printed output
models = {
    "Random Forest": RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
    ),
    "Neural Network": MLPClassifier(
        random_state=42,
        max_iter=300,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=15,
        verbose=True,
    ),
    "XGBoost": XGBClassifier(
        eval_metric='logloss',
        random_state=42,
    ),
}
# Train on the resampled training data, score on the untouched test split
results = {}
for name, model in models.items():
    print(f"\nEvaluating model: {name}")
    results[name] = evaluate_model(model, X_train_res, y_train_res, X_test, y_test)
# Printed label -> key in the metrics dict, in display order
report_fields = (
    ("Cross-Validation Scores:", 'scores'),
    ("Mean Scores:", 'mean_scores'),
    ("Classification Report:\n", 'report'),
    ("Confusion Matrix:\n", 'confusion_matrix'),
    ("AUC-ROC:\n", 'roc_auc'),
    ("Execution Time (s):\n", 'execution_time'),
)
# One section per model, one line (or block) per metric
for name, result in results.items():
    print(f"\n=== {name} ===")
    for label, key in report_fields:
        print(label, result[key])


In [None]:
# Saving the Random Forest model as an example.
# The estimator stored in `models` was fitted in place by evaluate_model,
# so the file written here contains the trained model.
# A single constant keeps the written path and the printed path in sync.
MODEL_PATH = 'fraud_detection_model_rf.pkl'
joblib.dump(models["Random Forest"], MODEL_PATH)
print(f"\nRandom Forest model saved as '{MODEL_PATH}'")
