In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# -----------------------------------------
# Step 1: Load & Explore Data
# -----------------------------------------
data = pd.read_csv('/content/fraud_simulation_dataset.csv')

print("Dataset Preview:")
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print("Missing values:\n", data.isnull().sum())
print("isFraud distribution:\n", data['isFraud'].value_counts())

# Simulate fraud if none exist (e.g., set 5% of transactions as fraud).
# The rows to relabel are picked at random, independently of their features.
if data['isFraud'].sum() == 0:
    num_records = len(data)
    # Always relabel at least one row, even for very small datasets.
    num_fraud = max(1, int(0.05 * num_records))
    # Seeded generator so the simulated labels are identical on every run,
    # consistent with the random_state=42 used by the later steps.
    rng = np.random.default_rng(42)
    fraud_indices = rng.choice(data.index.to_numpy(), size=num_fraud, replace=False)
    data.loc[fraud_indices, 'isFraud'] = 1
    print(f"\nSimulated {num_fraud} fraud transactions.")
    print("New isFraud distribution:\n", data['isFraud'].value_counts())

# -----------------------------------------
# Step 2: Data Cleaning
# -----------------------------------------
# Fill missing numeric values with the column mean. The target column is left
# out on purpose: averaging a 0/1 label would invent fractional classes.
numeric_columns = data.select_dtypes(include=['number']).columns
feature_columns = [col for col in numeric_columns if col != 'isFraud']
data[feature_columns] = data[feature_columns].fillna(data[feature_columns].mean())

# Rows without a label cannot be used for training or evaluation.
data = data.dropna(subset=['isFraud'])

# Drop irrelevant columns.
data = data.drop(columns=['nameOrig', 'nameDest'])

# Remove outliers (extreme amounts beyond the 99th percentile).
# .copy() makes the filtered result an independent frame, so adding columns to
# `data` afterwards does not trigger pandas' SettingWithCopyWarning.
data = data[data['amount'] < data['amount'].quantile(0.99)].copy()

# -----------------------------------------
# Step 3: Feature Engineering & Selection
# -----------------------------------------
# Change in the originating account's balance (old balance minus new balance).
data['balanceDiff'] = data['oldbalanceOrg'].sub(data['newbalanceOrig'])

# Replace the categorical 'type' column with integer codes.
le = LabelEncoder()
data['type'] = le.fit_transform(data['type'])

# Split the frame into predictors (X) and the target (y).
X = data.drop(columns='isFraud')
y = data['isFraud']

# Prune highly correlated features.
# Only the strict upper triangle of the absolute correlation matrix is kept,
# so each pair is inspected once and a column is flagged when it correlates
# above 0.85 with any column that precedes it.
corr_matrix = X.corr().abs()
strict_upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(strict_upper_mask)
exceeds_limit = (upper_triangle > 0.85).any(axis=0).to_numpy()
to_drop = upper_triangle.columns[exceeds_limit].tolist()
print("\nDropping columns due to high correlation:", to_drop)
X = X.drop(columns=to_drop)

# -----------------------------------------
# Step 4: Train-Test Split (Before SMOTE)
# -----------------------------------------
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label so that the rare fraud class keeps the same proportion in both parts;
# an unstratified split can leave the test set with too few (or no) frauds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("\nClass distribution in training set before SMOTE:")
print(y_train.value_counts())

# -----------------------------------------
# Step 5: Apply SMOTE on Training Data Only
# -----------------------------------------
# Oversampling is fitted on the training split only; X_test / y_test are not
# touched here.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

resampled_class_counts = pd.Series(y_train_sm).value_counts()
print("\nClass distribution in training set after SMOTE:")
print(resampled_class_counts)

# -----------------------------------------
# Step 6: Train XGBoost Model
# -----------------------------------------
# Ratio of class-0 to class-1 rows, measured on the resampled labels.
# NOTE(review): if SMOTE has balanced the classes this ratio is ~1, which
# would make the weight a no-op -- confirm this is intended.
class_counts = y_train_sm.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

xgb_clf = xgb.XGBClassifier(random_state=42, scale_pos_weight=scale_pos_weight)
xgb_clf.fit(X_train_sm, y_train_sm)

# -----------------------------------------
# Step 7: Evaluate the Model
# -----------------------------------------
# Hard 0/1 predictions feed the report and the confusion matrix.
y_pred = xgb_clf.predict(X_test)
# ROC-AUC is a ranking metric and needs continuous scores: use the predicted
# probability of the positive class. Passing thresholded labels would reduce
# the ROC curve to a single operating point.
y_proba = xgb_clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)
print("\nAUC-ROC:", roc_auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Feature Importance Plot
xgb.plot_importance(xgb_clf)
plt.show()

# -----------------------------------------
# Step 8: Hyperparameter Tuning (Optional)
# -----------------------------------------
# SMOTE runs *inside* the cross-validated pipeline, so each CV iteration
# oversamples only its own training folds. Searching on data that was
# resampled beforehand would place synthetic points, interpolated from
# training-fold samples, into the validation folds and inflate the score.
# The 'clf__' prefix routes each parameter to the classifier step.
param_grid = {
    'clf__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'clf__max_depth': [3, 5, 7, 9],
    'clf__n_estimators': [50, 100, 200, 300],
    'clf__scale_pos_weight': [scale_pos_weight],
}
tuning_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb.XGBClassifier(random_state=42)),
])
random_search = RandomizedSearchCV(estimator=tuning_pipeline,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   scoring='roc_auc',
                                   cv=3,
                                   verbose=0,
                                   random_state=42)
# Fit on the original (un-resampled) training split; the pipeline resamples.
random_search.fit(X_train, y_train)
#print("\nBest Parameters from Hyperparameter Tuning:")
#print(random_search.best_params_)
print("Best Score from Hyperparameter Tuning:", random_search.best_score_)

# ------------------------------------------------------------------------------
# Additional Section: Proactive Fraud Prevention & Monitoring Simulation
# ------------------------------------------------------------------------------

def simulate_real_time_detection(model, transactions_df, threshold=0.8, *, delay=0.05):
    """Replay transactions one at a time and print an alert for likely fraud.

    Each row of ``transactions_df`` is scored on its own, as if it had just
    arrived. Rows are passed to the model as single-row DataFrame slices
    rather than via ``iterrows()``: ``iterrows()`` converts every row to a
    Series with one common dtype, and taking ``.values`` from it discards the
    column names.

    Args:
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is read as the fraud probability.
        transactions_df: DataFrame of transactions, one per row. Its index
            values are used as the transaction identifiers in alerts.
        threshold: An alert is printed when the fraud probability is strictly
            greater than this value.
        delay: Seconds to pause after each transaction to mimic a live feed.
            Pass 0 to disable the pause.

    Returns:
        None. Alerts are written to stdout.
    """
    print("\n--- Real-Time Fraud Detection Simulation ---")
    for position, idx in enumerate(transactions_df.index):
        # Positional one-row slice: stays a DataFrame (dtypes and names kept).
        single_transaction = transactions_df.iloc[position:position + 1]
        fraud_prob = model.predict_proba(single_transaction)[0, 1]
        if fraud_prob > threshold:
            print(f"Alert: Transaction {idx} flagged as potential fraud with probability {fraud_prob:.2f}")
        # In a real system, instead of sleeping, this function would process each incoming transaction continuously.
        if delay > 0:
            time.sleep(delay)

# Replay a small slice of the held-out data through the simulated detector:
# only the first 20 test transactions are used as the sample.
simulate_sample = X_test.head(20)
simulate_real_time_detection(xgb_clf, simulate_sample, threshold=0.8)

# ------------------------------------------------------------------------------
# Monitoring & Evaluation Dashboard (Conceptual)
# ------------------------------------------------------------------------------
# In a production system, you would deploy monitoring tools (e.g., Grafana dashboards)
# to continuously track these key performance indicators:
# - ROC-AUC and other classification metrics over time.
# - Alert volume and the ratio of true alerts vs. false alarms.
# - Time-to-detection for potential fraud events.
#
# For example, using a logging framework:
#
# import logging
# logging.basicConfig(level=logging.INFO, filename="fraud_detection.log")
# logging.info("New fraud alert generated at {timestamp}: Transaction {id} with probability {p}".format(
#     timestamp=time.strftime("%Y-%m-%d %H:%M:%S"), id=transaction_id, p=fraud_prob))
#
# This log feeds into a dashboard where analysts can review model performance and adjust thresholds.
# ------------------------------------------------------------------------------


