In [2]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, auc, average_precision_score)
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from statsmodels.stats.outliers_influence import variance_inflation_factor
import shap

ModuleNotFoundError: No module named 'xgboost'

In [None]:
DATA_PATH = Path('/content/Fraud.csv')
OUTPUT_DIR = Path('fraud_analysis_outputs')
OUTPUT_DIR.mkdir(exist_ok=True)
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Please place 'Fraud.csv' in the same folder.")
df = pd.read_csv(DATA_PATH, low_memory=False)
print('Data loaded. Shape:', df.shape)
print(df.head().to_string())

Data loaded. Shape: (6362620, 11)
   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36  M1979787155             0.0             0.0        0               0
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72  M2044282225             0.0             0.0        0               0
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   C553264065             0.0             0.0        1               0
3     1  CASH_OUT    181.00   C840083671          181.0            0.00    C38997010         21182.0             0.0        1               0
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86  M1230701703             0.0             0.0        0               0


In [None]:
print('\n--- Data Info ---')
print(df.info())
print('\n--- Missing values (count & %) ---')
miss = df.isnull().sum()
print(pd.concat([miss, (miss/len(df))*100], axis=1).rename(columns={0:'missing_count',1:'missing_pct'}).sort_values('missing_count', ascending=False).head(20))


--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None

--- Missing values (count & %) ---
                missing_count  missing_pct
step                        0          0.0
type                        0          0.0
amount                      0          0.0
nameOrig                    0          0.0
oldbalanceOrg               0          0.0
newbalanceOrig              0          0.0
nameDest                    0          0.0
oldbalanceDest              0          0.0

In [None]:
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print('\nNumeric columns:', num_cols)
print('Categorical columns:', cat_cols)
thresh = 0.8
high_missing = [(c, df[c].isnull().mean()) for c in df.columns if df[c].isnull().mean() > thresh]
if high_missing:
    print('\nDropping columns with >80% missing:')
    for c, pct in high_missing:
        print(c, pct)
    df.drop(columns=[c for c,p in high_missing], inplace=True)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
df[num_cols] = num_imputer.fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
print('\nAfter imputation missing values:', df.isnull().sum().sum())


Numeric columns: ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest', 'isFraud', 'isFlaggedFraud']
Categorical columns: ['type', 'nameOrig', 'nameDest']

After imputation missing values: 0


In [None]:
for c in num_cols:
    if df[c].nunique() > 10:
        Q1 = df[c].quantile(0.25)
        Q3 = df[c].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df[c] = np.where(df[c] < lower, lower, df[c])
        df[c] = np.where(df[c] > upper, upper, df[c])
print('\nOutliers capped using IQR method for numeric variables.')


Outliers capped using IQR method for numeric variables.


In [None]:
possible_id_cols = [c for c in df.columns if c.lower() in ('id','transactionid','transaction_id','cust_id','customer_id')]
print('\nPossible ID-like columns to drop for modeling:', possible_id_cols)
df_model = df.copy()
df_model.drop(columns=possible_id_cols, inplace=True, errors='ignore')
num_for_vif = df_model.select_dtypes(include=[np.number]).copy()
target_col_candidates = [c for c in df_model.columns if c.lower() in ('isfraud','fraud','label','target')]
if target_col_candidates:
    target_col = target_col_candidates[0]
    print('Found target column:', target_col)
    if target_col in num_for_vif.columns:
        num_for_vif.drop(columns=[target_col], inplace=True)
else:
    target_col = None
num_for_vif = num_for_vif.loc[:, num_for_vif.nunique() > 1]
def compute_vif(dfv):
    vif_data = pd.DataFrame()
    vif_data['feature'] = dfv.columns
    vif_data['VIF'] = [variance_inflation_factor(dfv.values, i) for i in range(dfv.shape[1])]
    return vif_data.sort_values('VIF', ascending=False)
if not num_for_vif.empty:
    vif_df = compute_vif(num_for_vif.fillna(0))
    print('\nVIF top features:')
    print(vif_df.head(20).to_string(index=False))
    high_vif = vif_df[vif_df['VIF'] > 10]['feature'].tolist()
    print('\nFeatures with VIF > 10 (will be dropped):', high_vif)
    df_model.drop(columns=high_vif, inplace=True, errors='ignore')
else:
    print('No numeric features for VIF check.')


Possible ID-like columns to drop for modeling: []
Found target column: isFraud

VIF top features:
       feature       VIF
newbalanceDest 36.858666
oldbalanceDest 33.025454
newbalanceOrig  6.489256
 oldbalanceOrg  5.923001
        amount  2.680130
          step  1.758521
isFlaggedFraud  1.000037

Features with VIF > 10 (will be dropped): ['newbalanceDest', 'oldbalanceDest']


In [None]:
if target_col is None:
    for c in df_model.columns:
        if 'fraud' in c.lower() and df_model[c].nunique() <= 2:
            target_col = c
            break
if target_col is None:
    raise ValueError('Target column not identified automatically. Please set `target_col` variable manually.')
print('\nUsing target column:', target_col)
X = df_model.drop(columns=[target_col])
y = df_model[target_col].astype(int)
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
low_card_cats = [c for c in cat_cols if X[c].nunique() <= 20]
high_card_cats = [c for c in cat_cols if X[c].nunique() > 20]
print('Low-cardinality categoricals (one-hot):', low_card_cats)
print('High-cardinality categoricals (dropping):', high_card_cats)
X.drop(columns=high_card_cats, inplace=True)
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), X.select_dtypes(include=[np.number]).columns.tolist()),
    ('cat', OneHotEncoder(handle_unknown='ignore'), low_card_cats)
], remainder='drop')


Using target column: isFraud
Low-cardinality categoricals (one-hot): ['type']
High-cardinality categoricals (dropping): ['nameOrig', 'nameDest']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print('\nTrain shape:', X_train.shape, 'Test shape:', X_test.shape)


Train shape: (5090096, 6) Test shape: (1272524, 6)


In [None]:
pipeline = Pipeline(steps=[('pre', preprocessor), ('model', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))])
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 6],
    'model__learning_rate': [0.1, 0.01]
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
from sklearn.base import BaseEstimator, ClassifierMixin, clone
class SMOTE_XGB(BaseEstimator, ClassifierMixin):
    def __init__(self, estimator=None):
        self.estimator = estimator if estimator is not None else XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    def fit(self, X, y):
        sm = SMOTE(random_state=42)
        Xs, ys = sm.fit_resample(X, y)
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(Xs, ys)
        return self
    def predict(self, X):
        return self.estimator_.predict(X)
    def predict_proba(self, X):
        return self.estimator_.predict_proba(X)
X_train_trans = preprocessor.fit_transform(X_train)
X_test_trans = preprocessor.transform(X_test)
from sklearn.model_selection import ParameterGrid
grid = list(ParameterGrid({
    'n_estimators':[100,200], 'max_depth':[3,6],'learning_rate':[0.1,0.01]
}))
print('\nRunning grid search over', len(grid), 'models (this may take a while)')
best_score = -np.inf
best_params = None
best_clf = None
for params in grid:
    clf = SMOTE_XGB(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42,
                                  n_estimators=params['n_estimators'], max_depth=params['max_depth'], learning_rate=params['learning_rate']))
    clf.fit(X_train_trans, y_train)
    yhat_proba = clf.predict_proba(X_test_trans)[:,1]
    ap = average_precision_score(y_test, yhat_proba)
    print('params:', params, 'AP:', round(ap,4))
    if ap > best_score:
        best_score = ap
        best_params = params
        best_clf = clf
print('\nBest params by AP:', best_params, 'Best AP:', best_score)


Running grid search over 8 models (this may take a while)
params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100} AP: 0.4691
params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200} AP: 0.5842
params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100} AP: 0.6944
params: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200} AP: 0.7919
params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100} AP: 0.1082
params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200} AP: 0.296
params: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100} AP: 0.3782
params: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 200} AP: 0.3885

Best params by AP: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200} Best AP: 0.7918747510654136


In [None]:
y_pred = best_clf.predict(X_test_trans)
y_proba = best_clf.predict_proba(X_test_trans)[:,1]
print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
print('ROC-AUC:', round(roc_auc_score(y_test, y_proba),4))
print('PR-AUC (Average Precision):', round(average_precision_score(y_test, y_proba),4))
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
metrics = {
    'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
    'classification_report': classification_report(y_test, y_pred, output_dict=True),
    'roc_auc': roc_auc_score(y_test, y_proba),
    'pr_auc': average_precision_score(y_test, y_proba)
}
import json
with open(OUTPUT_DIR / 'metrics.json', 'w') as f:
    json.dump(metrics, f, indent=2)
print('\nMetrics saved to', OUTPUT_DIR / 'metrics.json')


Confusion Matrix:
[[1263767    7114]
 [      4    1639]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00   1270881
           1       0.19      1.00      0.32      1643

    accuracy                           0.99   1272524
   macro avg       0.59      1.00      0.66   1272524
weighted avg       1.00      0.99      1.00   1272524

ROC-AUC: 0.9994
PR-AUC (Average Precision): 0.7919

Metrics saved to fraud_analysis_outputs/metrics.json


In [None]:
num_feats = preprocessor.transformers_[0][2]
cat_encoder = preprocessor.transformers_[1][1]
cat_cols_ohe = []
if low_card_cats:
    cat_ohe = preprocessor.named_transformers_['cat']
    try:
        cat_feature_names = cat_ohe.get_feature_names_out(low_card_cats)
    except Exception:
        cat_feature_names = cat_ohe.get_feature_names(low_card_cats)
    cat_cols_ohe = list(cat_feature_names)
feature_names = list(num_feats) + cat_cols_ohe
try:
    importances = best_clf.estimator_.feature_importances_
    fi = pd.DataFrame({'feature':feature_names, 'importance':importances})
    fi = fi.sort_values('importance', ascending=False)
    fi.to_csv(OUTPUT_DIR / 'feature_importance.csv', index=False)
    print('\nTop features:')
    print(fi.head(20).to_string(index=False))
except Exception as e:
    print('Could not extract feature importances:', e)


Top features:
       feature  importance
newbalanceOrig    0.508577
  type_PAYMENT    0.206182
 type_TRANSFER    0.120683
 oldbalanceOrg    0.090435
        amount    0.033906
 type_CASH_OUT    0.016612
          step    0.011091
  type_CASH_IN    0.005861
    type_DEBIT    0.004264
isFlaggedFraud    0.002388


In [None]:
try:
    explainer = shap.TreeExplainer(best_clf.estimator_)
    shap_values = explainer.shap_values(X_test_trans)
    mean_abs_shap = np.abs(shap_values).mean(axis=0)
    shap_df = pd.DataFrame({'feature':feature_names, 'mean_abs_shap':mean_abs_shap}).sort_values('mean_abs_shap', ascending=False)
    shap_df.to_csv(OUTPUT_DIR / 'shap_summary.csv', index=False)
    print('\nSHAP top features:')
    print(shap_df.head(20).to_string(index=False))
    plt.figure(figsize=(8,6))
    shap.summary_plot(shap_values, features=X_test_trans, feature_names=feature_names, show=False)
    plt.tight_layout()
    plt.savefig(OUTPUT_DIR / 'shap_summary_plot.png', dpi=150)
    plt.close()
    print('SHAP plot saved to', OUTPUT_DIR / 'shap_summary_plot.png')
except Exception as e:
    print('SHAP explanation failed:', e)


SHAP top features:
       feature  mean_abs_shap
 oldbalanceOrg       4.045572
newbalanceOrig       3.590510
  type_PAYMENT       1.823965
        amount       1.530852
          step       1.184622
  type_CASH_IN       0.764720
 type_CASH_OUT       0.397444
 type_TRANSFER       0.262346
    type_DEBIT       0.024236
isFlaggedFraud       0.004996
SHAP plot saved to fraud_analysis_outputs/shap_summary_plot.png


In [None]:
key_factors_text = []
try:
    top_features = fi.head(10)['feature'].tolist()
    key_factors_text.append('Top features by model importance: ' + ', '.join(top_features))
except:
    key_factors_text.append('Top features could not be computed.')
key_factors_text.append('\nDo these factors make sense?\n- Provide domain reasoning: e.g., high transaction amounts, unusual device/country, rapid frequency, mismatched billing/shipping info, and unusual merchant categories often indicate fraud. Use business context to validate features.')
key_factors_text.append('\nPrevention recommendations:\n1. Rule-based blocking for extremely high-risk patterns.\n2. Multi-factor authentication for high-value transactions.\n3. Real-time scoring and manual review queue for medium-risk cases.\n4. Regular model retraining and feature drift monitoring.\n5. Logging and secure storage for audit trails.\n6. Role-based access and secure data pipelines.\n')
key_factors_text.append('\nHow to determine if these measures work:\n- A/B test: route a % of traffic to the new infra and compare fraud rate, false positive rate, conversion rate.\n- Monitor before/after metrics: fraud loss amount, detection rate, manual review workload, customer friction (drop-offs).\n- Set statistical significance tests for changes in key metrics.\n- Implement monitoring dashboards with alerting on drift in feature distributions and model performance metrics.\n')
with open(OUTPUT_DIR / 'key_factors_and_recommendations.txt', 'w') as f:
    f.write('\n\n'.join(key_factors_text))
print('\nKey factors and recommendations saved to', OUTPUT_DIR / 'key_factors_and_recommendations.txt')
print('\nAll done. Check the fraud_analysis_outputs directory for artifacts: metrics.json, feature_importance.csv, shap_summary.csv, shap_summary_plot.png, key_factors_and_recommendations.txt')


Key factors and recommendations saved to fraud_analysis_outputs/key_factors_and_recommendations.txt

All done. Check the fraud_analysis_outputs directory for artifacts: metrics.json, feature_importance.csv, shap_summary.csv, shap_summary_plot.png, key_factors_and_recommendations.txt
