In [None]:
# Attach Google Drive to the Colab filesystem so later cells can read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import os

# Root of the mounted Drive under which the project folders are looked up.
drive_base = '/content/drive/MyDrive'

# Folder holding the EDA notebook, expressed as path components below drive_base.
eda_subdirs = ('data', 'raw', 'Capstone 2 - Data Wrangling')
eda_folder = os.path.join(drive_base, *eda_subdirs)

# Full path of the EDA notebook itself.
eda_notebook_path = os.path.join(eda_folder, 'EDA new-capstone.ipynb')
print(f'EDA notebook path: {eda_notebook_path}')

EDA notebook path: /content/drive/MyDrive/data/raw/Capstone 2 - Data Wrangling/EDA new-capstone.ipynb


In [None]:
## Step 2: Load raw CSV files and merge
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
import zipfile

# Raw data archive on Drive and the local directory it is unpacked into.
zip_path = '/content/drive/MyDrive/Capstone1/Capstone 2 - Data Wrangling/ieee-fraud-detection_project/data/raw/Archive.zip'
extract_path = '/content/data_extracted'
os.makedirs(extract_path, exist_ok=True)

# Unpack every member of the archive into extract_path.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Show what was extracted. A bare `os.listdir(...)` in the middle of a cell is
# silently discarded (only the last expression of a cell is displayed), so the
# listing is printed explicitly.
print("Extracted files:", sorted(os.listdir(extract_path)))

# Derive the CSV locations from extract_path so they cannot drift apart.
transaction_path = os.path.join(extract_path, 'train_transaction.csv')
identity_path = os.path.join(extract_path, 'train_identity.csv')

print("train_transaction.csv exists:", os.path.exists(transaction_path))
print("train_identity.csv exists:", os.path.exists(identity_path))

train_transaction.csv exists: True
train_identity.csv exists: True


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [None]:
import zipfile
import pandas as pd
import numpy as np
import os

# 1) Archive location on Drive and local extraction directory
zip_path = '/content/drive/MyDrive/Capstone1/Capstone 2 - Data Wrangling/ieee-fraud-detection_project/data/raw/Archive.zip'
extract_path = '/content/fraud_data/'

# 2) Paths of the two CSVs this notebook needs
transaction_path = os.path.join(extract_path, 'train_transaction.csv')
identity_path = os.path.join(extract_path, 'train_identity.csv')

# 3) Extract unless BOTH CSVs are already present. Checking only that the
#    directory exists would skip extraction after an interrupted or partial
#    run and make the read_csv calls below fail.
if not (os.path.exists(transaction_path) and os.path.exists(identity_path)):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

# 4) Load transaction and identity CSVs
trans = pd.read_csv(transaction_path)
iden = pd.read_csv(identity_path)

# 5) Left join on TransactionID to keep all transaction rows.
#    validate="many_to_one" raises if TransactionID is duplicated in the
#    identity table, which would otherwise silently duplicate transactions.
df = trans.merge(iden, on="TransactionID", how="left", validate="many_to_one")

# 6) Split into fraud and non-fraud
fraud_df = df[df["isFraud"] == 1]
nonfraud_df = df[df["isFraud"] == 0]

print(f"Fraud count: {len(fraud_df):,}")
print(f"Non-fraud count: {len(nonfraud_df):,}")

# 7) Undersample non-fraud rows to a fixed multiple of the fraud count.
#    Change the ratio to 1, 2, 3, ... depending on available RAM.
NONFRAUD_TO_FRAUD_RATIO = 2
n_fraud = len(fraud_df)
# Cap at the rows actually available: sampling without replacement raises
# ValueError when n exceeds the population size.
n_nonfraud_to_sample = min(n_fraud * NONFRAUD_TO_FRAUD_RATIO, len(nonfraud_df))

nonfraud_sampled = nonfraud_df.sample(n=n_nonfraud_to_sample, random_state=42)

# 8) Combine both classes
combined_df = pd.concat([fraud_df, nonfraud_sampled], axis=0).reset_index(drop=True)

# 9) Shuffle so the classes are interleaved
combined_df = combined_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

print(f"Final combined dataset shape: {combined_df.shape}")
print(f"Fraud ratio:\n{combined_df['isFraud'].value_counts(normalize=True)}")


Fraud count: 20,663
Non-fraud count: 569,877
Final combined dataset shape: (61989, 434)
Fraud ratio:
isFraud
0    0.666667
1    0.333333
Name: proportion, dtype: float64


Original class imbalance:

- Total transactions: 590,540
- Fraudulent: 20,663 (≈ 3.5%)
- Non-fraudulent: 569,877 (≈ 96.5%)

Undersampling strategy:

- Kept all 20,663 fraud cases.
- Randomly sampled 41,326 non-fraud cases (2 × fraud count).

Final combined dataset:

- 61,989 transactions, 434 columns (432 features plus `TransactionID` and the `isFraud` target)
- Fraud = 33.3 %
- Non-fraud = 66.7 %

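This rebalanced dataset reduces computational overhead and gives a more balanced class distribution, helping downstream classification algorithms learn to detect fraud without being overwhelmed by the majority class. Note that the resulting 33.3 % fraud rate is far higher than the original ≈ 3.5 %, so the sample is no longer representative of real-world fraud prevalence.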

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE

# ────────────────────────────────────────────────────────────────────────────────
# 1) Assume `combined_df` is already in memory (from merging, sampling, etc.)
#    If not, load it here. For example:
# combined_df = pd.read_csv("/path/to/your/combined_df.csv")
# ────────────────────────────────────────────────────────────────────────────────

# 2) Build the feature matrix and the label vector.
#    The target and the row identifier are both excluded from the features.
non_feature_cols = ["isFraud", "TransactionID"]
X = combined_df.drop(non_feature_cols, axis=1)
y = combined_df["isFraud"]

In [None]:
# 3) Remove features whose share of missing values exceeds 90%
missing_pct = X.isnull().mean(axis=0)
high_missing_cols = list(missing_pct.index[missing_pct.gt(0.90)])
print(f"Dropping {len(high_missing_cols)} columns with > 90% missing:\n{high_missing_cols}\n")
X = X.drop(high_missing_cols, axis=1)

# 4) Partition the remaining columns by dtype:
#    int64/float64 columns are handled as numeric, object columns as categorical.
numeric_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

print(f"Numeric features (n={len(numeric_cols)}): {numeric_cols[:10]} ...")
print(f"Categorical features (n={len(cat_cols)}): {cat_cols[:10]} ...\n")


Dropping 9 columns with > 90% missing:
['id_07', 'id_08', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27']

Numeric features (n=394): ['TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2'] ...
Categorical features (n=29): ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5'] ...



In [None]:
# 5) One preprocessing pipeline per column type

# 5a) Numeric columns: fill missing values with the column median.
#     To also standardise, append ("scaler", StandardScaler()) to this list.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
]
numeric_transformer = Pipeline(numeric_steps)

# 5b) Categorical columns: missing values become the literal "__MISSING__",
#     then every category is mapped to an integer code; categories not seen
#     during fit are encoded as -1.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
cat_transformer = Pipeline(categorical_steps)


In [None]:

# 6) Route each column group to its pipeline; columns in neither list are dropped
column_routes = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", cat_transformer, cat_cols),
]
preprocessor = ColumnTransformer(column_routes, remainder="drop")

# 7) Hold out 20% for testing, keeping the isFraud proportions equal in both parts
split_options = dict(test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

train_class_share = y_train.value_counts(normalize=True)
print(f"Before preprocessing → Train shape: {X_train.shape},  Test shape: {X_test.shape}")
print(f"Train class distribution:\n{train_class_share}\n")

Before preprocessing → Train shape: (49591, 423),  Test shape: (12398, 423)
Train class distribution:
isFraud
0    0.666673
1    0.333327
Name: proportion, dtype: float64



- Dropped 9 columns with >90% missing values.
- Numeric features (394) imputed with median; categorical features (29) imputed with constant and ordinal encoded.
- Data split stratified on isFraud with 80% train (49,591 samples) and 20% test (12,398 samples).
- Training set class distribution: 66.7% non-fraud, 33.3% fraud.

Preprocessing ensures clean, encoded data with preserved class balance, ready for modeling.

In [None]:
# 8) Fit the preprocessor on the training split only, then apply it to both
#    splits so no statistics from the test set leak into the imputers/encoder.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep  = preprocessor.transform(X_test)

print(f"After preprocessing → X_train_prep shape: {X_train_prep.shape}")
print(f"                       X_test_prep  shape: {X_test_prep.shape}\n")

# 9) Oversample the minority class on the training set ONLY.
#    Plain SMOTE interpolates linearly between neighbours, which would create
#    fractional, meaningless values in the ordinal-encoded categorical columns.
#    SMOTENC interpolates the numeric columns and picks the most frequent
#    category among the neighbours for the categorical ones.
from imblearn.over_sampling import SMOTE, SMOTENC

# The ColumnTransformer outputs the "num" block first and the "cat" block
# second, so the categorical features occupy the trailing column positions.
cat_feature_idx = list(range(len(numeric_cols), len(numeric_cols) + len(cat_cols)))

if cat_feature_idx:
    smote = SMOTENC(categorical_features=cat_feature_idx, random_state=42)
else:
    # No categorical columns: SMOTENC is not applicable, plain SMOTE is correct.
    smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_prep, y_train)

print("After SMOTE →")
print(f"  X_train_smote shape: {X_train_smote.shape}")
print(f"  y_train_smote distribution:\n{pd.Series(y_train_smote).value_counts(normalize=True)}\n")


After preprocessing → X_train_prep shape: (49591, 423)
                       X_test_prep  shape: (12398, 423)

After SMOTE →
  X_train_smote shape: (66122, 423)
  y_train_smote distribution:
isFraud
0    0.5
1    0.5
Name: proportion, dtype: float64



- Preprocessed training data shape: (49,591, 423)

- Preprocessed test data shape: (12,398, 423)

After applying SMOTE on training set:

- Samples increased to 66,122
- Perfectly balanced classes: 50% fraud, 50% non-fraud

This approach balances the classes for model training while leaving the test set untouched, so evaluation is not contaminated by synthetic samples.

In [None]:

# Write the processed datasets to CSV in the current working directory.
# Another format (pickle, joblib, ...) can be swapped in here if preferred.
csv_outputs = [
    (pd.DataFrame(X_train_smote), "X_train_smote.csv"),
    (pd.Series(y_train_smote, name="isFraud"), "y_train_smote.csv"),
    (pd.DataFrame(X_test_prep), "X_test_processed.csv"),
    (pd.Series(y_test, name="isFraud"), "y_test.csv"),
]
for table, filename in csv_outputs:
    table.to_csv(filename, index=False)

summary_lines = [
    "Preprocessed files written:",
    "  - X_train_smote.csv, y_train_smote.csv",
    "  - X_test_processed.csv, y_test.csv",
]
print("\n".join(summary_lines))

Preprocessed files written:
  - X_train_smote.csv, y_train_smote.csv
  - X_test_processed.csv, y_test.csv


In [None]:
import os

# Drive folder for the processed artifacts; created if missing, left alone if present.
processed_root = '/content/drive/MyDrive/Capstone1/processed'
os.makedirs(processed_root, exist_ok=True)


In [None]:
import joblib

processed_dir = '/content/drive/MyDrive/Capstone1/processed'

# Artifacts to persist, in write order: the preprocessed splits (before SMOTE)
# first, then the SMOTE-balanced training data.
artifacts = {
    'X_train_prep': X_train_prep,
    'X_test_prep': X_test_prep,
    'y_train': y_train,
    'y_test': y_test,
    'X_train_smote': X_train_smote,
    'y_train_smote': y_train_smote,
}

# joblib.dump returns the list of files it wrote for each object.
written = [joblib.dump(obj, f'{processed_dir}/{stem}.pkl') for stem, obj in artifacts.items()]

# Display the result of the final dump as the cell output.
written[-1]


['/content/drive/MyDrive/Capstone1/processed/y_train_smote.pkl']