In [15]:
import sys

# Make the parent directory importable so the `src.*` modules imported below resolve.
# The relative entry is resolved against the kernel's working directory, which is
# assumed to be this notebook's folder — TODO confirm.
# Guarded so that re-running the cell does not pile up duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [16]:
# 1. Import libraries
import pandas as pd
import logging

from src.preprocessing import (
    handle_missing_values, clean_data_types, remove_duplicates, save_cleaned_data
)

from src.feature_engineering import (
    add_time_features, add_frequency_features, merge_ip_country, save_feature_engineered_data
)

from src.transformers import (
    get_preprocessor, apply_balancing
)

# Surface INFO-and-above log records in the cell output for the steps that follow.
logging.basicConfig(level="INFO")


In [17]:
# Setup logging
# NOTE(review): the imports cell already issues the same basicConfig call, and
# basicConfig does nothing once the root logger has handlers, so this cell is
# redundant when the notebook is run top to bottom and could be dropped.
logging.basicConfig(level="INFO")


In [18]:
# 2. Load raw data
# Paths are relative to the kernel's working directory.
FRAUD_CSV = "../data/raw/Fraud_Data.csv"
IP_COUNTRY_CSV = "../data/raw/IpAddress_to_Country.csv"
CREDIT_CSV = "../data/raw/creditcard.csv"

fraud_df = pd.read_csv(FRAUD_CSV)        # transaction records; label column is `class`
ip_df = pd.read_csv(IP_COUNTRY_CSV)      # IP range bounds -> country lookup
credit_df = pd.read_csv(CREDIT_CSV)      # credit card records; label column is `Class`


In [19]:
# Inspect datatypes of the two labelled datasets, then peek at the IP lookup table.
for heading, frame in (
    ("Fraud Dataset Types:\n", fraud_df),
    ("\nCredit Dataset Types:\n", credit_df),
):
    print(heading, frame.dtypes)

print("\nIP Dataset Sample:\n", ip_df.head())

Fraud Dataset Types:
 user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

Credit Dataset Types:
 Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

IP Dataset Sample:
    lower_bound_ip_address  upper_bound_ip_address    country


In [20]:
# Clean both raw frames with the helpers imported from src.preprocessing.
# Fraud data: handle missing values -> parse the two timestamp columns -> drop duplicates.
fraud_df = (
    fraud_df
    .pipe(handle_missing_values)
    .pipe(clean_data_types, datetime_cols=["signup_time", "purchase_time"])
    .pipe(remove_duplicates)
)

# Credit card data: handle missing values -> drop duplicates (no dtype conversion step).
credit_df = credit_df.pipe(handle_missing_values).pipe(remove_duplicates)



INFO:root:Initial DataFrame shape: (151112, 11)
INFO:root:No missing values detected.
INFO:root:After dropping columns with >50.0% missing values: (151112, 11)
INFO:root:No missing values detected.
INFO:root:Converted column 'signup_time' to datetime.
INFO:root:Converted column 'purchase_time' to datetime.
INFO:root:Removed 0 duplicate rows.
INFO:root:Initial DataFrame shape: (284807, 31)
INFO:root:No missing values detected.
INFO:root:After dropping columns with >50.0% missing values: (284807, 31)
INFO:root:No missing values detected.
INFO:root:Removed 1081 duplicate rows.


In [21]:
# Persist each cleaned frame under its own parquet filename.
for cleaned_frame, parquet_name in (
    (fraud_df, "fraud_data_cleaned.parquet"),
    (credit_df, "creditcard_data_cleaned.parquet"),
):
    save_cleaned_data(cleaned_frame, filename=parquet_name)


INFO:root:Saved cleaned data to ../data/processed/fraud_data_cleaned.parquet
INFO:root:Saved cleaned data to ../data/processed/creditcard_data_cleaned.parquet


In [22]:
# Apply basic feature engineering to the fraud frame, in order:
# time-based features, per-user/per-device frequency counts, then the IP -> country merge.
fraud_df = (
    fraud_df
    .pipe(add_time_features)
    .pipe(add_frequency_features)
    .pipe(merge_ip_country, ip_df)
)

INFO:root:Added time_since_signup, hour_of_day, and day_of_week features.


INFO:root:Added user_transaction_count feature.
INFO:root:Added device_transaction_count feature.


In [23]:
# Save the feature-engineered fraud frame.
features_parquet_name = "fraud_data_basic_features.parquet"
save_feature_engineered_data(fraud_df, filename=features_parquet_name)

INFO:root:Saved feature-engineered data to ../data/processed/fraud_data_basic_features.parquet


In [24]:
# Prepare the column groups handed to the preprocessor.
numeric_cols = (
    ["purchase_value", "age"]                                   # present in the raw fraud data
    + ["time_since_signup", "hour_of_day", "day_of_week"]       # added by add_time_features
    + ["user_transaction_count", "device_transaction_count"]    # added by add_frequency_features
)
categorical_cols = [
    "source",
    "browser",
    "sex",
    "country",
]


In [25]:
# Get preprocessor
# Builds the transformer for the fraud features from the two column lists
# (numeric first, categorical second). What is applied to each group is decided
# inside src.transformers.get_preprocessor and is not visible from this notebook.
# NOTE(review): presumably scaling for numeric and one-hot encoding for categorical
# columns — confirm against the helper.
preprocessor = get_preprocessor(numeric_cols, categorical_cols)


In [26]:
# Apply preprocessing & balancing to the fraud dataset.
feature_cols = [*numeric_cols, *categorical_cols]
X_fraud, y_fraud = fraud_df[feature_cols], fraud_df["class"]

# NOTE(review): the preprocessor is fit and the classes are rebalanced on the full
# frame. If a held-out split is made downstream, confirm that fitting and
# resampling are redone on the training portion only.
X_fraud_encoded = preprocessor.fit_transform(X_fraud)

# NOTE(review): no seed is passed here; whether apply_balancing fixes one
# internally is not visible — confirm if reproducible resampling is required.
X_fraud_balanced, y_fraud_balanced = apply_balancing(
    X_fraud_encoded,
    y_fraud,
    strategy="smote",
)


INFO:root:Original class distribution: {0: 136961, 1: 14151}
INFO:root:Balanced class distribution: {0: 136961, 1: 136961}


In [None]:
# Credit card dataset transformation
# Every column except the `Class` label is treated as a feature.
y_credit = credit_df["Class"]
X_credit = credit_df.drop(columns="Class")

# Normalize credit card data: all feature columns go in the numeric group,
# and the categorical group is left empty.
credit_feature_names = list(X_credit.columns)
credit_scaler = get_preprocessor(credit_feature_names, [])
X_credit_scaled = credit_scaler.fit_transform(X_credit)

# NOTE(review): no seed is passed here; whether apply_balancing fixes one
# internally is not visible — confirm if reproducible resampling is required.
X_credit_balanced, y_credit_balanced = apply_balancing(
    X_credit_scaled,
    y_credit,
    strategy="undersample",
)

INFO:root:Original class distribution: {0: 283253, 1: 473}
INFO:root:Balanced class distribution: {0: 473, 1: 473}


In [28]:
# Summary Report
def _balanced_summary(label, features, target):
    """Return the one-line report for a balanced dataset.

    label    -- dataset name placed at the start of the line
    features -- balanced feature matrix; only its ``.shape`` is read
    target   -- balanced labels; summed to give the fraud count
    """
    return f"{label} (balanced): {features.shape}, Fraud Count: {sum(target)}"


print(_balanced_summary("Fraud Data", X_fraud_balanced, y_fraud_balanced))
print(_balanced_summary("Credit Card Data", X_credit_balanced, y_credit_balanced))

Fraud Data (balanced): (273922, 199), Fraud Count: 136961
Credit Card Data (balanced): (946, 30), Fraud Count: 473
