In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer

#### Setting working directory (TODO: remove before sharing)

In [None]:
import os

# Folder that the relative read_csv paths below ("transaction.csv",
# "identity.csv") are resolved against. Set the FRAUD_DATA_DIR environment
# variable to point somewhere else; the default is the machine-specific
# location this notebook was written on and will not exist elsewhere.
DATA_DIR = os.environ.get("FRAUD_DATA_DIR", "C:/Users/MelodyPogula/Downloads")
os.chdir(DATA_DIR)

## Read in Data

### Transaction

In [None]:
# Raw transaction table as read from disk; `transaction_data` is a separate copy to work on.
transaction = pd.read_csv("transaction.csv")
transaction_data = transaction.copy(deep=True)

# Split the label column from the remaining columns.
transaction_y = transaction_data["isFraud"]
transaction_x = transaction_data.drop(columns="isFraud")

### Identity

In [None]:
# Raw identity table as read from disk; `identity_data` is a separate copy to work on.
identity = pd.read_csv("identity.csv")
identity_data = identity.copy(deep=True)

# Split the label column from the remaining columns.
identity_y = identity_data["isFraud"]
identity_x = identity_data.drop(columns="isFraud")

## Data for Random Forest 

In [None]:
# Independent working copies for the Random Forest preprocessing.
transaction_rf = transaction_data.copy(deep=True)
identity_rf = identity_data.copy(deep=True)

### Handle Missing Data

In [None]:
def handle_missing_columns(df, missing_threshold=0.2, drop_threshold=0.9, exclude_cols=None):
    """
    Add missingness indicators, drop mostly-empty columns, and mean-impute numerics.

    For every column that is not listed in ``exclude_cols``:
      * NA fraction >= ``missing_threshold``: a 0/1 column named ``<col>_missing``
        is appended (1 where the original value was NA).
      * NA fraction >= ``drop_threshold``: the original column is dropped; its
        indicator, if one was created, is kept.
      * numeric and kept: remaining NAs are replaced by the column mean.

    Columns in ``exclude_cols`` are passed through untouched (no indicator,
    no drop, no imputation).

    Parameters:
        df: DataFrame (not modified; a copy is processed)
        missing_threshold: create indicators for columns with NA >= this fraction
        drop_threshold: drop original columns if NA >= this fraction
        exclude_cols: column name, or list of column names, not to modify
            (e.g., target, ID)

    Returns:
        df: cleaned DataFrame; kept original columns in their original order,
            followed by the indicator columns
        dropped_cols: list of original columns that were dropped
    """
    df = df.copy()  # protect original

    if exclude_cols is None:
        excluded = set()
    elif isinstance(exclude_cols, str):
        # A bare string means one column, not a collection of characters.
        excluded = {exclude_cols}
    else:
        excluded = set(exclude_cols)

    cols = [c for c in df.columns if c not in excluded]

    # NA fraction of every candidate column, computed in one pass.
    na_frac = df[cols].isna().mean()
    flagged_cols = na_frac.index[na_frac >= missing_threshold].tolist()
    dropped_cols = na_frac.index[na_frac >= drop_threshold].tolist()

    # Build all indicators at once (before imputation, while the NAs still exist).
    # Appending them with a single concat avoids inserting columns one by one,
    # which fragments wide frames.
    indicators = df[flagged_cols].isna().astype(int)
    indicators.columns = [f"{c}_missing" for c in flagged_cols]

    df = df.drop(columns=dropped_cols)
    # A pre-existing column that carries an indicator's name is superseded by it.
    df = df.drop(columns=[c for c in indicators.columns if c in df.columns])

    # Mean-impute the numeric columns that were kept. Excluded columns are left
    # alone, and indicators are not part of `df` yet, so no suffix check is needed.
    numeric_cols = [
        c for c in df.select_dtypes(include="number").columns if c not in excluded
    ]
    to_fill = [c for c in numeric_cols if df[c].isna().any()]
    if to_fill:
        df[to_fill] = df[to_fill].fillna(df[to_fill].mean())

    df = pd.concat([df, indicators], axis=1)

    return df, dropped_cols

In [None]:
exclude = ["TransactionID", "isFraud"]

# Keep the transaction drop list under its own name: `dropped_cols` is assigned
# again when the identity table is cleaned, which would otherwise discard it.
train_transaction_clean, dropped_transaction_cols = handle_missing_columns(
    transaction_rf, missing_threshold=0.2, drop_threshold=0.9, exclude_cols=exclude
)
dropped_cols = dropped_transaction_cols  # original name, kept for existing references

In [None]:
exclude = ["TransactionID", "isFraud"]

# Store the identity drop list under its own name so it cannot be confused with
# the list produced when the transaction table was cleaned with the same call.
train_identity_clean, dropped_identity_cols = handle_missing_columns(
    identity_rf, missing_threshold=0.2, drop_threshold=0.9, exclude_cols=exclude
)
dropped_cols = dropped_identity_cols  # original name, kept for existing references

### Encoding

In [None]:
# Split the cleaned transaction columns by dtype: object columns vs everything else.
cat_cols = train_transaction_clean.select_dtypes(include="object").columns
num_cols = train_transaction_clean.select_dtypes(exclude="object").columns
# One-hot encode the object columns, dropping the first level of each.
rf_transaction_df = pd.get_dummies(
    train_transaction_clean,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Split the cleaned identity columns by dtype: object columns vs everything else.
cat_cols = train_identity_clean.select_dtypes(include="object").columns
num_cols = train_identity_clean.select_dtypes(exclude="object").columns
# One-hot encode the object columns, dropping the first level of each.
rf_identity_df = pd.get_dummies(
    train_identity_clean,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Only the last bare expression of a cell is rendered, so show both frames
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(rf_transaction_df, rf_identity_df)

## Data for XGBoost

In [None]:
# Independent working copies for the XGBoost preprocessing.
transaction_xgb = transaction_data.copy(deep=True)
identity_xgb = identity_data.copy(deep=True)

In [None]:
# Split the transaction columns by dtype: object columns vs everything else.
cat_cols = transaction_xgb.select_dtypes(include="object").columns
num_cols = transaction_xgb.select_dtypes(exclude="object").columns
# One-hot encode the object columns, dropping the first level of each.
xgb_transaction_df = pd.get_dummies(
    transaction_xgb,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Split the identity columns by dtype: object columns vs everything else.
cat_cols = identity_xgb.select_dtypes(include="object").columns
num_cols = identity_xgb.select_dtypes(exclude="object").columns
# One-hot encode the object columns, dropping the first level of each.
xgb_identity_df = pd.get_dummies(
    identity_xgb,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Only the last bare expression of a cell is rendered, so show both frames
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(xgb_transaction_df, xgb_identity_df)

## Data for CatBoost and LightGBM

In [None]:
# Independent working copies for the CatBoost / LightGBM preprocessing.
transaction_boost = transaction_data.copy(deep=True)
identity_boost = identity_data.copy(deep=True)

## EDA

### Class Imbalance

In [None]:
# Proportion of each isFraud class in the transaction table, drawn as a bar chart.
fraud_counts = transaction_y.value_counts(normalize=True)
sns.barplot(x=fraud_counts.index, y=fraud_counts.values).set(
    title="Transaction Distribution: Fraud vs Legit",
    xlabel="isFraud",
    ylabel="Proportion",
)
plt.show()

In [None]:
# Proportion of each isFraud class in the identity table, drawn as a bar chart.
fraud_counts = identity_y.value_counts(normalize=True)
sns.barplot(x=fraud_counts.index, y=fraud_counts.values).set(
    title="Identity Distribution: Fraud vs Legit",
    xlabel="isFraud",
    ylabel="Proportion",
)
plt.show()

### Column Analysis

In [None]:
def analyze_unknown_columns(df):
    """
    Summarise every column of ``df`` (column analysis function translated from R).

    For each column the summary holds its dtype, the percentage of NA values,
    the number of distinct non-NA values, min/max (numeric columns only) and
    up to five distinct non-NA sample values. The summary is printed in full
    and also returned.

    Parameters:
        df: DataFrame to describe

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        'column', 'type', 'na_percent', 'unique_values', 'min_value',
        'max_value', 'sample_values' (present even when ``df`` has no columns).
    """
    result_columns = [
        'column', 'type', 'na_percent', 'unique_values',
        'min_value', 'max_value', 'sample_values',
    ]
    analysis_data = []

    for col in df.columns:
        col_data = df[col]

        # Basic info
        col_type = col_data.dtype
        na_percent = round(col_data.isna().mean() * 100, 2)
        unique_values = col_data.nunique()

        # Min/Max for numeric columns. The pandas dtype helpers also accept
        # extension dtypes (category, nullable integers, ...). Booleans are
        # reported as non-numeric.
        is_numeric = (
            pd.api.types.is_numeric_dtype(col_type)
            and not pd.api.types.is_bool_dtype(col_type)
        )
        if is_numeric:
            if col_data.notna().any():
                min_val = round(col_data.min(), 2)
                max_val = round(col_data.max(), 2)
            else:
                min_val = "All NA"
                max_val = "All NA"
        else:
            min_val = "N/A"
            max_val = "N/A"

        # Sample values (first 5 non-null unique values)
        sample_vals = col_data.dropna().unique()[:5]
        sample_str = ", ".join(str(x) for x in sample_vals)

        analysis_data.append({
            'column': col,
            'type': col_type,
            'na_percent': na_percent,
            'unique_values': unique_values,
            'min_value': min_val,
            'max_value': max_val,
            'sample_values': sample_str
        })

    # Create analysis DataFrame; explicit columns keep the schema for empty input
    column_analysis = pd.DataFrame(analysis_data, columns=result_columns)

    print("=== AUTOMATED COLUMN ANALYSIS ===")
    # Lift the row limit only for this print; the caller's setting is restored
    # afterwards, even if printing raises.
    with pd.option_context('display.max_rows', None):
        print(column_analysis)

    return column_analysis

#### Transaction

In [None]:
# Keep the transaction summary under its own name: `column_info` is assigned
# again when the identity table is analysed, which would otherwise discard it.
transaction_column_info = column_info = analyze_unknown_columns(transaction)

#### Identity

In [None]:
# Store the identity summary under its own name so it cannot be confused with
# the transaction summary produced by the same call.
identity_column_info = column_info = analyze_unknown_columns(identity)