In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
)
import lightgbm as lgb
import xgboost as xgb
import time

# Seed for NumPy's global RNG; the same value is reused as `random_state` further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ---------- 1. Load dataset ----------
# Location of the raw CSV (machine-specific absolute path; adjust if different).
path = 'E:/Projects/CBK_Assignment/Greenland_Registry/ethereum_raw.csv'
df = pd.read_csv(path)

# Report what was read, then peek at the first two records as a sanity check.
n_rows, n_columns = df.shape
print("Loaded dataset:", path)
print("Rows:", n_rows, "Columns:", n_columns)
display(df.head(2))

# ---------- 2. Identify target & basic cleanup ----------
# Strip whitespace from column names FIRST, so every later lookup (target,
# identifiers) uses the exact names that are present on the frame.
# If the target is searched on stripped names while the frame still carries
# the raw headers, a header such as ' FLAG' is "found" but the later
# df_base[target_col] lookup raises KeyError once the headers are stripped.
df.columns = [c.strip() for c in df.columns]

# Find the target column (FLAG or one of its accepted aliases, case-insensitive)
target_candidates = [c for c in df.columns if c.lower() in ('flag', 'flg', 'label', 'isfraud')]
if not target_candidates:
    raise ValueError("No 'FLAG' column found. Rename your target column to 'FLAG' or 'flag'.")
target_col = target_candidates[0]   # first match wins when several aliases exist
print("Using target column:", target_col)

# Address-like identifier columns: remembered here, removed from the features later on
id_cols = [c for c in df.columns if c.lower() in ('address','addr','wallet','account')]

# Work on a copy so the frame that was loaded stays untouched by the feature preparation
df_base = df.copy()

# ---------- 3. Prepare features: numeric + simple encodings ----------
# Object-dtype columns that mostly hold numbers written as text (e.g. "1,234")
# are replaced by their numeric version; all other columns are left alone.
text_like_cols = [name for name in df_base.columns if df_base[name].dtype == object]
for name in text_like_cols:
    # Drop thousands separators and surrounding blanks before parsing
    as_text = df_base[name].astype(str)
    tidied = as_text.str.replace(',', '').str.strip()
    as_number = pd.to_numeric(tidied, errors='coerce')   # unparseable -> NaN
    # Share of rows that parsed, measured against the full column length
    parsed_share = as_number.notna().sum() / max(1, len(as_number))
    # Keep the numeric version only when more than 60% of the rows parsed
    if parsed_share > 0.6:
        df_base[name] = as_number

# Split the frame into the integer target vector and the feature matrix
y = df_base[target_col].astype(int)
X = df_base.drop(columns=[target_col])

# Remove identifier columns from the features (they can be re-attached later if needed)
present_id_cols = [c for c in id_cols if c in X.columns]
X = X.drop(columns=present_id_cols)

# Remove row-index artifacts: any column whose name contains "unnamed" or "index"
# (substring match, case-insensitive)
leak_cols = [
    c for c in X.columns
    if any(marker in c.lower() for marker in ("unnamed", "index"))
]
print("Dropping potential leakage columns:", leak_cols)
X = X.drop(columns=leak_cols, errors='ignore')


# Partition the features by dtype: numeric columns vs. everything else
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X.columns if c not in numeric_cols]

print("Numeric cols:", len(numeric_cols), "Categorical cols:", len(cat_cols))

# Frequency (count) encoding: each category is replaced by how often it occurs.
# NOTE(review): the counts are taken over every row of X, i.e. before any
# train/test split - confirm this is acceptable for the evaluation protocol.
for col in cat_cols:
    # Missing values become the explicit category 'NA' so they get their own count
    as_text = X[col].fillna('NA').astype(str)
    counts = as_text.value_counts(dropna=False)
    X[col] = as_text
    X[col + '_freqenc'] = as_text.map(counts).fillna(0).astype(float)

# Only the encoded versions are kept
X = X.drop(columns=cat_cols)

# The numeric column list now also includes the new *_freqenc columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Simple imputation: remaining numeric NaNs become 0.0
X[numeric_cols] = X[numeric_cols].fillna(0.0)

# ---------- 4. Train / Test split ----------
# Split BEFORE fitting the scaler: fitting RobustScaler on all rows lets the
# test rows influence the centering/scaling statistics (data leakage).
# Same test_size / stratify / random_state as before, applied to the unscaled features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X[numeric_cols], y, test_size=0.20, stratify=y, random_state=RANDOM_STATE
)

# ---------- 5. Scaling (statistics learned from the training rows only) ----------
scaler = RobustScaler()
scaler.fit(X_train_raw)

X_train = pd.DataFrame(scaler.transform(X_train_raw),
                       columns=numeric_cols, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw),
                      columns=numeric_cols, index=X_test_raw.index)

# Full scaled matrix (all rows, train-fitted scaler); it is still needed for the
# column-name sanitization and the parquet export further down.
X_scaled = pd.DataFrame(scaler.transform(X[numeric_cols]),
                        columns=numeric_cols, index=X.index)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
# ====== Insert this BEFORE model training to sanitize column names for LightGBM ======
import re

def sanitize_name(name):
    """Return ``name`` reduced to ASCII letters, digits and single underscores.

    Every character outside ``[0-9a-zA-Z]`` becomes an underscore, runs of
    underscores collapse to one, and leading/trailing underscores are removed.
    When nothing is left, the placeholder ``'col'`` is returned.
    """
    underscored = re.sub(r'[^0-9a-zA-Z]', '_', str(name))
    collapsed = re.sub(r'__+', '_', underscored)
    trimmed = collapsed.strip('_')
    return trimmed if trimmed != '' else 'col'

# Detect problematic feature names (anything other than letters, digits, underscore)
special_chars_pattern = re.compile(r'[^0-9a-zA-Z_]')

orig_cols = X_scaled.columns.tolist()
bad_cols = [c for c in orig_cols if special_chars_pattern.search(str(c))]

print(f"Found {len(bad_cols)} feature(s) with special characters (examples):")
print(bad_cols[:30])

# Build the old -> new mapping. Uniqueness is checked against every name that
# has already been handed out, so a generated suffix ("x_1") can never collide
# with another column whose own sanitized name is also "x_1".
mapping = {}
new_cols = []
used_names = set()
for c in orig_cols:
    base = sanitize_name(c)
    new = base
    suffix = 0
    while new in used_names:
        suffix += 1
        new = f"{base}_{suffix}"
    used_names.add(new)
    mapping[c] = new
    new_cols.append(new)

assert len(set(new_cols)) == len(new_cols), "sanitized column names must be unique"

# Apply the new names to the full matrix and to both splits.
# No blanket try/except here: if a rename fails, the models further down would
# be trained on unsanitized names, so the error should surface right away.
X_scaled.columns = new_cols
for split_frame in (X_train, X_test):
    split_frame.columns = [mapping.get(c, c) for c in split_frame.columns]

# Show sample of mapping
print("\nSample column renames (old -> new):")
for old, new in list(mapping.items())[:25]:
    print(f"{old!r}  -->  {new!r}")

# Optional: save mapping to disk for future traceability
# import json
# with open('/mnt/data/colname_mapping.json', 'w') as f:
#     json.dump(mapping, f, indent=2)
# print("\nSaved column mapping to /mnt/data/colname_mapping.json")
# ====== End sanitization cell ======

# Utility: evaluate a fitted binary classifier on held-out data
def eval_model(name, model, X_test, y_test, threshold=0.5):
    """Score a fitted binary classifier on a test set and return its metrics.

    Positive-class scores are taken from ``predict_proba`` when the model has
    it, otherwise from ``decision_function`` (mapped to (0, 1) with a logistic
    transform), otherwise from the hard labels returned by ``predict``.

    Parameters
    ----------
    name : str
        Label stored under the ``'model'`` key of the result.
    model : object
        Fitted estimator exposing at least one of ``predict_proba``,
        ``decision_function`` or ``predict``.
    X_test : array-like
        Test features.
    y_test : array-like
        True labels, expected to be 0/1.
    threshold : float, default 0.5
        A sample is predicted positive when its score is >= ``threshold``.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``log_loss`` and ``time_sec``. ``roc_auc`` and
        ``log_loss`` are NaN when the scores are only hard 0/1 values or fall
        outside [0, 1]; ``roc_auc`` is also NaN when ``y_test`` holds a single
        class. ``time_sec`` covers scoring plus metric computation only
        (training is not included).
    """
    t0 = time.time()
    try:
        if hasattr(model, "predict_proba"):
            pred_proba = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            # Logistic transform keeps the ranking and maps a margin of 0 to 0.5,
            # so the default threshold coincides with the sign of the margin.
            # (Min-max scaling would place 0.5 at an arbitrary margin value.)
            margins = np.asarray(model.decision_function(X_test), dtype=float)
            pred_proba = 1.0 / (1.0 + np.exp(-np.clip(margins, -500.0, 500.0)))
        else:
            # Hard labels only: threshold metrics work, probability metrics do not
            pred_proba = np.asarray(model.predict(X_test), dtype=float)
    except Exception as err:
        # Best effort: report the problem instead of hiding it, then fall back
        print(f"[{name}] scoring failed ({err!r}); falling back to hard predictions.")
        try:
            pred_proba = np.asarray(model.predict(X_test), dtype=float)
        except Exception as predict_err:
            print(f"[{name}] predict() failed as well ({predict_err!r}); scoring every row as 0.")
            pred_proba = np.zeros(len(X_test), dtype=float)

    pred_proba = np.asarray(pred_proba, dtype=float).ravel()

    # Build predicted classes
    y_pred = (pred_proba >= threshold).astype(int)

    # Probability metrics need scores inside [0, 1] that are not just hard labels.
    # The "not only 0/1" test must AND the two comparisons: OR-ing them is true
    # for every non-empty array, because any value differs from 0 or from 1.
    within_unit_interval = (
        pred_proba.size > 0 and pred_proba.min() >= 0.0 and pred_proba.max() <= 1.0
    )
    has_graded_scores = bool(((pred_proba != 0) & (pred_proba != 1)).any())
    can_do_proba_metrics = within_unit_interval and has_graded_scores
    both_classes_present = len(np.unique(y_test)) > 1

    res = {
        'model': name,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'roc_auc': roc_auc_score(y_test, pred_proba) if can_do_proba_metrics and both_classes_present else np.nan,
        # labels=[0, 1] keeps log_loss defined even if y_test contains one class only
        'log_loss': log_loss(y_test, pred_proba, labels=[0, 1]) if can_do_proba_metrics else np.nan,
        'time_sec': round(time.time() - t0, 3)
    }
    return res

# ---------- 6. Train baseline models ----------
results = []

# 6.1 Logistic Regression (simple)
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)

# 6.2 Random Forest
rf = RandomForestClassifier(n_estimators=200, max_depth=12, class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1)

# 6.3 XGBoost
# `use_label_encoder` is no longer passed: the option is deprecated in current
# XGBoost releases and only produces a warning there.
# NOTE(review): XGBoost is the only model here trained without class
# re-weighting; consider `scale_pos_weight` for a like-for-like comparison.
xgb_model = xgb.XGBClassifier(
    n_estimators=300, max_depth=6, learning_rate=0.1,
    eval_metric='logloss', random_state=RANDOM_STATE, n_jobs=-1
)

# 6.4 LightGBM
lgb_model = lgb.LGBMClassifier(
    n_estimators=500, max_depth=-1, learning_rate=0.05,
    class_weight='balanced', random_state=RANDOM_STATE, n_jobs=-1
)

# One fit/evaluate loop instead of four copy-pasted stanzas.
# Each entry: (name stored in the results table, progress message, estimator).
baseline_models = [
    ('LogisticRegression', "\nTraining Logistic Regression...", lr),
    ('RandomForest', "Training Random Forest...", rf),
    ('XGBoost', "Training XGBoost...", xgb_model),
    ('LightGBM', "Training LightGBM...", lgb_model),
]
for model_name, progress_msg, estimator in baseline_models:
    print(progress_msg)
    estimator.fit(X_train, y_train)
    results.append(eval_model(model_name, estimator, X_test, y_test))

# ---------- 7. Results summary ----------
# One row per model, best ROC-AUC first
res_df = pd.DataFrame(results).set_index('model')
display(res_df.sort_values('roc_auc', ascending=False))

# ---------- 8. Feature importance (top 20) ----------
def show_importances(model, model_name, cols, top=20):
    """Display the ``top`` largest feature scores of a fitted model.

    Uses ``model.feature_importances_`` when present, otherwise the absolute
    values of the flattened ``model.coef_``. Scores are labelled with ``cols``
    and sorted in descending order. When the model has neither attribute a
    message is printed instead; any error is caught and printed.
    Returns None.
    """
    print(f"\nTop {top} feature importances for {model_name}:")
    try:
        if hasattr(model, 'feature_importances_'):
            scores = model.feature_importances_
        elif hasattr(model, 'coef_'):
            # Magnitude only: the sign of a coefficient is not shown
            scores = np.abs(model.coef_).ravel()
        else:
            print("No importances available for", model_name)
            return
        ranked = pd.Series(scores, index=cols).sort_values(ascending=False)
        display(ranked.head(top))
    except Exception as err:
        print("Error computing importances:", err)

# Feature names in training order, used to label the importance scores
cols = X_train.columns.tolist()
for fitted_model, label in (
    (rf, "RandomForest"),
    (xgb_model, "XGBoost"),
    (lgb_model, "LightGBM"),
):
    show_importances(fitted_model, label, cols, top=20)

# ---------- 9. Save baseline processed features for later experiments ----------
# Writes the full scaled feature matrix (without its index) via fastparquet
X_scaled.to_parquet(
    'E:/Projects/CBK_Assignment/Greenland_Registry/ethereum_baseline_features.parquet',
    index=False,
    engine="fastparquet",
)
print("\nSaved processed (baseline) features to E:/Projects/CBK_Assignment/Greenland_Registry/ethereum_baseline_features.parquet")
# ===== end corrected cell =====


Loaded dataset: E:/Projects/CBK_Assignment/Greenland_Registry/ethereum_raw.csv
Rows: 9841 Columns: 51


Unnamed: 0.1,Unnamed: 0,Index,Address,FLAG,Avg min between sent tnx,Avg min between received tnx,Time Diff between first and last (Mins),Sent tnx,Received Tnx,Number of Created Contracts,...,ERC20 min val sent,ERC20 max val sent,ERC20 avg val sent,ERC20 min val sent contract,ERC20 max val sent contract,ERC20 avg val sent contract,ERC20 uniq sent token name,ERC20 uniq rec token name,ERC20 most sent token type,ERC20_most_rec_token_type
0,0,1,0x00009277775ac7d0d59eaad8fee3d10ac6c805e8,0,844.26,1093.71,704785.63,721,89,0,...,0.0,16831000.0,271779.92,0.0,0.0,0.0,39.0,57.0,Cofoundit,Numeraire
1,1,2,0x0002b44ddb1476db43c868bd494422ee4c136fed,0,12709.07,2958.44,1218216.73,94,8,0,...,2.260809,2.260809,2.260809,0.0,0.0,0.0,1.0,7.0,Livepeer Token,Livepeer Token


Using target column: FLAG
Dropping potential leakage columns: ['Unnamed: 0', 'Index']
Numeric cols: 45 Categorical cols: 2
Train shape: (7872, 47) Test shape: (1969, 47)
Found 46 feature(s) with special characters (examples):
['Avg min between sent tnx', 'Avg min between received tnx', 'Time Diff between first and last (Mins)', 'Sent tnx', 'Received Tnx', 'Number of Created Contracts', 'Unique Received From Addresses', 'Unique Sent To Addresses', 'min value received', 'max value received', 'avg val received', 'min val sent', 'max val sent', 'avg val sent', 'min value sent to contract', 'max val sent to contract', 'avg value sent to contract', 'total transactions (including tnx to create contract', 'total Ether sent', 'total ether received', 'total ether sent contracts', 'total ether balance', 'Total ERC20 tnxs', 'ERC20 total Ether received', 'ERC20 total ether sent', 'ERC20 total Ether sent contract', 'ERC20 uniq sent addr', 'ERC20 uniq rec addr', 'ERC20 uniq sent addr.1', 'ERC20 uniq 

Unnamed: 0_level_0,accuracy,precision,recall,f1,roc_auc,log_loss,time_sec
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LightGBM,0.994921,0.997664,0.979358,0.988426,0.999503,0.027278,0.039
XGBoost,0.995429,1.0,0.979358,0.989571,0.99927,0.019361,0.037
RandomForest,0.98578,0.992754,0.942661,0.967059,0.999078,0.048624,0.079
LogisticRegression,0.781107,0.504472,0.646789,0.566834,0.826189,0.708492,0.016



Top 20 feature importances for RandomForest:


ERC20_most_rec_token_type_freqenc                      0.187309
ERC20_most_sent_token_type_freqenc                     0.161940
Time_Diff_between_first_and_last_Mins                  0.088401
avg_val_received                                       0.045314
total_ether_received                                   0.041327
Avg_min_between_received_tnx                           0.037112
total_Ether_sent                                       0.035612
Unique_Received_From_Addresses                         0.031239
total_transactions_including_tnx_to_create_contract    0.030965
max_value_received                                     0.026823
Received_Tnx                                           0.026231
total_ether_balance                                    0.026068
Total_ERC20_tnxs                                       0.024123
ERC20_min_val_rec                                      0.024026
min_value_received                                     0.023301
Sent_tnx                                


Top 20 feature importances for XGBoost:


ERC20_most_rec_token_type_freqenc                      0.441601
Time_Diff_between_first_and_last_Mins                  0.188372
ERC20_uniq_sent_addr                                   0.061104
Total_ERC20_tnxs                                       0.045694
min_val_sent                                           0.040508
ERC20_most_sent_token_type_freqenc                     0.030855
ERC20_total_ether_sent                                 0.028539
ERC20_uniq_sent_token_name                             0.024006
Unique_Received_From_Addresses                         0.016412
total_transactions_including_tnx_to_create_contract    0.013936
Received_Tnx                                           0.011686
total_ether_received                                   0.008302
ERC20_min_val_rec                                      0.007874
avg_val_received                                       0.007772
Sent_tnx                                               0.007693
total_ether_balance                     


Top 20 feature importances for LightGBM:


ERC20_most_sent_token_type_freqenc                     1377
Time_Diff_between_first_and_last_Mins                   912
Unique_Received_From_Addresses                          765
ERC20_most_rec_token_type_freqenc                       646
Avg_min_between_received_tnx                            543
Total_ERC20_tnxs                                        521
total_transactions_including_tnx_to_create_contract     515
ERC20_total_ether_sent                                  497
total_ether_balance                                     419
avg_val_received                                        381
min_value_received                                      354
Sent_tnx                                                289
total_ether_received                                    277
max_value_received                                      274
ERC20_total_Ether_received                              263
ERC20_min_val_sent                                      254
Avg_min_between_sent_tnx                


Saved processed (baseline) features to E:/Projects/CBK_Assignment/Greenland_Registry/ethereum_baseline_features.parquet


In [27]:
# Class balance of the target: share of rows per label value
class_shares = y.value_counts(normalize=True)
print(class_shares)


FLAG
0    0.778579
1    0.221421
Name: proportion, dtype: float64


In [28]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of LightGBM's hard predictions on the test split
# (scikit-learn convention: rows = true class, columns = predicted class)
lgb_test_pred = lgb_model.predict(X_test)
print(confusion_matrix(y_test, lgb_test_pred))


[[1532    1]
 [   9  427]]
