In [1]:
import pandas as pd

In [2]:
# Location of the engineered per-wallet feature table.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this drive layout exists. Consider a project-relative, configurable location.
WALLET_FEATURES_CSV = "D:/portfolio_repo/arb_airdrop/data/processed/wallet_features.csv"

# Plain string constant: the path contains no placeholders, so no f-string is needed.
df = pd.read_csv(WALLET_FEATURES_CSV)

# Quick sanity checks: table dimensions and class balance of the target column.
print("Shape:", df.shape)
print("Churn Label Distribution:\n", df['churn_label'].value_counts())

# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

Shape: (100000, 12)
Churn Label Distribution:
 churn_label
1    60737
0    39263
Name: count, dtype: int64


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,wallet,cumulative_balance_day_30,wallet_address,claimed_amount,balance_retention_ratio,num_transactions_30_days,total_inflow_30_days,total_outflow_30_days,days_active,churn_label
0,0,0,0x00000000009a41862f3b2b0c688b7c0d1940511e,0.0,0x00000000009a41862f3b2b0c688b7c0d1940511e,4250,0.0,40,4250.0,4250.0,1,1
1,1,1,0x0000000000dfd67ffd6c24251348f7c4f933cab4,0.0,0x0000000000dfd67ffd6c24251348f7c4f933cab4,1750,0.0,2,1750.0,1750.0,1,1
2,2,2,0x0000000000e189dd664b9ab08a33c4839953852c,0.0,0x0000000000e189dd664b9ab08a33c4839953852c,2250,0.0,2,2250.0,2250.0,1,1
3,3,3,0x000000000279ef217428b1c3906ec8124784b70f,0.0,0x000000000279ef217428b1c3906ec8124784b70f,3250,0.0,2,3250.0,3250.0,1,1
4,4,4,0x0000000009572a244a6c2d06ffe7be30e3bd2aec,10.0,0x0000000009572a244a6c2d06ffe7be30e3bd2aec,625,0.016,3,635.0,625.0,2,0


Build features and target

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Name of the binary column the models are trained to predict.
target = 'churn_label'

# Model inputs, in the column order used for every downstream matrix.
# NOTE(review): the XGBoost run further down reports ~1.00 on every metric,
# which usually means the label can be reconstructed from the inputs. In the
# preview row for wallet 0x...2aec, total_inflow_30_days - total_outflow_30_days
# equals cumulative_balance_day_30, and the only row with a non-zero
# balance_retention_ratio is also the only one with churn_label == 0.
# Presumably churn_label is derived from these balance columns -- confirm how
# the label was built before trusting any score produced from this feature set.
features = [
    'claimed_amount',
    'cumulative_balance_day_30',
    'num_transactions_30_days',
    'total_inflow_30_days',
    'total_outflow_30_days',
    'days_active',
    'balance_retention_ratio',
]

# Design matrix (DataFrame) and label vector (Series) selected from the loaded table.
X = df.loc[:, features]
y = df.loc[:, target]

In [4]:
# Hold-out configuration: 20% of the rows go to the test set, and the seed is
# fixed so the same split is produced on every run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# Stratify on the label so train and test keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Standardize the features so they are on a comparable scale for the
# logistic-regression model. The scaler is fitted on the training rows only and
# then applied unchanged to the test rows, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Baseline linear classifier with default settings, trained on the
# standardized training features (fit returns the estimator itself).
lr = LogisticRegression().fit(X_train_scaled, y_train)

# Hard class predictions plus the predicted probability of label 1
# (second column of predict_proba) for the threshold-free ROC AUC.
y_pred_lr = lr.predict(X_test_scaled)
churn_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]
roc_auc_lr = roc_auc_score(y_test, churn_proba_lr)

# Report per-class precision/recall/F1 followed by the ROC AUC on the test set.
print("\n--- Logistic Regression ---")
print(classification_report(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_lr)


--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.88      0.60      0.71      7853
           1       0.78      0.95      0.86     12147

    accuracy                           0.81     20000
   macro avg       0.83      0.77      0.78     20000
weighted avg       0.82      0.81      0.80     20000

ROC AUC: 0.8722553983451018


XGBoost

In [6]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier trained on the raw (unscaled) features.
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and emits a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Report per-class precision/recall/F1 followed by the ROC AUC on the test set.
# NOTE(review): scores of ~1.00 across the board on 20,000 held-out rows are a
# strong hint that the inputs encode the label (target leakage) rather than
# evidence of a genuinely perfect model -- verify how churn_label was derived
# before reporting these numbers.
print("\n--- XGBoost ---")
print(classification_report(y_test, y_pred_xgb))
print("ROC AUC:", roc_auc_score(y_test, xgb.predict_proba(X_test)[:,1]))


--- XGBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7853
           1       1.00      1.00      1.00     12147

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000

ROC AUC: 0.9999916238942768


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
