In [50]:
import pandas as pd
import numpy as np
import xgboost as xgb
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from itertools import product
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, callback
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import BorderlineSMOTE

In [51]:
labels = ['Online', 'Channel', 'MCC_Group', 'Amount',
              'Type', 'Balance', 'Limit', 'Type_dst', 'Balance_dst',
              'Limit_dst', 'Age', 'Tenure', 'CreditScore', 'AnnualSalary',
              'Device_Count', 'Action_Count', 'Login_Count', 'Logout_Count',
              'Account_View_Count', 'Payment_Count', 'Transfer_Count',
              'Day_of_Week', 'Timestamp', 'in_home_city', 'prev_trans_in_city',
              'times_device_used', 'unique_devices_past', 'days_since_open',
              'days_since_open_dst', 'time_since_last_txn_hours',
              'xacts_earlier_today', 'avg_amount_past', 'amount_ratio',
              'amount_dev', 'mcc_group_prev_count']

df = pd.read_csv("final_train.csv") 

lag = pd.read_csv("LaggedTrain.csv")

In [52]:
train_set =pd.merge(df, lag, on="TxnID", how="left")

In [16]:
train_set

Unnamed: 0,TxnID,Online,CustomerID,SrcAccount,DstAccount,Channel,MCC_Group,Amount,FraudLabel,CustomerID_src,...,Lagged25HomeCity,Lagged25PayCt,Lagged25TransCt,Lagged100Amt,Lagged100ActCt,Lagged100HomeCity,Lagged100PayCt,Lagged100TransCt,Lagged50AmtDiff,Lagged50TransDiff
0,da3b1da1-2e6f-430b-ab02-b1678bbfccb8,0,C000005,A166e5515,,1,1,44.416179,0,C000005,...,,,,,,,,,,
1,19307e39-75f2-44ea-a1ce-6b6cc2f51f2e,0,C000005,A166e5515,,1,2,68.530604,0,C000005,...,,,,,,,,,,
2,aa2c3179-2917-4398-86ec-0f2e079f1128,0,C000005,A166e5515,,1,3,33.508761,0,C000005,...,,,,,,,,,,
3,dfd36246-ff9a-4458-98dc-df89aaa15cf2,1,C000005,A166e5515,,2,1,-33.583612,0,C000005,...,,,,,,,,,,
4,c4439812-4659-48fc-8b9b-2b0d2e6009d7,0,C000005,A166e5515,,0,3,122.470921,0,C000005,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152310,d5e6c023-54ab-4f31-ae8a-c3264b6a6302,0,C001000,Aeab65b63,,0,2,56.881983,0,C001000,...,0.20,0.08,0.48,29.039059,2.08,0.20,0.26,0.38,26.584421,-30.297562
152311,197a9dfb-e09d-4921-94b5-3928646baa63,0,C001000,Aeab65b63,,1,0,19.325107,0,C001000,...,0.16,0.04,0.36,29.662383,1.93,0.19,0.24,0.37,-10.144584,-29.469691
152312,8ee2858f-8d36-498d-a235-a4d31481be86,0,C001000,A5cd91130,,1,5,48.718987,0,C001000,...,0.16,0.04,0.36,30.588771,1.78,0.18,0.22,0.36,19.058957,-29.660030
152313,f96e1705-41a3-43cb-8bb7-e8e3e488451a,0,C001000,A5cd91130,,1,2,35.574378,0,C001000,...,0.16,0.04,0.36,31.499935,1.66,0.17,0.20,0.36,5.839397,-29.734981


In [53]:
# Selected features for our model
imp_labels = ['MCC_Group', 'Amount',
              'Type', 'Balance', 'Limit_dst', 'Age', 'Tenure', 'CreditScore', 'AnnualSalary',
              'Login_Count', 'Logout_Count',
              'Payment_Count', 'Transfer_Count',
              'in_home_city', 'prev_trans_in_city',
              'times_device_used', 'unique_devices_past',
              'days_since_open_dst', 'time_since_last_txn_hours',
              'xacts_earlier_today', 'avg_amount_past', 'mcc_group_prev_count','LaggedAmt', 'LaggedActCt', 'LaggedHomeCity', 'LaggedTransCt',
                'Lagged50Amt', 'Lagged50ActCt', 'Lagged50HomeCity', 'Lagged50PayCt', 'Lagged50TransCt','Lagged100ActCt', 'Lagged50AmtDiff']

# Read csv into X and Y dataframes
#df = pd.read_csv("final_train.csv")
X = train_set[imp_labels]
y = train_set[['FraudLabel']]

In [54]:
X = X.to_numpy()
y = y.to_numpy()

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state = 42)

In [45]:
# neg_count = np.sum(y_train == 0)
# pos_count = np.sum(y_train == 1)
# ratio = np.sqrt(neg_count/pos_count)

X_train = X_train.astype(float)
X_test = X_test.astype(float)

X_train = np.nan_to_num(X_train, nan=-1)
X_test = np.nan_to_num(X_test, nan=-1)

smote = BorderlineSMOTE(kind='borderline-1', random_state=42, k_neighbors=5)
X_res, y_res = smote.fit_resample(X_train, y_train)

clf = XGBClassifier(
    n_estimators=20000,                # many trees + early stopping
    learning_rate=0.03,               # small eta ‚Üí stabler generalization
    max_depth=4,                      # shallow trees reduce overfit
    min_child_weight=6,               # be conservative with splits
    subsample=0.7,                    # add randomness
    colsample_bytree=0.8,
    colsample_bylevel=0.8,          # sample cols at each level
    colsample_bynode=0.8,
    gamma=2.0,                        # require a gain to split
    reg_lambda=3.0,                   # L2
    reg_alpha=1.0,                    # L1 (helps sparsity)
    max_delta_step=2,                 # stabilizes updates for rare positives
    #scale_pos_weight=ratio,           # critical for 1:300 imbalance
    tree_method="hist",
    random_state=42,
    n_jobs=1,
    eval_metric="aucpr",
    early_stopping_rounds = 200 # best for heavy imbalance
)


#weights = np.where(y_train == 1, 70, 1)
clf.fit(
    X_res, y_res, 
    eval_set=[(X_test, y_test)], 
    verbose = False,
)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,0.8
,colsample_bynode,0.8
,colsample_bytree,0.8
,device,
,early_stopping_rounds,200
,enable_categorical,False


In [None]:
# neg_count = np.sum(y_train == 0)
# pos_count = np.sum(y_train == 1)
# ratio = np.sqrt(neg_count/pos_count)

X_train = X_train.astype(float)
X_test = X_test.astype(float)

X_train = np.nan_to_num(X_train, nan=-1)
X_test = np.nan_to_num(X_test, nan=-1)

smote = BorderlineSMOTE(kind='borderline-1', random_state=42, k_neighbors=5)
X_res, y_res = smote.fit_resample(X_train, y_train)

param_grid = {
    # Tree complexity
    'max_depth': [4, 5, 8],
    'min_child_weight': [1, 3, 5],

    # Sampling
    'subsample': [0.6, 0.8],
    'colsample_bytree': [0.6, 0.8],
    'colsample_bynode': [0.6, 0.8]
    'eta': [0.01, 0.03],
    'max_delta_step': [2, 3]
    'n_estimators': [500, 800, 1000],

    # Class imbalance
    # 'scale_pos_weight': [1, 5, 10, 20]  # skip if using SMOTE
}

# clf = XGBClassifier(
#     n_estimators=20000,                # many trees + early stopping
#     learning_rate=0.03,               # small eta ‚Üí stabler generalization
#     max_depth=4,                      # shallow trees reduce overfit
#     min_child_weight=6,               # be conservative with splits
#     subsample=0.7,                    # add randomness
#     colsample_bytree=0.8,
#     colsample_bylevel=0.8,          # sample cols at each level
#     colsample_bynode=0.8,
#     gamma=2.0,                        # require a gain to split
#     reg_lambda=3.0,                   # L2
#     reg_alpha=1.0,                    # L1 (helps sparsity)
#     max_delta_step=2,                 # stabilizes updates for rare positives
#     #scale_pos_weight=ratio,           # critical for 1:300 imbalance
#     tree_method="hist",
#     random_state=42,
#     n_jobs=1,
#     eval_metric="aucpr",
#     early_stopping_rounds = 200 # best for heavy imbalance
# )

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='aucpr',
    random_state=42,
    tree_method='hist'
)

grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    verbose=2,
    n_jobs=1
)

grid.fit(X_train_smote, y_train_smote)


#weights = np.where(y_train == 1, 70, 1)
clf.fit(
    X_res, y_res, 
    eval_set=[(X_test, y_test)], 
    verbose = False,
)

In [33]:
print(train_set[imp_labels].columns)

Index(['MCC_Group', 'Amount', 'Type', 'Balance', 'Limit_dst', 'Age', 'Tenure',
       'CreditScore', 'AnnualSalary', 'Login_Count', 'Logout_Count',
       'Payment_Count', 'Transfer_Count', 'in_home_city', 'prev_trans_in_city',
       'times_device_used', 'unique_devices_past', 'days_since_open_dst',
       'time_since_last_txn_hours', 'xacts_earlier_today', 'avg_amount_past',
       'mcc_group_prev_count', 'LaggedAmt', 'LaggedActCt', 'LaggedHomeCity',
       'LaggedPayCt', 'LaggedTransCt', 'Lagged50Amt', 'Lagged50ActCt',
       'Lagged50HomeCity', 'Lagged50PayCt', 'Lagged50TransCt', 'Lagged25Amt',
       'Lagged25ActCt', 'Lagged25HomeCity', 'Lagged25PayCt', 'Lagged25TransCt',
       'Lagged100Amt', 'Lagged100ActCt', 'Lagged100HomeCity', 'Lagged100PayCt',
       'Lagged100TransCt', 'Lagged50AmtDiff', 'Lagged50TransDiff'],
      dtype='object')


In [41]:
importance = clf.get_booster().get_score(importance_type='gain')
feat_imp = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values())
}).sort_values(by='importance', ascending=False)
feat_imp.sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
10,f10,13102.975586
13,f13,11025.09375
14,f14,10291.549805
9,f9,8384.625977
16,f16,2352.356445
38,f38,1890.258057
8,f8,1292.48645
35,f35,1286.801025
4,f4,1145.755371
18,f18,1126.357178


In [46]:
df = pd.read_csv("final_test.csv")
test_set = pd.merge(df, lag, on="TxnID", how="left")
X_df = test_set[imp_labels]
X = X_df.to_numpy()

In [47]:
# Make predictions
predictions = clf.predict(X)
#predictions = model.predict(X)

# Create submission file (format: TxnID,FraudLabel)
submission_df = pd.DataFrame({
    'TxnID': df['TxnID'],
    'FraudLabel': predictions
})

# Save predictions
submission_df.to_csv("retailbanking_challenge1_predictions2.csv", index=False)
print(len(X))
print(f"‚úÖ Predictions saved: {submission_df.shape[0]} predictions")
print(f"   Preview: {submission_df.head(3)}")
print(f"   Fraud rate: {predictions.mean():.3f} ({predictions.sum()} fraud cases out of {len(predictions)})")

151587
‚úÖ Predictions saved: 151587 predictions
   Preview:                                   TxnID  FraudLabel
0  c77cfd37-2a4f-41a2-ae45-696aaa2ec4e9           0
1  1e3da79d-891c-4769-af85-4b9bfc2aab7c           0
2  b1e5643f-3fee-41e6-b003-a360810dfa3d           0
   Fraud rate: 0.003 (479 fraud cases out of 151587)


In [48]:
from agentds import BenchmarkClient

client = BenchmarkClient(
    api_key="adsb_hdm1DRk1iW2I1VA84Oc9jz6z_1756090832",        # Get from your team dashboard
    team_name="agi"     # Your exact team name
)

# 3. Submit Predictions

# Submit predictions to the competition
print("üöÄ Submitting predictions...")

try:
    result = client.submit_prediction("Retailbanking", 1, "retailbanking_challenge1_predictions2.csv")

    if result['success']:
        print("‚úÖ Submission successful!")
        print(f"   üìä Score: {result['score']:.4f}")
        print(f"   üìè Metric: {result['metric_name']}")
        print(f"   ‚úîÔ∏è  Validation: {'Passed' if result['validation_passed'] else 'Failed'}")
    else:
        print("‚ùå Submission failed!")
        print(f"   Error details: {result.get('details', {}).get('validation_errors', 'Unknown error')}")

except Exception as e:
    print(f"üí• Submission error: {e}")
    print("üîß Check your API key and team name are correct!")

print("\nüéØ Next steps:")
print("   1. Try incorporating relevant information outside this table!")
print("   2. Move on to Retail Banking Challenge 2!")

üöÄ Submitting predictions...
‚úÖ Prediction submitted successfully!
üìä Score: 0.4997 (Macro-F1)
‚úÖ Validation passed
‚úÖ Submission successful!
   üìä Score: 0.4997
   üìè Metric: Macro-F1
   ‚úîÔ∏è  Validation: Passed

üéØ Next steps:
   1. Try incorporating relevant information outside this table!
   2. Move on to Retail Banking Challenge 2!
