In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from train import load_dataset,train,eval
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [380]:
# Read the labelled training table from disk into a DataFrame.
train_csv_path = "data/datatrain.csv"
data = pd.read_csv(train_csv_path)

In [381]:
# Columns removed from the training table to obtain the feature matrix:
# the label column itself plus the listed identifier / date columns.
non_feature_columns = [
    'target',
    'User_ID',
    'Created At Year',
    'Created At time',
    "Created At Month",
    "Countries_ID",
]

# Label vector and feature matrix; both keep the row index of `data`.
Y = data['target']
X = data.drop(columns=non_feature_columns)

In [382]:
# Raw test table; kept intact because User_ID is needed for the submission.
test_df = pd.read_csv("data/datatest.csv")

# Test feature matrix: numeric columns only, minus the listed date/country
# columns.
# NOTE(review): unlike X, the column set here is chosen by dtype rather than
# by an explicit list -- confirm that it matches X's columns.
numeric_test_columns = test_df.select_dtypes(include=["number"])
test = numeric_test_columns.drop(
    columns=["Created At Year", "Created At Month", "Countries_ID"]
)

In [372]:
# Split returned by load_dataset (imported from the local `train` module).
# Presumably test_size=0.2 requests a 20% hold-out -- confirm in train.py.
dataset_splits = load_dataset(test_size=0.2)
X_train, X_test, y_train, y_test = dataset_splits

In [385]:
# ================================
# Stacked ensemble
#   1. Balanced Random Forest        (diagnostic reports only)
#   2. Logistic Regression           -> feature 'lg'
#   3. CatBoost on features + 'lg'   -> feature 'cat'
#   4. Mean of 'lg' and 'cat'        -> feature 'weighted_avg', then row filter
#   5. Final CatBoost on stacked features -> predictions1
#   6. Global out-of-fold report for the final model
#
# The cell never mutates X, Y or test: stacked features live in the copies
# X_stack / test_stack, and the filtered training set in X_final / Y_final.
# This keeps the cell idempotent (re-running it gives the same result) and
# leaves any X_train / y_train / y_test defined by other cells untouched.
# ================================
N_SPLITS = 5

# Rows with (Y - OOF 'cat' probability) <= this value are dropped before the
# final stage. With 0/1 labels that means label 0 with OOF probability >= 0.85.
OOF_RESIDUAL_CUTOFF = -0.85


def run_stratified_cv(model, features, target, test_features, label, seed, use_eval_set=False):
    """Fit `model` on stratified folds and collect out-of-fold / test predictions.

    Parameters
    ----------
    model : estimator exposing fit / predict / predict_proba.
        Re-fitted from scratch on every fold.
    features, target : DataFrame / Series indexed positionally via .iloc.
    test_features : DataFrame scored by every fold model.
    label : str
        Prefix for the per-fold progress line.
    seed : int
        random_state of the shuffled StratifiedKFold splitter.
    use_eval_set : bool
        When True, fit with the CatBoost-style arguments used in this notebook
        (eval_set=validation fold, verbose=100, early_stopping_rounds=100).
        NOTE: the validation fold then drives early stopping *and* is the fold
        the validation report is computed on.

    Returns
    -------
    (oof_proba, oof_pred, test_proba)
        oof_proba  : positive-class probability for each row, from the fold
                     in which the row was held out.
        oof_pred   : predicted class label for each row, same scheme.
        test_proba : positive-class probability on `test_features`, averaged
                     over the fold models.
    """
    splitter = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)
    oof_proba = np.zeros(len(features))
    oof_pred = np.zeros(len(features))
    fold_test_proba = []

    for fold, (trn_idx, val_idx) in enumerate(splitter.split(features, target)):
        print(f'{label} Fold {fold + 1}')
        X_trn, y_trn = features.iloc[trn_idx], target.iloc[trn_idx]
        X_val, y_val = features.iloc[val_idx], target.iloc[val_idx]

        if use_eval_set:
            model.fit(
                X_trn, y_trn,
                eval_set=(X_val, y_val),
                verbose=100,
                early_stopping_rounds=100,
            )
        else:
            model.fit(X_trn, y_trn)

        # Held-out fold: hard labels for the report, probabilities for stacking.
        val_pred = model.predict(X_val)
        oof_pred[val_idx] = np.ravel(val_pred)
        oof_proba[val_idx] = model.predict_proba(X_val)[:, 1]

        # Training fold predictions, printed next to the validation report so
        # the train/validation gap is visible.
        trn_pred = model.predict(X_trn)

        print("Training classification report:\n", classification_report(y_trn, trn_pred))
        print("Validation classification report:\n", classification_report(y_val, val_pred))

        fold_test_proba.append(model.predict_proba(test_features)[:, 1])

    return oof_proba, oof_pred, np.mean(fold_test_proba, axis=0)


# ================================
# 1. Balanced Random Forest
# ================================
# Run for its per-fold reports only; its predictions are not used as a
# stacking feature.
balanced_rf = BalancedRandomForestClassifier(n_estimators=200, max_depth=8, min_samples_leaf=4, random_state=42)
run_stratified_cv(balanced_rf, X, Y, test, label='BalancedRF', seed=297)


# ================================
# 2. Logistic Regression + StandardScaler
# ================================
logreg = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(class_weight='balanced', max_iter=10000))
])
lg_oof, _, lg_test = run_stratified_cv(logreg, X, Y, test, label='Logistic Regression', seed=15)

# Stacked features are added to copies so the original frames stay pristine.
X_stack = X.copy()
test_stack = test.copy()
X_stack['lg'] = lg_oof
test_stack['lg'] = lg_test


# ================================
# 3. CatBoost Classifier (stacking)
# ================================
cat_stacker = CatBoostClassifier(
    n_estimators=1000,
    learning_rate=0.0100800800100051124,
    depth=7,
    random_seed=0,
    auto_class_weights='Balanced',
    verbose=1
)
cat_oof, _, cat_test = run_stratified_cv(
    cat_stacker, X_stack, Y, test_stack,
    label='CatBoost (stacking)', seed=15, use_eval_set=True,
)
X_stack['cat'] = cat_oof
test_stack['cat'] = cat_test


# ================================
# 4. Averaged stacking feature + row filter
# ================================
# 'weighted_avg' is the plain (equal-weight) mean of the two stacked
# probabilities; the column name is kept for compatibility.
X_stack['weighted_avg'] = X_stack[['lg', 'cat']].mean(axis=1)
test_stack['weighted_avg'] = test_stack[['lg', 'cat']].mean(axis=1)

# Drop rows whose label minus the CatBoost OOF probability is at or below the
# cutoff.
# NOTE(review): the filter uses the labels, and the final-stage CV below runs
# on the filtered rows only, so its reports describe the filtered subset.
keep_mask = (Y - X_stack['cat']) > OOF_RESIDUAL_CUTOFF
X_final = X_stack[keep_mask]
Y_final = Y[keep_mask]


# ================================
# 5. Final CatBoost on stacked features
# ================================
model1 = CatBoostClassifier(
    n_estimators=2000,
    learning_rate=0.0010050000800100051124,
    depth=9,
    random_seed=0,
    auto_class_weights='Balanced',
    verbose=1
)
# `oof` holds hard class predictions (not probabilities) for the final model;
# `predictions1` is the fold-averaged test probability used by later cells.
_, oof, predictions1 = run_stratified_cv(
    model1, X_final, Y_final, test_stack,
    label='Final CatBoost', seed=15, use_eval_set=True,
)

# ================================
# 6. Global OOF classification report
# ================================
print("\n=== Global OOF Classification Report ===")
print(classification_report(Y_final, oof))


BalancedRF Fold 1
Training classification report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92      5187
           1       0.78      0.83      0.80      1961

    accuracy                           0.89      7148
   macro avg       0.86      0.87      0.86      7148
weighted avg       0.89      0.89      0.89      7148

Validation classification report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91      1297
           1       0.74      0.78      0.76       491

    accuracy                           0.86      1788
   macro avg       0.83      0.84      0.83      1788
weighted avg       0.87      0.86      0.87      1788

BalancedRF Fold 2
Training classification report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.92      5187
           1       0.79      0.82      0.80      1962

    accuracy                           0.89  

In [377]:
# Turn the averaged test probabilities into 0/1 labels: strictly above 0.5
# counts as positive. The cell then displays how many positives result.
is_positive = predictions1 > 0.5
predictions_bool = is_positive.astype('int')
predictions_bool.sum()

np.int64(186)

In [263]:
# NOTE(review): `clf` is not defined in any cell visible in this notebook, so
# this cell appears to rely on leftover kernel state and would fail on a
# fresh kernel -- confirm where `clf` is fitted, or remove this cell.
# It also scores X (the training feature frame), not the test frame.
predictions = clf.predict(X)

In [264]:
# Display the sum of `predictions` from the cell above (presumably the number
# of predicted positives, if the predictions are 0/1 labels -- confirm).
predictions.sum()

np.int64(225)

In [363]:
# Build the submission file.
# IDs in the submission are the test User_IDs with a '_Month_5' suffix.
submission_ids = test_df.User_ID + '_Month_5'
sub = pd.DataFrame({
    'User_ID_Next_month_Activity': submission_ids,
    'Active': predictions_bool,
})

# Use the sample submission as the authoritative list (and order) of IDs:
# its own 'Active' column is discarded and replaced by our predictions.
SampleSubmission = pd.read_csv("data/SampleSubmission.csv")
sub = (
    pd.merge(
        SampleSubmission.drop(columns='Active'),
        sub,
        on='User_ID_Next_month_Activity',
        how='left',
    )
    # IDs without a prediction come out of the left merge as NaN -> treat as 0.
    .fillna(0)
    # The NaNs introduced by the merge upcast 'Active' to float; cast back so
    # the CSV contains 0/1 rather than 0.0/1.0.
    .astype({'Active': 'int'})
)
sub.to_csv("stacked_submission.csv", index=False)
