<a href="https://colab.research.google.com/github/Sg134-ch/Machine-Learning-Projects-/blob/main/ML_LAB_EXPERIMENT_04_BY_23102C0051.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
"""
FAST Binary Classification Solution - Optimized for Speed
Trains in under 2 minutes while maintaining high performance
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import warnings
import os
warnings.filterwarnings('ignore')

# Startup banner: the title framed by two 70-character rules, one item per line.
print("=" * 70, "FAST BINARY CLASSIFICATION SOLUTION", "=" * 70, sep="\n")

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================

def find_file(filename, search_paths):
    """Return the first existing ``<directory>/<filename>`` path, or None.

    Directories in *search_paths* are probed in the order given; the search
    stops at the first candidate that exists on disk. None is returned when
    no directory contains *filename* (including when *search_paths* is empty).
    """
    candidates = (os.path.join(directory, filename) for directory in search_paths)
    return next((candidate for candidate in candidates if os.path.exists(candidate)), None)

# Directories probed, in order, for train.csv / test.csv.
search_paths = ['.', '/mnt/user-data/uploads', '/home/claude']

print("\nLoading data...")

# Find train.csv
train_path = find_file('train.csv', search_paths)
if train_path:
    train = pd.read_csv(train_path)
    print(f"✓ Train: {train.shape}")
else:
    # Fallback: fetch the UCI bank-marketing archive and use it as training data.
    print("✗ train.csv not found - downloading UCI dataset...")
    try:
        import urllib.request
        import zipfile
        import io

        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
        # Context managers make sure the HTTP response and the archive are
        # closed even if reading or parsing fails part-way.
        with urllib.request.urlopen(url, timeout=30) as response:
            archive_bytes = response.read()

        with zipfile.ZipFile(io.BytesIO(archive_bytes)) as zip_file:
            with zip_file.open('bank-additional/bank-additional-full.csv') as f:
                train = pd.read_csv(f, sep=';')

        # Convert the 'yes'/other label into 1/0 and add a row id if absent,
        # so the frame has the 'y' and 'id' columns the rest of the script uses.
        train['y'] = (train['y'] == 'yes').astype(int)
        if 'id' not in train.columns:
            train.insert(0, 'id', range(len(train)))

        print(f"✓ Downloaded UCI dataset: {train.shape}")
    except Exception as e:
        print(f"ERROR: {e}")
        # Raise SystemExit directly instead of calling the `site`-provided
        # exit() helper, which is meant for interactive shells and is not
        # guaranteed to be available or to stop execution in every host.
        raise SystemExit(1) from e

# Find test.csv
test_path = find_file('test.csv', search_paths)
if test_path:
    test = pd.read_csv(test_path)
    print(f"✓ Test: {test.shape}")
else:
    print("ERROR: test.csv not found!")
    # No fallback exists for the test set; stop with a non-zero status.
    raise SystemExit(1)

# ============================================================================
# STEP 2: QUICK FEATURE ENGINEERING
# ============================================================================

print("\nFeature engineering...")

# Separate predictors from the target and the row identifiers.
# errors='ignore' lets each drop succeed even when a listed column is absent.
X_train = train.drop(columns=['y', 'id'], errors='ignore')
y_train = train['y']
X_test = test.drop(columns=['id'], errors='ignore')
test_ids = test['id']

def quick_features(df):
    """Return a copy of *df* with derived indicator/interaction columns added.

    A derived column is only created when the column(s) it is computed from
    exist in *df*; the input frame itself is never modified. New columns, in
    the order they are appended:

    - ``duration_log``: ``log1p`` of ``duration``
    - ``long_call``: 1 where ``duration`` exceeds 300, else 0
    - ``contacted_before``: 1 where ``pdays`` differs from -1, else 0
    - ``multi_contact``: 1 where ``campaign`` exceeds 1, else 0
    - ``age_balance``: product of ``age`` and ``balance``
    - ``balance_pos``: 1 where ``balance`` is above 0, else 0
    """
    out = df.copy()
    # Snapshot of the input column names; none of the derived names collide
    # with the source names checked below, so one snapshot is enough.
    present = set(out.columns)

    if 'duration' in present:
        duration = out['duration']
        out['duration_log'] = np.log1p(duration)
        out['long_call'] = duration.gt(300).astype(int)

    if 'pdays' in present:
        out['contacted_before'] = out['pdays'].ne(-1).astype(int)

    if 'campaign' in present:
        out['multi_contact'] = out['campaign'].gt(1).astype(int)

    has_balance = 'balance' in present
    if has_balance and 'age' in present:
        out['age_balance'] = out['age'].mul(out['balance'])

    if has_balance:
        out['balance_pos'] = out['balance'].gt(0).astype(int)

    return out

X_train = quick_features(X_train)
X_test = quick_features(X_test)

# Sentinel written in place of missing values and of test categories that
# never occurred in the training data.
MISSING_SENTINEL = -999

# Encode categoricals
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    # LabelEncoder.transform raises ValueError on labels it was not fitted
    # on, so test values are looked up in the fitted classes instead. Seen
    # labels get exactly the code transform() would assign; unseen labels
    # fall back to the sentinel rather than aborting the run.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = (
        X_test[col]
        .astype(str)
        .map(code_by_label)
        .fillna(MISSING_SENTINEL)
        .astype(int)
    )

# Fill missing
X_train = X_train.fillna(MISSING_SENTINEL)
X_test = X_test.fillna(MISSING_SENTINEL)

print(f"Features: {X_train.shape[1]}")
print(f"Samples: {len(X_train):,} train, {len(X_test):,} test")

# ============================================================================
# STEP 3: FAST MODEL TRAINING (2 models only)
# ============================================================================

print("\n" + "="*70)
print("Training models...")
print("="*70)


def _cv_fold_aucs(model, X, y, splitter):
    """Return the validation ROC AUC of *model* for each fold of *splitter*.

    For every (train, validation) index pair produced by
    ``splitter.split(X, y)`` the model is refitted on the training rows and
    scored on the positive-class probability (column 1 of ``predict_proba``)
    for the validation rows. *X* and *y* are indexed positionally via
    ``.iloc``. The same *model* instance is refitted on each fold.
    """
    fold_scores = []
    for train_idx, val_idx in splitter.split(X, y):
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        val_pred = model.predict_proba(X.iloc[val_idx])[:, 1]
        fold_scores.append(roc_auc_score(y.iloc[val_idx], val_pred))
    return fold_scores


# Calculate class weight
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
if n_positive == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError.
    raise ValueError("y_train contains no positive (1) samples; cannot compute class ratio")
class_ratio = n_negative / n_positive

# Use only 3-fold CV for speed
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Model 1: LightGBM (fastest)
print("\n[1/2] LightGBM...")
lgbm = LGBMClassifier(
    n_estimators=100,  # Reduced from 300
    learning_rate=0.1,  # Increased for faster convergence
    max_depth=5,
    num_leaves=20,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

lgbm_scores = _cv_fold_aucs(lgbm, X_train, y_train, cv)
lgbm_cv = np.mean(lgbm_scores)
print(f"  CV ROC AUC: {lgbm_cv:.5f}")

# Refit on the full training set before predicting on the test set.
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict_proba(X_test)[:, 1]

# Model 2: XGBoost
print("\n[2/2] XGBoost...")
xgb = XGBClassifier(
    n_estimators=100,  # Reduced from 300
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=class_ratio,  # negatives / positives
    random_state=42,
    eval_metric='logloss',
    verbosity=0,
    n_jobs=-1
)

xgb_scores = _cv_fold_aucs(xgb, X_train, y_train, cv)
xgb_cv = np.mean(xgb_scores)
print(f"  CV ROC AUC: {xgb_cv:.5f}")

# Refit on the full training set before predicting on the test set.
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict_proba(X_test)[:, 1]

# ============================================================================
# STEP 4: ENSEMBLE
# ============================================================================

print("\n" + "=" * 70, "Creating ensemble...", "=" * 70, sep="\n")

# Weighted average
# Each model's weight is its share of the summed CV ROC AUC, so the two
# weights sum to 1 (up to floating-point rounding).
total_score = lgbm_cv + xgb_cv
lgbm_weight, xgb_weight = (cv_auc / total_score for cv_auc in (lgbm_cv, xgb_cv))

print(f"\nLightGBM weight: {lgbm_weight:.3f} (CV: {lgbm_cv:.5f})")
print(f"XGBoost weight:  {xgb_weight:.3f} (CV: {xgb_cv:.5f})")

# Blend the two models' test-set probabilities with those weights.
final_pred = lgbm_pred * lgbm_weight + xgb_pred * xgb_weight

# ============================================================================
# STEP 5: SAVE SUBMISSION
# ============================================================================

print("\n" + "="*70)
print("Saving submission...")
print("="*70)

# One row per test id with the blended probability as the prediction.
submission = pd.DataFrame({
    'id': test_ids,
    'y': final_pred
})

# Write the working-directory copy first so that a problem with the
# secondary output location cannot discard the finished predictions.
submission.to_csv('submission.csv', index=False)

output_dir = '/mnt/user-data/outputs'
output_path = os.path.join(output_dir, 'submission.csv')
try:
    os.makedirs(output_dir, exist_ok=True)
    submission.to_csv(output_path, index=False)
except OSError as e:
    # The secondary location is best-effort: report the failure and continue,
    # since the local copy has already been written.
    print(f"\n✗ Could not save to {output_path}: {e}")
else:
    print(f"\n✓ Saved to: {output_path}")
print("✓ Backup: submission.csv")

print("\n" + "="*70)
print("STATISTICS")
print("="*70)
print(f"Predictions: [{final_pred.min():.5f}, {final_pred.max():.5f}]")
print(f"Mean: {final_pred.mean():.5f}")
print(f"Expected CV ROC AUC: {np.mean([lgbm_cv, xgb_cv]):.5f}")

print("\n" + "="*70)
print("✓ COMPLETE! Submit submission.csv")
print("="*70)

FAST BINARY CLASSIFICATION SOLUTION

Loading data...
✓ Train: (750000, 18)
✓ Test: (250000, 17)

Feature engineering...
Features: 22
Samples: 750,000 train, 250,000 test

Training models...

[1/2] LightGBM...
  CV ROC AUC: 0.96080

[2/2] XGBoost...
  CV ROC AUC: 0.96114

Creating ensemble...

LightGBM weight: 0.500 (CV: 0.96080)
XGBoost weight:  0.500 (CV: 0.96114)

Saving submission...

✓ Saved to: /mnt/user-data/outputs/submission.csv
✓ Backup: submission.csv

STATISTICS
Predictions: [0.00121, 0.99776]
Mean: 0.23652
Expected CV ROC AUC: 0.96097

✓ COMPLETE! Submit submission.csv
