In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import joblib
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from imblearn.combine import SMOTETomek
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier

def preprocess_data(df):
    """Encode, engineer features, drop incomplete rows and rebalance the classes.

    The input frame is copied first, so the caller's DataFrame is left untouched.

    Processing order:
      1. Ordinal-encode ``education_level`` and ``grade_subgrade`` (values not in
         the lookup tables become NaN).
      2. One-hot encode ``marital_status`` and ``gender``.
      3. Replace ``employment_status`` and ``loan_purpose`` with the relative
         frequency of each value *within this frame*.
      4. Add ratio features whenever all of their source columns are present.
      5. Separate the ``loan_paid_back`` target and drop ``id`` if present.
      6. Drop every row that has a missing feature or a missing target.
      7. Resample with ``SMOTETomek(random_state=42)``.

    Every column other than the target is optional; steps that need an absent
    column are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table that must contain a ``loan_paid_back`` column.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` exactly as returned by
        ``SMOTETomek.fit_resample``.

    Raises
    ------
    KeyError
        If ``loan_paid_back`` is not a column of ``df``.
    """
    if 'loan_paid_back' not in df.columns:
        raise KeyError("preprocess_data requires a 'loan_paid_back' target column")

    # Work on a private copy: the column assignments below would otherwise
    # overwrite the caller's raw data and make a second call behave differently.
    df = df.copy()

    education_map = {'High School': 1, 'Other': 2, "Bachelor's": 3, "Master's": 4, 'PhD': 5}
    if 'education_level' in df.columns:
        df['education_level'] = df['education_level'].map(education_map)

    one_hot_cols = [col for col in ('marital_status', 'gender') if col in df.columns]
    if one_hot_cols:
        df = pd.get_dummies(df, columns=one_hot_cols, dtype=int)

    for col in ('employment_status', 'loan_purpose'):
        if col in df.columns:
            df[col] = df[col].map(df[col].value_counts(normalize=True))

    if 'grade_subgrade' in df.columns:
        # 'A1' -> 1, 'A2' -> 2, ..., 'F5' -> 30
        grade_labels = [f'{grade}{sub}' for grade in 'ABCDEF' for sub in range(1, 6)]
        grade_subgrade_order = {label: rank for rank, label in enumerate(grade_labels, start=1)}
        df['grade_subgrade'] = df['grade_subgrade'].map(grade_subgrade_order)

    # feature name -> (columns it needs, how to compute it); insertion order is
    # the order in which the new columns are appended.
    engineered_features = {
        'income_to_loan_ratio': (
            ['annual_income', 'loan_amount'],
            lambda d: d['annual_income'] / (d['loan_amount'] + 1),
        ),
        'debt_burden': (
            ['debt_to_income_ratio', 'loan_amount'],
            lambda d: d['debt_to_income_ratio'] * d['loan_amount'],
        ),
        'interest_burden': (
            ['interest_rate', 'loan_amount', 'annual_income'],
            # A zero income would yield +/-inf, which slips past the null mask
            # below and makes the resampler fail. Turning the zero denominator
            # into NaN lets that row be dropped with the other incomplete rows.
            lambda d: d['interest_rate'] * d['loan_amount'] / d['annual_income'].replace(0, np.nan),
        ),
        'credit_utilization': (
            ['debt_to_income_ratio', 'credit_score'],
            lambda d: d['debt_to_income_ratio'] * (1 - d['credit_score'] / 850),
        ),
        'payment_capacity': (
            ['annual_income', 'debt_to_income_ratio'],
            lambda d: (d['annual_income'] / 12) * (1 - d['debt_to_income_ratio']),
        ),
    }
    for feature_name, (required_cols, compute) in engineered_features.items():
        if all(col in df.columns for col in required_cols):
            df[feature_name] = compute(df)

    columns_to_drop = ['loan_paid_back']
    if 'id' in df.columns:
        columns_to_drop.append('id')

    X = df.drop(columns=columns_to_drop)
    y = df['loan_paid_back']

    # Keep only rows that are complete in both the features and the target.
    complete_rows = X.notnull().all(axis=1) & y.notnull()
    X = X[complete_rows]
    y = y[complete_rows]

    smt = SMOTETomek(random_state=42)
    X_resampled, y_resampled = smt.fit_resample(X, y)

    return X_resampled, y_resampled

def state_of_the_art_ensemble(X, y, cv=5):
    """Train a two-level stacked ensemble and return all fitted estimators.

    Level 0 consists of two XGBoost models, one LightGBM model and one CatBoost
    model. Level 1 consists of a logistic regression and a small XGBoost model,
    each trained on a matrix with one column per base model holding that
    model's positive-class probability.

    The level-1 training matrix is built from *out-of-fold* predictions
    (``cross_val_predict``), so the meta-learners never see a probability that
    a base model produced for a row it was trained on. Feeding in-sample
    probabilities from high-capacity boosters makes the meta-learners fit
    near-perfect inputs that do not occur at prediction time. After the
    out-of-fold pass every base model is fitted on the full data, which is the
    version that gets returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features.
    y : array-like of shape (n_samples,)
        Binary training target.
    cv : int, cross-validation splitter, or None, default=5
        Passed to ``cross_val_predict``. ``None`` or an integer below 2
        disables cross-validation and builds the meta features from in-sample
        predictions of the fully fitted base models.

    Returns
    -------
    dict
        ``{'base_models': {name: fitted estimator},
        'meta_learners': {name: fitted estimator}}``.
    """
    base_models = {
        'xgb1': XGBClassifier(n_estimators=1200, learning_rate=0.01, max_depth=8, random_state=42, verbosity=0),
        'xgb2': XGBClassifier(n_estimators=1000, learning_rate=0.02, max_depth=6, random_state=43, verbosity=0),
        'lgb1': LGBMClassifier(n_estimators=1200, learning_rate=0.01, max_depth=8, random_state=42, verbose=-1),
        'catboost': CatBoostClassifier(iterations=1200, learning_rate=0.01, depth=8, random_state=42, verbose=False)
    }

    use_out_of_fold = cv is not None and not (isinstance(cv, int) and cv < 2)

    meta_columns = []
    for model in base_models.values():
        if use_out_of_fold:
            # cross_val_predict works on clones, so `model` itself is still
            # unfitted after this call and is trained on all rows just below.
            fold_proba = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
            meta_columns.append(fold_proba[:, 1])
        model.fit(X, y)
        if not use_out_of_fold:
            meta_columns.append(model.predict_proba(X)[:, 1])

    meta_features = np.column_stack(meta_columns)

    meta_learners = {
        'logistic': LogisticRegression(random_state=42, max_iter=1000, C=0.1),
        'xgb_meta': XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=4, random_state=42, verbosity=0)
    }

    for meta_model in meta_learners.values():
        meta_model.fit(meta_features, y)

    return {
        'base_models': base_models,
        'meta_learners': meta_learners
    }


# Load the training table; the path is resolved against the notebook's working directory.
df = pd.read_csv("train.csv")

# Build the resampled feature matrix / target, then train the stacked ensemble on them.
X, y = preprocess_data(df)
ensemble_models = state_of_the_art_ensemble(X, y)
# Persist the dict of fitted base models and meta-learners in a single file.
joblib.dump(ensemble_models, "ensemble_models.pkl")
