In [2]:
import pandas as pd, numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV

TARGET_VAR = "LoanApproved"

In [6]:
DATA_PATH = "/kaggle/input/train-ddd/train_c.csv"
df = pd.read_csv(DATA_PATH)

In [7]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

TOP_K = 8
DEGREE = 4

X = df.copy()
X = X[~X[TARGET_VAR].isna()].copy()

categorial_feature = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus', 'EducationLevel']
X_num = X.select_dtypes(include=[np.number]).drop(columns=[TARGET_VAR], errors="ignore")
X_categorial = X[categorial_feature].copy()
y = X[TARGET_VAR].astype(float).values

quantile_min = X_num.quantile(0.01)
quantile_max = X_num.quantile(0.99)
X_wins = X_num.clip(lower=quantile_min, upper=quantile_max, axis=1)

imp = SimpleImputer(strategy="median")
X_imp = pd.DataFrame(imp.fit_transform(X_wins), columns=X_wins.columns)


def split_date(df):
    s = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    return pd.DataFrame({
        'year': s.dt.year,
        'quarter': s.dt.quarter,
        'month': s.dt.month,
        'dow': s.dt.dayofweek
    }, index=df.index)


scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X_imp), columns=X_imp.columns)

corr = X_scaled.assign(LoanApproved=y).corr()[TARGET_VAR].drop(TARGET_VAR).abs().sort_values(ascending=False)
top_numeric = corr.head(TOP_K).index.tolist()
other_columns = [c for c in X_scaled.columns if c not in top_numeric]

poly = PolynomialFeatures(degree=DEGREE)
X_poly = pd.DataFrame(poly.fit_transform(X_scaled[top_numeric]), columns=poly.get_feature_names_out(top_numeric))

dates = split_date(X)

X_cat_dummies = pd.get_dummies(X_categorial, drop_first=True)
X_final = pd.concat(
    [X_scaled[other_columns].reset_index(drop=True),
     X_poly.reset_index(drop=True),
     X_cat_dummies.reset_index(drop=True),
     dates.reset_index(drop=True)],
    axis=1)
print(
    f"Frame: {X_final.shape[1]} features (other columns: {len(other_columns)}, polynomial on {TOP_K} cols, {DEGREE} degree)")

X_train, X_validation, y_train, y_validation = train_test_split(X_final, y, train_size=0.8)
clf = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
)


param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0],
    "class_weight": [None, "balanced"],
}

grid = GridSearchCV(
    clf,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=4,
)

grid.fit(X_final, y)

best_clf = grid.best_estimator_

val_proba = best_clf.predict_proba(X_validation)[:, 1]
train_proba = best_clf.predict_proba(X_train)[:, 1]

train_auc = roc_auc_score(y_train, train_proba)
val_auc = roc_auc_score(y_validation, val_proba)

print(f"TRAIN ROC-AUC: {train_auc:.4f}")
print(f"VALIDATION ROC-AUC: {val_auc:.4f}")


  return op(a, b)
  return op(a, b)


Frame: 535 features (other columns: 20, polynomial on 8 cols, 4 degree)
TRAIN ROC-AUC: 0.9871
VALIDATION ROC-AUC: 0.9872


In [3]:
from typing import Any


def split_date(df):
    s = pd.to_datetime(df['ApplicationDate'], errors='coerce')
    return pd.DataFrame({
        'year': s.dt.year,
        'quarter': s.dt.quarter,
        'month': s.dt.month,
        'dow': s.dt.dayofweek
    }, index=df.index)


categorial_feature = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus', 'EducationLevel']


def fit_model(train_path, top_k, degree) -> dict[str, Any]:
    df = pd.read_csv(train_path).copy()

    df = df[~df[TARGET_VAR].isna()].copy()

    X_num = df.select_dtypes(include=[np.number]).drop(columns=[TARGET_VAR], errors="ignore")
    X_categorial = df[categorial_feature].copy()
    y = df[TARGET_VAR].astype(float).values

    quantile_min = X_num.quantile(0.01)
    quantile_max = X_num.quantile(0.99)
    X_wins = X_num.clip(lower=quantile_min, upper=quantile_max, axis=1)

    imp = SimpleImputer(strategy="median").fit(X_wins)
    X_imp = pd.DataFrame(imp.transform(X_wins), columns=X_wins.columns)

    scaler = StandardScaler().fit(X_imp)
    X_scaled = pd.DataFrame(scaler.transform(X_imp), columns=X_imp.columns)

    corr = X_scaled.assign(LoanApproved=y).corr()[TARGET_VAR].drop(TARGET_VAR).abs().sort_values(ascending=False)
    top_numeric = corr.head(top_k).index.tolist()
    other_columns = [c for c in X_scaled.columns if c not in top_numeric]

    poly = PolynomialFeatures(degree=degree).fit(X_scaled[top_numeric])
    X_poly = pd.DataFrame(poly.transform(X_scaled[top_numeric]),
                          columns=poly.get_feature_names_out(top_numeric))

    dates = split_date(df)
    X_cat_dummies = pd.get_dummies(X_categorial, drop_first=True)
    X_final = pd.concat([
        X_scaled[other_columns].reset_index(drop=True),
        X_poly.reset_index(drop=True),
        X_cat_dummies.reset_index(drop=True),
        dates.reset_index(drop=True)],
        axis=1)

    param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0],
    "class_weight": [None, "balanced"],
    }
    clf = LogisticRegression(
        penalty="l2",
        solver="liblinear",
        max_iter=1000,
    )

    grid = GridSearchCV(
        clf,
        param_grid=param_grid,
        scoring="roc_auc",
        cv=5,
        n_jobs=4,
    )
    grid.fit(X_final, y)
    
    best_clf = grid.best_estimator_

    model_fit_data = {
        "numeric_columns": X_num.columns.tolist(), "other_columns": other_columns, "top_numeric": top_numeric,
        "quantile_min": quantile_min, "quantile_max": quantile_max, "scaler": scaler, "imputer": imp, "poly": poly,
        "model": best_clf
    }
    return model_fit_data

In [8]:
def predict_data(test_path, model_fit_data, out_path="predictions.csv"):
    df = pd.read_csv(test_path).copy()

    X_num = df.reindex(columns=model_fit_data["numeric_columns"])
    X_categorial = df[categorial_feature].copy()
    X_wins = X_num.clip(lower=model_fit_data["quantile_min"], upper=model_fit_data["quantile_max"], axis=1)
    X_imputer = pd.DataFrame(model_fit_data["imputer"].transform(X_wins), columns=model_fit_data["numeric_columns"])
    X_scaled = pd.DataFrame(model_fit_data["scaler"].transform(X_imputer), columns=model_fit_data["numeric_columns"])

    other_columns = model_fit_data["other_columns"]
    top_numeric = model_fit_data["top_numeric"]
    X_poly = pd.DataFrame(
        model_fit_data["poly"].transform(X_scaled[top_numeric]),
        columns=model_fit_data["poly"].get_feature_names_out(top_numeric)
    )

    dates = split_date(df)
    X_cat_dummies = pd.get_dummies(X_categorial, drop_first=True)
    X_final = pd.concat([X_scaled[other_columns].reset_index(drop=True), X_poly.reset_index(drop=True),
                         X_cat_dummies.reset_index(drop=True), dates.reset_index(drop=True)], axis=1)

    model = model_fit_data["model"]
    proba = model.predict_proba(X_final)[:, 1]

    output = pd.DataFrame({
        "ID": df["ID"].values,
        "LoanApproved": proba
    })
    output.to_csv(out_path, index=False)

    return output

In [9]:
model_fit_data = fit_model("/kaggle/input/train-ddd/train_c.csv", top_k=8, degree=4)
sub = predict_data("/kaggle/input/train-ddd/test_c.csv", model_fit_data, out_path="predictions.csv")
print(sub.head())


  return op(a, b)
  return op(a, b)


   ID  LoanApproved
0   0      0.992848
1   1      0.018712
2   2      0.999728
3   3      0.999881
4   4      0.998898
