V3 only uses numerical features

V2 converts categorical features to numerical types, without standardization

V1 converts categorical features to numerical types and standardizes all features

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold

In [None]:
# Load the training data, the test data and the sample-submission template
# from the Kaggle input directory, in that order.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s5e11/train.csv",
        "/kaggle/input/playground-series-s5e11/test.csv",
        "/kaggle/input/playground-series-s5e11/sample_submission.csv",
    )
)

In [None]:
# Class balance of the target: the share of each `loan_paid_back` value among
# the training rows, scaled from a fraction to a percentage.
value_distribution = train['loan_paid_back'].value_counts(normalize=True).mul(100)
print("distribution (%):", value_distribution, sep="\n")

In [None]:
# NOTE: despite its name, `cat` holds the *numeric* feature columns: every
# numeric-dtype column of `train` except the row id and the target.
cat = [
    col
    for col in train.select_dtypes(include=['number']).columns
    if col not in ('id', 'loan_paid_back')
]

In [None]:
# Feature matrix (the columns listed in `cat`) and the target cast to int.
X, y = train[cat], train['loan_paid_back'].astype(int)

In [None]:
# Stratified K-fold cross-validation of an L2-regularised logistic regression.
# Each fold refits the model on its training split, stores out-of-fold (OOF)
# predictions for its validation split, and adds its test-set probabilities to
# a running sum that is averaged over the folds at the end (`y_pred`).
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))      # OOF predicted class labels
oof_probas = np.zeros(len(X))     # OOF predicted positive-class probabilities
test_preds = np.zeros(len(test))  # test-set class predictions (optional; not filled in this cell)
test_proba = np.zeros(len(test))  # running sum of per-fold test-set probabilities
fold_scores = []


log_reg = LogisticRegression(
    random_state=42,
    max_iter=1000,
    solver='liblinear',  # author's note: suited to small datasets and binary classification
    penalty='l2',
    C=0.01
)

# The test feature matrix is the same for every fold, so select it once
# instead of re-slicing `test` on each iteration.
X_test = test[cat]

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"\nFold {fold+1}/{n_splits}")

    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The same estimator object is refit on each fold's training split.
    log_reg.fit(X_tr, y_tr)

    val_pred = log_reg.predict(X_val)
    val_proba = log_reg.predict_proba(X_val)[:, 1]  # positive-class probability

    # Each row belongs to exactly one validation split, so after the last fold
    # the OOF arrays hold one held-out prediction per training row.
    oof_preds[val_idx] = val_pred
    oof_probas[val_idx] = val_proba

    # Accumulate this fold's test probabilities; divided by n_splits below.
    test_proba += log_reg.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_val, val_pred)
    fold_scores.append(acc)
    print(f"Fold {fold+1} Accuracy: {acc:.6f}")

print("\n" + "="*50)
print(f"CV Accuracy Scores: {[f'{s:.6f}' for s in fold_scores]}")
# The reported spread is two standard deviations of the per-fold accuracies.
print(f"Mean CV Accuracy: {np.mean(fold_scores):.6f} (+/- {np.std(fold_scores)*2:.6f})")
print("="*50)

# Final test prediction: mean positive-class probability across the folds.
y_pred = test_proba / n_splits

In [None]:
# Write the fold-averaged probabilities into the submission template and save it.
# Bracket assignment is used because attribute-style assignment
# (`sub.loan_paid_back = ...`) only updates a column that already exists; if the
# column were missing it would set a plain Python attribute and the column
# would be absent from the written CSV.
sub['loan_paid_back'] = y_pred
sub.to_csv("submission.csv", index=False)
sub.head()