In [1]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

SEED = 42
N_FOLDS = 5
TE_ALPHA = 10.0

class CreditScoreModel:
    """Cross-validated MLP classifier for a three-class credit label.

    The pipeline reads a train and a test CSV, cleans a fixed list of
    numeric columns, adds a per-client target encoding, trains one
    ``MLPClassifier`` per stratified fold and writes the test predictions
    to ``submission_simple.csv``.
    """

    # Raw cell contents that stand for "missing" when they fill the whole cell.
    _NAN_TOKENS = ("__-333333333333333333333333333__", "_", "-100")
    # Numeric form of the "-100" placeholder, for columns pandas already parsed.
    _NUMERIC_SENTINEL = -100
    # Everything except digits, the decimal point and the minus sign is junk.
    _JUNK_CHARS = r"[^\d.-]"
    # Matches durations such as "22 Years and 1 Months".
    _DURATION_RE = re.compile(r"\s*(\d+)\s+Years?\s+and\s+(\d+)\s+Months?")

    def __init__(self, train_path, test_path):
        """Store the CSV locations and the MLP hyper-parameters.

        Args:
            train_path: Path of the labelled training CSV.
            test_path: Path of the CSV to predict.
        """
        self.train_path = train_path
        self.test_path = test_path
        self.model_params = {
            'hidden_layer_sizes': (256, 128),
            'max_iter': 500,
            'learning_rate_init': 0.001,
            'early_stopping': True
        }

    def _clean_numeric(self, series):
        """Convert a dirty column to numbers, mapping artifacts to NaN.

        Args:
            series: Column to clean; either raw strings or already numeric.

        Returns:
            A numeric ``pd.Series`` with the same index. Placeholder tokens
            and values that cannot be parsed become NaN.
        """
        is_numeric = pd.api.types.is_numeric_dtype(series)
        if is_numeric and not pd.api.types.is_bool_dtype(series):
            # Already parsed by pandas: do not round-trip through strings,
            # because floats rendered in exponent form ("1e-05", "1e+16")
            # would lose their "e" to the junk filter and be corrupted.
            values = series.astype("float64")
            return values.mask(values == self._NUMERIC_SENTINEL)

        text = series.astype(str)
        # Placeholders are matched against the whole cell *before* stripping,
        # so e.g. "-100_" is still parsed as the number -100.
        text = text.mask(text.isin(self._NAN_TOKENS))
        # Strips decorations such as the underscores in "__10000__" or "12_".
        text = text.str.replace(self._JUNK_CHARS, "", regex=True)
        return pd.to_numeric(text, errors="coerce")

    def _parse_duration(self, val):
        """Convert "<Y> Years and <M> Months" into a number of months.

        Args:
            val: Raw cell value.

        Returns:
            ``Y * 12 + M`` as an int, or NaN when the value is missing or
            does not start with the expected pattern.
        """
        if pd.isna(val):
            return np.nan
        found = self._DURATION_RE.match(str(val))
        if found is None:
            return np.nan
        years, months = (int(part) for part in found.groups())
        return years * 12 + months

    def preprocess(self, df):
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Steps: parse the credit-history duration into ``History_Months``,
        clean the known numeric columns, apply ``log1p`` to the monetary
        columns, and drop identifier / target columns.

        Args:
            df: Raw train or test frame. Columns that are absent are skipped.

        Returns:
            The transformed ``pd.DataFrame``.
        """
        df = df.copy()

        history_col = "Возраст_кредитной_истории"
        if history_col in df.columns:
            df["History_Months"] = df[history_col].apply(self._parse_duration)
            df = df.drop(columns=[history_col])

        cols_to_clean = [
            "Колво_отсроченных_платежей", "Месячный_баланс", "Годовой_доход",
            "Колво_займов", "Оставшийся_долг", "Сумма_инвестиций",
            "Изменение_кредитного_лимита", "Сумма_ежемесячных_выплат", "Месячная_зарплата"
        ]
        for col in cols_to_clean:
            if col in df.columns:
                df[col] = self._clean_numeric(df[col])

        log_cols = [
            "Месячный_баланс", "Годовой_доход", "Сумма_ежемесячных_выплат",
            "Сумма_инвестиций", "Оставшийся_долг", "Месячная_зарплата"
        ]
        for col in log_cols:
            if col in df.columns:
                # Cast first so float results are never written into an
                # integer column. Negative values are clipped to 0 before the
                # log; NaN passes through both clip and log1p unchanged.
                df[col] = np.log1p(df[col].astype("float64").clip(lower=0))

        drop_cols = ["SSN", "Клиент_Инфо", "ID", "Customer_ID", "Name", "ID_записи", "Профессия",
                     "TARGET", "Кредитный_микс", "Credit_Mix", "TARGET_CLEAN"]
        return df.drop(columns=drop_cols, errors="ignore")

    def get_te_features(self, X_ids, y, n_classes, alpha=10.0):
        """Build smoothed per-id class probabilities (target encoding).

        For every id the class counts are shrunk towards the global class
        distribution: ``(counts + alpha * global) / (n_rows + alpha)``.

        Args:
            X_ids: Sequence of ids, one per row.
            y: Integer class labels in ``[0, n_classes)``, aligned with
                ``X_ids``.
            n_classes: Total number of classes.
            alpha: Smoothing strength; larger values pull harder towards the
                global distribution.

        Returns:
            ``(mapping, global_probs)`` where ``mapping`` maps each id to an
            array of ``n_classes`` probabilities ordered by class index, and
            ``global_probs`` is the overall class distribution.
        """
        frame = pd.DataFrame({'id': X_ids, 'target': y})
        class_counts = np.bincount(y, minlength=n_classes)
        global_probs = class_counts / class_counts.sum()

        per_id = frame.groupby('id')['target'].value_counts().unstack(fill_value=0)
        # Force columns into class order 0..n_classes-1. Appending a missing
        # class at the end would misalign the counts with global_probs.
        per_id = per_id.reindex(columns=range(n_classes), fill_value=0)

        counts_val = per_id.values
        row_totals = counts_val.sum(axis=1, keepdims=True)
        probs = (counts_val + alpha * global_probs) / (row_totals + alpha)

        return dict(zip(per_id.index, probs)), global_probs

    def apply_te(self, ids, mapping, global_probs, n_classes):
        """Look up the encoded probabilities for each id.

        Args:
            ids: Sequence of ids to encode.
            mapping: Id-to-probabilities dict from ``get_te_features``.
            global_probs: Fallback row for ids missing from ``mapping``.
            n_classes: Number of classes (width of the result).

        Returns:
            Float array of shape ``(len(ids), n_classes)``.
        """
        rows = [mapping.get(uid, global_probs) for uid in ids]
        # reshape keeps the (0, n_classes) shape when ``ids`` is empty.
        return np.array(rows, dtype=float).reshape(len(ids), n_classes)

    def run(self):
        """Train with stratified K-fold CV and write ``submission_simple.csv``.

        Prints the macro-F1 of every fold and of the out-of-fold predictions.
        Test probabilities are averaged over the folds before the argmax.
        """
        print("Loading data...")
        train = pd.read_csv(self.train_path)
        test = pd.read_csv(self.test_path)

        # Target selection: 'TARGET' wins, then 'Credit_Mix', else the default.
        target_col = 'Кредитный_микс'
        if 'TARGET' in train.columns:
            target_col = 'TARGET'
        elif 'Credit_Mix' in train.columns:
            target_col = 'Credit_Mix'

        # Rows without a usable label ('_' is a placeholder) are discarded.
        data = train.dropna(subset=[target_col])
        data = data[data[target_col] != '_'].copy()

        # NOTE(review): 'Bad' is relabelled to 'Poor' and the submission below
        # carries the relabelled names - confirm this matches the labels the
        # consumer of the submission expects.
        mapper = {'Bad': 'Poor', 'Standard': 'Standard', 'Good': 'Good'}
        data['TARGET_CLEAN'] = data[target_col].map(mapper).fillna(data[target_col])
        data = data[data['TARGET_CLEAN'].isin(['Standard', 'Good', 'Poor'])]

        le = LabelEncoder()
        y = le.fit_transform(data['TARGET_CLEAN'])
        n_classes = len(le.classes_)

        print("Preprocessing...")
        X = self.preprocess(data)
        X_test = self.preprocess(test)

        id_col = 'ID_клиента'
        if id_col not in X.columns:
            X[id_col] = data[id_col]
            X_test[id_col] = test[id_col]

        X_ids = X[id_col].astype(str).values
        test_ids = X_test[id_col].astype(str).values

        X_feats = X.drop(columns=[id_col])
        X_test_feats = X_test.drop(columns=[id_col])

        # Align the test columns with train: extra test columns are dropped,
        # columns missing from test are filled with 0. reindex returns a new
        # frame, so the assignments below never write into a slice of X_test.
        absent_in_test = [c for c in X_feats.columns if c not in X_test_feats.columns]
        X_test_feats = X_test_feats.reindex(columns=X_feats.columns)
        for c in absent_in_test:
            X_test_feats[c] = 0

        skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
        folds = list(skf.split(X_feats, y))

        oof_preds = np.zeros((len(X), n_classes))
        test_preds = np.zeros((len(X_test), n_classes))

        # The transformers are re-fitted on the training part of every fold.
        imputer = SimpleImputer(strategy='median')
        scaler = StandardScaler()
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, min_frequency=0.01)

        num_cols = X_feats.select_dtypes(include=np.number).columns
        cat_cols = X_feats.select_dtypes(include='object').columns

        print("Training...")

        for i, (tr_idx, val_idx) in enumerate(folds, 1):
            X_tr, y_tr = X_feats.iloc[tr_idx], y[tr_idx]
            X_val = X_feats.iloc[val_idx]

            # 1. Target encoding, fitted on the training part of the fold.
            # NOTE(review): training rows are encoded with statistics that
            # include their own label, while validation/test rows are not, and
            # the folds are not grouped by client id. If a client has several
            # rows this can inflate the CV score - consider out-of-fold
            # encoding and/or a grouped split.
            mapping, global_probs = self.get_te_features(X_ids[tr_idx], y_tr, n_classes, TE_ALPHA)
            te_tr = self.apply_te(X_ids[tr_idx], mapping, global_probs, n_classes)
            te_val = self.apply_te(X_ids[val_idx], mapping, global_probs, n_classes)
            te_test = self.apply_te(test_ids, mapping, global_probs, n_classes)

            # 2. Numeric features: median imputation, then standardisation.
            imputer.fit(X_tr[num_cols])
            num_tr = scaler.fit_transform(imputer.transform(X_tr[num_cols]))
            num_val = scaler.transform(imputer.transform(X_val[num_cols]))
            num_test = scaler.transform(imputer.transform(X_test_feats[num_cols]))

            # 3. Categorical features: one-hot with an explicit missing level.
            cat_tr = ohe.fit_transform(X_tr[cat_cols].fillna('MISSING'))
            cat_val = ohe.transform(X_val[cat_cols].fillna('MISSING'))
            cat_test = ohe.transform(X_test_feats[cat_cols].fillna('MISSING'))

            X_train_fold = np.hstack([num_tr, cat_tr, te_tr])
            X_val_fold = np.hstack([num_val, cat_val, te_val])
            X_test_fold = np.hstack([num_test, cat_test, te_test])

            # A different seed per fold decorrelates the fold models.
            model = MLPClassifier(random_state=SEED + i, **self.model_params)
            model.fit(X_train_fold, y_tr)

            val_p = model.predict_proba(X_val_fold)
            oof_preds[val_idx] = val_p
            test_preds += model.predict_proba(X_test_fold)

            print(f"Fold {i}: {f1_score(y[val_idx], val_p.argmax(axis=1), average='macro'):.4f}")

        # Turn the accumulated sum into a mean so the rows are probabilities
        # again; the argmax below is unaffected by the scaling.
        test_preds /= N_FOLDS

        print(f"CV Score: {f1_score(y, oof_preds.argmax(axis=1), average='macro'):.4f}")

        final_classes = le.inverse_transform(test_preds.argmax(axis=1))
        id_out = 'ID_записи' if 'ID_записи' in test.columns else 'ID'
        sub = pd.DataFrame({'ID': test[id_out], 'TARGET': final_classes})
        sub.to_csv('submission_simple.csv', index=False)
        print("Saved.")

# Fit on train.csv, predict test.csv and write the submission file.
model = CreditScoreModel(train_path='train.csv', test_path='test.csv')
model.run()


Loading data...
Preprocessing...
Training...
Fold 1: 0.7839
Fold 2: 0.7892
Fold 3: 0.7806
Fold 4: 0.7947
Fold 5: 0.7776
CV Score: 0.7853
Saved.
