## Попытка использовать всю информацию из обучения k-fold — лучший результат

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import mlflow
import mlflow.catboost
from datetime import datetime

#=== MLflow initialisation ===
tracking_uri = "http://localhost:5000"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Per_Prediction_Base model + Feature engineering")


def _encode_binary(frame, column, mapping):
    """Encode a categorical *column* of *frame* in place using *mapping*.

    ``Series.map`` is used instead of ``Series.replace`` so the result gets a
    numeric dtype directly (missing values stay NaN) without relying on
    pandas' deprecated implicit downcasting in ``replace``.
    """
    frame[column] = frame[column].map(mapping)


def _add_engineered_features(frame):
    """Add the derived features to *frame* in place and return it.

    Shared by the train and test pipelines so both always get the same
    feature definitions. Expects the yes/no columns to be numerically encoded
    and ``columns_to_fill`` to be imputed already;
    ``Drained_after_socializing`` may still contain NaN.
    """
    frame["is_Drained_NaN"] = frame["Drained_after_socializing"].isna().astype(int)
    frame["Social_Load"] = frame["Social_event_attendance"] / (frame["Time_spent_Alone"] + 1)
    frame["Outdoor_Social"] = frame["Going_outside"] * frame["Social_event_attendance"]
    frame["Small_Friends_Circle"] = (frame["Friends_circle_size"] < 3).astype(int)
    frame["Large_Friends_Circle"] = (frame["Friends_circle_size"] > 10).astype(int)
    frame["Social_Burnout"] = frame["Social_event_attendance"] * frame["Drained_after_socializing"]
    frame["Posts_per_Friend"] = frame["Post_frequency"] / (frame["Friends_circle_size"] + 1)
    frame["Post_vs_Social"] = frame["Post_frequency"] / (frame["Social_event_attendance"] + 1)
    return frame


with mlflow.start_run(run_name=f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}"):
    df = pd.read_csv("train.csv")

    #--- Data transformations ---
    person_replace = {"Extrovert": 1,
                      "Introvert": 0}
    enother_replace = {"Yes": 1,
                       "No": 0}

    del df["id"]
    _encode_binary(df, "Personality", person_replace)
    _encode_binary(df, "Stage_fear", enother_replace)
    _encode_binary(df, "Drained_after_socializing", enother_replace)

    columns_to_fill = ["Time_spent_Alone",
                       "Stage_fear",
                       "Social_event_attendance",
                       "Going_outside",
                       "Friends_circle_size",
                       "Post_frequency"]
    # Imputation statistics come from the training data only and are reused
    # for the test set below, so both sets are transformed identically.
    fill_values = df[columns_to_fill].mean()
    df[columns_to_fill] = df[columns_to_fill].fillna(fill_values)
    _add_engineered_features(df)

    #--- Parameter initialisation and logging ---
    n_splits = 5
    model_params = {
        "verbose": 0,
        "allow_writing_files": False,
        "random_state": 42
    }

    # Log the cross-validation setup that is actually used (there is no
    # hold-out split in this cell, so no "test_size").
    mlflow.log_params({
        "n_splits": n_splits,
        "random_state": 42,
        **model_params
    })

    #--- Model training ---
    X = df.drop("Personality", axis=1)
    y = df["Personality"]

    from sklearn.model_selection import StratifiedKFold

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    y_val_preds = []
    y_val_probs = []
    y_true = []
    models = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # A fresh classifier per fold: re-fitting one shared instance would
        # leave `models` holding the same (last-fold) object several times,
        # turning the test-time "ensemble" into a single model.
        model = CatBoostClassifier(**model_params)
        # NOTE(review): the validation fold is also passed as eval_set, so it
        # may influence which iteration CatBoost keeps; the out-of-fold
        # metrics below can therefore be slightly optimistic.
        model.fit(X_train, y_train, eval_set=(X_val, y_val))

        y_pred = model.predict(X_val)
        y_prob = model.predict_proba(X_val)[:, 1]

        # Collect out-of-fold predictions to score the whole training set.
        y_val_preds.extend(y_pred)
        y_val_probs.extend(y_prob)
        y_true.extend(y_val)

        models.append(model)

    report = classification_report(y_true, y_val_preds, output_dict=True)
    roc_auc = roc_auc_score(y_true, y_val_probs)

    #--- Metric logging ---
    mlflow.log_metrics({
        "roc_auc": roc_auc,
        "precision_0": report["0"]["precision"],
        "recall_0": report["0"]["recall"],
        "f1_0": report["0"]["f1-score"],
        "precision_1": report["1"]["precision"],
        "recall_1": report["1"]["recall"],
        "f1_1": report["1"]["f1-score"],
        "accuracy": report["accuracy"]
    })

    # Logs the model of the last fold under the artifact path "model".
    mlflow.catboost.log_model(model, "model")
    print(report)
    print(f"ROC AUC: {roc_auc}. MIN: {0.96495}")

    #--- Test ---
    # NOTE(review): final_model is trained on all data but is not used for
    # the submission below (the fold models are); kept in case later cells
    # rely on it.
    final_model = CatBoostClassifier(**model_params)
    final_model.fit(X, y)

    df_test = pd.read_csv("test.csv")

    _encode_binary(df_test, "Stage_fear", enother_replace)
    _encode_binary(df_test, "Drained_after_socializing", enother_replace)
    # Impute with the training means rather than statistics of the test set.
    df_test[columns_to_fill] = df_test[columns_to_fill].fillna(fill_values)
    _add_engineered_features(df_test)

    # Select exactly the training features in training order; this also
    # leaves out the "id" column that was deleted from the training frame.
    X_test = df_test[X.columns]

    # Average the positive-class probability over all fold models.
    test_probs = np.zeros(X_test.shape[0])
    for fold_model in models:
        test_probs += fold_model.predict_proba(X_test)[:, 1]

    test_probs /= len(models)
    test_pred = (test_probs >= 0.5).astype(int)

    # Map the numeric predictions back to the original text labels.
    person_replace_rev = {v: k for k, v in person_replace.items()}
    test_pred_txt = np.array([person_replace_rev[value] for value in test_pred])

    # Assumes sample_submission.csv lists rows in the same order as test.csv
    # — TODO confirm.
    df_semp = pd.read_csv("sample_submission.csv")
    df_semp["Personality"] = test_pred_txt
    df_semp.to_csv("Base+futire_submission_V3.csv", index=False)

    mlflow.log_artifact("Base+futire_submission_V3.csv")

  df["Personality"] = df["Personality"].replace(person_replace)
  df["Stage_fear"] = df["Stage_fear"].replace(enother_replace)
  df["Drained_after_socializing"] = df["Drained_after_socializing"].replace(enother_replace)


{'0': {'precision': 0.945713686858101, 'recall': 0.9351295336787565, 'f1-score': 0.9403918299291372, 'support': 4825.0}, '1': {'precision': 0.9772413291645459, 'recall': 0.9810935104752172, 'f1-score': 0.9791636310651318, 'support': 13699.0}, 'accuracy': 0.9691211401425178, 'macro avg': {'precision': 0.9614775080113234, 'recall': 0.9581115220769869, 'f1-score': 0.9597777304971344, 'support': 18524.0}, 'weighted avg': {'precision': 0.9690292327421427, 'recall': 0.9691211401425178, 'f1-score': 0.9690646275841789, 'support': 18524.0}}
ROC AUC: 0.9697864107928154. MIN: 0.96495
🏃 View run run_20250718_200057 at: http://localhost:5000/#/experiments/751304132778755520/runs/c8fba37c85fd4191bf7cc85119cf569d
🧪 View experiment at: http://localhost:5000/#/experiments/751304132778755520


  df_test["Stage_fear"] = df_test["Stage_fear"].replace(enother_replace)
  df_test["Drained_after_socializing"] = df_test["Drained_after_socializing"].replace(enother_replace)
