In [122]:
import numpy as np 
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.filterwarnings("ignore",  category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning,)
warnings.filterwarnings("ignore", category=UserWarning)

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [127]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')

### Zmiana typu kolumn numerycznych na INT64

In [128]:
cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
train_df[cols_to_convert] = train_df[cols_to_convert].astype('Int64')

In [None]:
train_df['Drained_after_socializing' + '_MISS'] = train_df['Drained_after_socializing'].notna().astype(int)

In [None]:
# Uzupełnienie danych Yes - Yes // No- No
train_df['Stage_fear'] = train_df['Stage_fear'].mask(train_df['Stage_fear'].isna() & train_df['Drained_after_socializing']
                                            .notna(), train_df['Drained_after_socializing'])
train_df['Drained_after_socializing'] = train_df['Drained_after_socializing'].mask(train_df['Drained_after_socializing']
                                            .isna() & train_df['Stage_fear'].notna(), train_df['Stage_fear'])

In [None]:
cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
train_df[cat_cols]=train_df[cat_cols].fillna('Missing').astype(str)

In [None]:
train_df['Personality']=train_df['Personality'].map({'Introvert':0,'Extrovert':1})

### Alternatywne Markery do wartości granicznych

def three(x):
    if x==3:
        return 1
    else:
        return 0

def four(x):
    if x==4:
        return 1
    else:
        return 0

def five(x):
    if x==5:
        return 1
    else:
        return 0

train_df['Time_spent_Alone_MARK'] = train_df['Time_spent_Alone'].apply(four)
train_df['Social_event_attendance_MARK'] = train_df['Social_event_attendance'].apply(three)
train_df['Going_outside_MARK'] = train_df['Going_outside'].apply(three)
train_df['Friends_circle_size_MARK'] = train_df['Friends_circle_size'].apply(five)
train_df['Post_frequency_MARK'] = train_df['Post_frequency'].apply(three)

cat_features = ['Drained_after_socializing', 'Outside_mult_Friends', 'Time_Alone_dev_Outside','Drained_after_socializing_MISS','Stage_fear',
               'Time_spent_Alone_MARK', 'Social_event_attendance_MARK', 'Going_outside_MARK', 'Friends_circle_size_MARK', 'Post_frequency_MARK']

## dwie dobre kolumny do właczenia

In [None]:
train_df['Outside_mult_Friends'] = train_df['Going_outside'] * train_df['Friends_circle_size']
def Outside_mult_Friends (x):
    try:
        x=float(x)
        if x <= 11:
            return 0
        elif x > 11 and x <= 15:
            return 1
        elif x > 15 and x < 400:
            return 2
        else:
            return 2
    except ValueError:
        return 2

train_df['Outside_mult_Friends']=train_df['Outside_mult_Friends'].apply(Outside_mult_Friends).astype('Int64')

In [None]:
train_df['Time_Alone_dev_Outside'] = train_df['Time_spent_Alone'] / train_df['Going_outside']
train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].round(2).astype(float)
def Time_Alone_dev_Outside (x):
    try:
        x=float(x)
        if x <= 1:
            return 0
        elif x > 1 and x < 2:
            return 1
        elif x >= 2 and x < 100:
            return 2
        else:
            return 3
    except ValueError:
        return 3

train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].apply(Time_Alone_dev_Outside).astype('Int64')

## Dodajemy kolumny z odejmowaniem

train_df['Fri_sub_Post']=train_df['Friends_circle_size'] - train_df['Post_frequency']

train_df['Goi_sub_Post']=train_df['Going_outside'] - train_df['Post_frequency']

train_df['Goi_sub_Fri']=train_df['Going_outside'] - train_df['Friends_circle_size']

train_df['Soc_sub_Post']=train_df['Social_event_attendance'] - train_df['Post_frequency']

train_df['Soc_sub_Fri']=train_df['Social_event_attendance'] - train_df['Friends_circle_size']

train_df['Soc_sub_Goi']=train_df['Social_event_attendance'] - train_df['Going_outside']

train_df['Tim_sub_Post']=train_df['Time_spent_Alone'] - train_df['Post_frequency']

train_df['Tim_sub_Fri']=train_df['Time_spent_Alone'] - train_df['Friends_circle_size']

train_df['Tim_sub_Goi']=train_df['Time_spent_Alone'] - train_df['Going_outside']

train_df['Tim_sub_Soc']=train_df['Time_spent_Alone'] - train_df['Social_event_attendance']

## Dodatkowe kolumny z dodawaniemm

train_df['Fri_Sum_Post']=train_df['Friends_circle_size'] + train_df['Post_frequency']

train_df['Goi_Sum_Fri']=train_df['Going_outside'] + train_df['Friends_circle_size']

train_df['Goi_Sum_Post']=train_df['Going_outside'] + train_df['Post_frequency']

train_df['Soc_Sum_Post']=train_df['Social_event_attendance'] + train_df['Post_frequency']

train_df['Soc_Sum_Goi']=train_df['Social_event_attendance'] + train_df['Going_outside']

## CATBOOST

In [None]:
X=train_df.drop([ 'id','Personality','Stage_fear'], axis=1).copy()
y=train_df['Personality'].copy()

In [None]:
cat_features = ['Drained_after_socializing',  'Outside_mult_Friends', 'Time_Alone_dev_Outside','Drained_after_socializing_MISS']

#'Drained_after_socializing', 'Outside_mult_Friends', 'Time_Alone_dev_Outside', 
# 1. Inicjalizacja modelu (bez uczenia, bez pipeline'u)Time_Alone_dev_Outside
model = CatBoostClassifier(random_seed=42,verbose=100, cat_features=cat_features)  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'learning_rate': [  0.04],#, 0.05, 0.1],
    'depth': [ 6],#, 8],
    #'l2_leaf_reg': [4,5,3,6],#, 3],
    'iterations': [  250], #opt 200
    #'auto_class_weights': ['SqrtBalanced'] #'Balanced', 'None', 
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_CatBoost = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
eval_pool = Pool(X, y, cat_features=cat_features)
importances = best_model_CatBoost.get_feature_importance(eval_pool)

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

In [None]:

# Wyciągnięcie ważności cech
#importances = best_model.get_feature_importance(Pool(train_df.drop(['id','Personality'], axis=1), train_df['Personality']))
feature_names = X.columns

# DataFrame z ważnościami
fi_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Wyświetlenie najważniejszych cech
plt.figure(figsize=(10, 6))
fi_df.head(30).plot(kind='barh', x='Feature', y='Importance', legend=False)
plt.gca().invert_yaxis()
plt.title("Top 20 najważniejszych cech")
plt.tight_layout()
plt.show()

# __________________________________ NOWY MODEL XGBOOST_______________________________________

In [123]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')

cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
train_df[cols_to_convert] = train_df[cols_to_convert].astype('Int64')

train_df['Drained_after_socializing' + '_MISS'] = train_df['Drained_after_socializing'].notna().astype(int)

# Uzupełnienie danych Yes - Yes // No- No
train_df['Stage_fear'] = train_df['Stage_fear'].mask(train_df['Stage_fear'].isna() & train_df['Drained_after_socializing']
                                            .notna(), train_df['Drained_after_socializing'])
train_df['Drained_after_socializing'] = train_df['Drained_after_socializing'].mask(train_df['Drained_after_socializing']
                                            .isna() & train_df['Stage_fear'].notna(), train_df['Stage_fear'])

cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
train_df['Drained_after_socializing']=train_df['Drained_after_socializing'].map({'Yes':2,'No':1})
train_df['Drained_after_socializing']=train_df['Drained_after_socializing'].fillna(0).astype('int')
train_df = train_df.drop('Stage_fear', axis=1)
# Kodowanie labelek
train_df['Personality']=train_df['Personality'].map({'Introvert':0,'Extrovert':1})

In [124]:
num_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']   
num_imputer = IterativeImputer(estimator=LGBMRegressor(n_estimators=500, learning_rate=0.03, max_depth=6, subsample=0.8, colsample_bytree=0.8, verbosity=-1),
                               max_iter=10, random_state=42)
train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])
columns = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']
train_df[columns]=train_df[columns].round().astype(int)

In [125]:
X = train_df.drop('Personality', axis=1).copy()
y = train_df['Personality'].copy()

In [126]:
model = XGBClassifier()  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'n_estimators': [200],#[100, 200, 300],
    'learning_rate': [0.02],#[0.01, 0.05, 0.1],
    'max_depth': [ 4, 3 ],#[3, 5, 7, 10],
    'subsample': [1.0],#, 1.0],
    'colsample_bytree': [ 0.6],#, 0.9, 1.0],
    'gamma': [0],
    'reg_alpha': [0],
    'reg_lambda': [0.5]
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_XGBC = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
#eval_pool = Pool(X, y)
#importances = best_model.get_feature_importance(eval_pool)

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

Najlepsze parametry: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 1.0}
Najlepszy wynik: 0.9705427026430223


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,std_test_score,params
0,2,0.970121,0.976935,0.004508,"{'colsample_bytree': 0.6, 'gamma': 0, 'learnin..."
1,1,0.970543,0.974006,0.004333,"{'colsample_bytree': 0.6, 'gamma': 0, 'learnin..."


In [None]:
feature_names = X.columns

# DataFrame z ważnościami
fi_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Wyświetlenie najważniejszych cech
plt.figure(figsize=(10, 6))
fi_df.head(30).plot(kind='barh', x='Feature', y='Importance', legend=False)
plt.gca().invert_yaxis()
plt.title("Top 20 najważniejszych cech")
plt.tight_layout()
plt.show()

# __________________________________ LIGHT XBGOOST ____________

In [None]:
model = LGBMClassifier(verbose=-1)  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'n_estimators': [375],
    'learning_rate': [0.01],
    'max_depth': [ 3],  # -1 to bez limitu
    'num_leaves': [15],  # powiązane z max_depth
    'min_child_samples': [ 9 ],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'reg_alpha': [0],
    'reg_lambda': [0.5]
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_LightGBMC = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
#eval_pool = Pool(X, y)
#importances = best_model.get_feature_importance(eval_pool)

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

# __________________________________ ENESEMBLE______________________________

# __________________________________ NOWY MODEL CATBOOST TYLKO KATEGORIE _______________________________

In [None]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')

cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
train_df[cols_to_convert] = train_df[cols_to_convert].astype('Int64')

train_df['Drained_after_socializing' + '_MISS'] = train_df['Drained_after_socializing'].notna().astype(int)

# Uzupełnienie danych Yes - Yes // No- No
train_df['Stage_fear'] = train_df['Stage_fear'].mask(train_df['Stage_fear'].isna() & train_df['Drained_after_socializing']
                                            .notna(), train_df['Drained_after_socializing'])
train_df['Drained_after_socializing'] = train_df['Drained_after_socializing'].mask(train_df['Drained_after_socializing']
                                            .isna() & train_df['Stage_fear'].notna(), train_df['Stage_fear'])

cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
train_df[cat_cols]=train_df[cat_cols].fillna('Missing').astype(str)
train_df = train_df.drop('Stage_fear', axis=1)

In [None]:
excluded_cols = ['id', 'Personality', 'Drained_after_socializing_MISS']
all_columns = train_df.columns
# Przetwarzanie pozostałych kolumn
for col in all_columns:
    if col not in excluded_cols:
        train_df[col + '_MISS'] = train_df[col].notna().astype(int)

columns = ['Time_spent_Alone_MISS', 'Social_event_attendance_MISS', 'Going_outside_MISS','Drained_after_socializing_MISS', 
           'Friends_circle_size_MISS','Post_frequency_MISS']
train_df['not_MISS_total'] = train_df[columns].sum(axis=1)

# dwie dobre kolumny do właczenia

In [None]:
train_df['Outside_mult_Friends'] = train_df['Going_outside'] * train_df['Friends_circle_size']
def Outside_mult_Friends (x):
    try:
        x=float(x)
        if x <= 11:
            return 0
        elif x > 11 and x <= 15:
            return 1
        elif x > 15 and x < 400:
            return 2
        else:
            return 2
    except ValueError:
        return 2

train_df['Outside_mult_Friends']=train_df['Outside_mult_Friends'].apply(Outside_mult_Friends).astype('Int64')

In [None]:
train_df['Time_Alone_dev_Outside'] = train_df['Time_spent_Alone'] / train_df['Going_outside']
train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].round(2).astype(float)
def Time_Alone_dev_Outside (x):
    try:
        x=float(x)
        if x <= 1:
            return 0
        elif x > 1 and x < 2:
            return 1
        elif x >= 2 and x < 100:
            return 2
        else:
            return 3
    except ValueError:
        return 3

train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].apply(Time_Alone_dev_Outside).astype('Int64')

## Odejmowanie kolumn

In [None]:
train_df['Fri_sub_Post']=train_df['Friends_circle_size'] - train_df['Post_frequency']

In [None]:
train_df['Goi_sub_Post']=train_df['Going_outside'] - train_df['Post_frequency']

In [None]:
train_df['Goi_sub_Fri']=train_df['Going_outside'] - train_df['Friends_circle_size']

In [None]:
train_df['Soc_sub_Post']=train_df['Social_event_attendance'] - train_df['Post_frequency']

In [None]:
train_df['Soc_sub_Fri']=train_df['Social_event_attendance'] - train_df['Friends_circle_size']

In [None]:
train_df['Soc_sub_Goi']=train_df['Social_event_attendance'] - train_df['Going_outside']
def soc_goi (x):
    if x <= -4:
        return 0
    elif x >= -3 and x <= 3:
        return 1
    elif x > 3 and x<= 7:
        return 2
    elif x > 8:
        return 3
    else:
        return 4
train_df['Soc_sub_Goi']=train_df['Soc_sub_Goi'].apply(soc_goi).astype('Int64')

In [None]:
train_df['Tim_sub_Post']=train_df['Time_spent_Alone'] - train_df['Post_frequency']

In [None]:
train_df['Tim_sub_Fri']=train_df['Time_spent_Alone'] - train_df['Friends_circle_size']

In [None]:
train_df['Tim_sub_Goi']=train_df['Time_spent_Alone'] - train_df['Going_outside']

In [None]:
train_df['Tim_sub_Soc']=train_df['Time_spent_Alone'] - train_df['Social_event_attendance']
def tim_soc (x):
    if x <= 0:
        return 0
    elif x > 0:
        return 1
    else:
        return 4
train_df['Tim_sub_Soc']=train_df['Tim_sub_Soc'].apply(tim_soc).astype('Int64')

In [None]:
train_df.groupby(['Tim_sub_Soc','Personality']).size().unstack().plot(kind='bar', stacked=True, title='Summary')
pd.crosstab(train_df['Tim_sub_Soc'],train_df['Personality'], normalize='index')*100

##  Dodawanie kolumn

In [None]:
train_df['Fri_Sum_Post']=train_df['Friends_circle_size'] + train_df['Post_frequency']

In [None]:
train_df['Goi_Sum_Fri']=train_df['Going_outside'] + train_df['Friends_circle_size']

In [None]:
train_df['Goi_Sum_Post']=train_df['Going_outside'] + train_df['Post_frequency']

train_df['Soc_Sum_Post']=train_df['Social_event_attendance'] + train_df['Post_frequency']
def soc_post (x):
    if x <= 5:
        return 0
    elif x == 6:
        return 1
    elif x >= 7 and x <= 18:
        return 2
    elif x == 19:
        return 2
    elif x > 19:
        return 2
    else:
        return 6
train_df['Soc_Sum_Post']=train_df['Soc_Sum_Post'].apply(soc_post).astype('Int64')

In [None]:
train_df['Soc_Sum_Goi']=train_df['Social_event_attendance'] + train_df['Going_outside']

## Konwertujemy wszystkie kol do kategorii

In [None]:
def time_fun(x):
    if x <= 3:
        return 0
    elif x == 4:
        return 1
    elif x >= 5:
        return 2
    else:
        return 3
train_df['Time_spent_Alone']=train_df['Time_spent_Alone'].apply(time_fun).astype('Int64')

def social_fun(x):
    if x <= 2:
        return 0
    elif x == 3:
        return 1
    elif x >= 4 and x<= 9:
        return 2
    elif x >= 10:
        return 3
    else:
        return 4
train_df['Social_event_attendance']=train_df['Social_event_attendance'].apply(social_fun).astype('Int64')

def out_fun(x):
    if x <= 2:
        return 0
    elif x == 3:
        return 1
    elif x >= 4:
        return 2
    else:
        return 3
train_df['Going_outside']=train_df['Going_outside'].apply(out_fun).astype('Int64')

def friend_fun(x):
    if x <= 3:
        return 0
    elif x==4 or x==5:
        return 1
    elif x >= 6:
        return 2
    else:
        return 3
train_df['Friends_circle_size']=train_df['Friends_circle_size'].apply(friend_fun).astype('Int64')

def post_fun(x):
    if x <= 2:
        return 0
    elif x == 3:
        return 1
    elif x >= 4 and x<= 9:
        return 2
    elif x >= 10:
        return 3
    else:
        return 4
train_df['Post_frequency']=train_df['Post_frequency'].apply(post_fun).astype('Int64')

In [None]:
train_df.groupby(['Post_frequency','Personality']).size().unstack().plot(kind='bar', stacked=True, title='col')

## CATBOOST

In [None]:
train_df.columns

In [None]:
X=train_df.drop([ 'id','Personality'], axis=1).copy()
y=train_df['Personality'].copy()

In [None]:
cat_features = ['Drained_after_socializing','Time_spent_Alone','Social_event_attendance','Going_outside','Friends_circle_size', 'Post_frequency',
                'not_MISS_total','Time_spent_Alone_MISS', 'Social_event_attendance_MISS', 'Going_outside_MISS','Drained_after_socializing_MISS', 
               'Friends_circle_size_MISS','Post_frequency_MISS', 'Outside_mult_Friends', 'Time_Alone_dev_Outside' ] #, 'Outside_mult_Friends', 'Time_Alone_dev_Outside']
#'Soc_sub_Goi', 'Tim_sub_Soc',
#'Drained_after_socializing', 'Outside_mult_Friends', 'Time_Alone_dev_Outside', 
# 1. Inicjalizacja modelu (bez uczenia, bez pipeline'u)Time_Alone_dev_Outside
model = CatBoostClassifier(random_seed=42,verbose=100, cat_features=cat_features)  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'learning_rate': [ 0.04],#, 0.05, 0.1],
    'depth': [6],#, 8],
    #'l2_leaf_reg': [4,5,3,6],#, 3],
    'iterations': [  250], #opt 200
    #'auto_class_weights': ['SqrtBalanced', 'Balanced' ] #'Balanced', 'None', 
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
eval_pool = Pool(X, y, cat_features=cat_features)
importances = best_model.get_feature_importance(eval_pool)

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

In [None]:
# Wyciągnięcie ważności cech
#importances = best_model.get_feature_importance(Pool(train_df.drop(['id','Personality'], axis=1), train_df['Personality']))
feature_names = X.columns

# DataFrame z ważnościami
fi_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Wyświetlenie najważniejszych cech
plt.figure(figsize=(10, 6))
fi_df.head(30).plot(kind='barh', x='Feature', y='Importance', legend=False)
plt.gca().invert_yaxis()
plt.title("Top 20 najważniejszych cech")
plt.tight_layout()
plt.show()

# OPtyna experiments

In [None]:
import optuna
from catboost import CatBoostClassifier

def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "iterations": 500,
        "verbose": 0,
        "loss_function": "Logloss"
    }

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_valid)[:,1]
    return roc_auc_score(y_valid, preds)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

best_params = study.best_params