In [28]:
import numpy as np 
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.filterwarnings("ignore",  category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning,)
warnings.filterwarnings("ignore", category=UserWarning)

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [29]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

In [30]:
cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
train_df[cols_to_convert] = train_df[cols_to_convert].astype('Int64')
test_df[cols_to_convert] = test_df[cols_to_convert].astype('Int64')

In [31]:
train_df['Drained_after_socializing' + '_MISS'] = train_df['Drained_after_socializing'].notna().astype(int)
test_df['Drained_after_socializing' + '_MISS'] = test_df['Drained_after_socializing'].notna().astype(int)

In [32]:
# Uzupełnienie danych Yes - Yes // No- No
train_df['Stage_fear'] = train_df['Stage_fear'].mask(train_df['Stage_fear'].isna() & train_df['Drained_after_socializing']
                                            .notna(), train_df['Drained_after_socializing'])
train_df['Drained_after_socializing'] = train_df['Drained_after_socializing'].mask(train_df['Drained_after_socializing']
                                            .isna() & train_df['Stage_fear'].notna(), train_df['Stage_fear'])


test_df['Stage_fear'] = test_df['Stage_fear'].mask(test_df['Stage_fear'].isna() & test_df['Drained_after_socializing']
                                            .notna(), test_df['Drained_after_socializing'])
test_df['Drained_after_socializing'] = test_df['Drained_after_socializing'].mask(test_df['Drained_after_socializing']
                                            .isna() & test_df['Stage_fear'].notna(), test_df['Stage_fear'])

In [33]:
cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
train_df[cat_cols]=train_df[cat_cols].fillna('Missing').astype(str)
test_df[cat_cols]=test_df[cat_cols].fillna('Missing').astype(str)

In [34]:
train_df['Personality']=train_df['Personality'].map({'Introvert':0,'Extrovert':1})

In [35]:
train_df['Outside_mult_Friends'] = train_df['Going_outside'] * train_df['Friends_circle_size']
test_df['Outside_mult_Friends'] = test_df['Going_outside'] * test_df['Friends_circle_size']
def Outside_mult_Friends (x):
    try:
        x=float(x)
        if x <= 11:
            return 0
        elif x > 11 and x <= 15:
            return 1
        elif x > 15 and x < 400:
            return 2
        else:
            return 2
    except ValueError:
        return 2

train_df['Outside_mult_Friends']=train_df['Outside_mult_Friends'].apply(Outside_mult_Friends).astype('Int64')
test_df['Outside_mult_Friends']=test_df['Outside_mult_Friends'].apply(Outside_mult_Friends).astype('Int64')

In [36]:
train_df['Time_Alone_dev_Outside'] = train_df['Time_spent_Alone'] / train_df['Going_outside']
train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].round(2).astype(float)

test_df['Time_Alone_dev_Outside'] = test_df['Time_spent_Alone'] / train_df['Going_outside']
test_df['Time_Alone_dev_Outside']=test_df['Time_Alone_dev_Outside'].round(2).astype(float)
def Time_Alone_dev_Outside (x):
    try:
        x=float(x)
        if x <= 1:
            return 0
        elif x > 1 and x < 2:
            return 1
        elif x >= 2 and x < 100:
            return 2
        else:
            return 3
    except ValueError:
        return 3

train_df['Time_Alone_dev_Outside']=train_df['Time_Alone_dev_Outside'].apply(Time_Alone_dev_Outside).astype('Int64')

test_df['Time_Alone_dev_Outside']=test_df['Time_Alone_dev_Outside'].apply(Time_Alone_dev_Outside).astype('Int64')

In [37]:
X=train_df.drop([ 'id','Personality','Stage_fear'], axis=1).copy()
y=train_df['Personality'].copy()

X_test = test_df.drop([ 'id','Stage_fear'], axis=1).copy()


In [38]:
cat_features = ['Drained_after_socializing',  'Outside_mult_Friends', 'Time_Alone_dev_Outside','Drained_after_socializing_MISS']

#'Drained_after_socializing', 'Outside_mult_Friends', 'Time_Alone_dev_Outside', 
# 1. Inicjalizacja modelu (bez uczenia, bez pipeline'u)Time_Alone_dev_Outside
model = CatBoostClassifier(random_seed=42,verbose=100, cat_features=cat_features)  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'learning_rate': [  0.04],#, 0.05, 0.1],
    'depth': [ 6],#, 8],
    #'l2_leaf_reg': [4,5,3,6],#, 3],
    'iterations': [  250], #opt 200
    #'auto_class_weights': ['SqrtBalanced'] #'Balanced', 'None', 
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_CatBoost = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
eval_pool = Pool(X, y, cat_features=cat_features)
importances = best_model_CatBoost.get_feature_importance(eval_pool)

y_proba_cat = best_model_CatBoost.predict_proba(X)[:, 1]
y_proba_cat_test = best_model_CatBoost.predict_proba(X_test)[:, 1]

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

0:	learn: 0.6293924	total: 18.4ms	remaining: 4.59s
100:	learn: 0.1248784	total: 1.46s	remaining: 2.16s
200:	learn: 0.1214170	total: 2.83s	remaining: 691ms
249:	learn: 0.1200600	total: 3.53s	remaining: 0us
Najlepsze parametry: {'depth': 6, 'iterations': 250, 'learning_rate': 0.04}
Najlepszy wynik: 0.969893113835637


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,std_test_score,params
0,1,0.969893,0.977584,0.002933,"{'depth': 6, 'iterations': 250, 'learning_rate..."


# __________________________________ NOWY MODEL XGBOOST_______________________________________

In [39]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')

cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
train_df[cols_to_convert] = train_df[cols_to_convert].astype('Int64')

train_df['Drained_after_socializing' + '_MISS'] = train_df['Drained_after_socializing'].notna().astype(int)

# Uzupełnienie danych Yes - Yes // No- No
train_df['Stage_fear'] = train_df['Stage_fear'].mask(train_df['Stage_fear'].isna() & train_df['Drained_after_socializing']
                                            .notna(), train_df['Drained_after_socializing'])
train_df['Drained_after_socializing'] = train_df['Drained_after_socializing'].mask(train_df['Drained_after_socializing']
                                            .isna() & train_df['Stage_fear'].notna(), train_df['Stage_fear'])

cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
train_df['Drained_after_socializing']=train_df['Drained_after_socializing'].map({'Yes':2,'No':1})
train_df['Drained_after_socializing']=train_df['Drained_after_socializing'].fillna(0).astype('int')
train_df = train_df.drop('Stage_fear', axis=1)
# Kodowanie labelek
train_df['Personality']=train_df['Personality'].map({'Introvert':0,'Extrovert':1})

In [40]:
test_df = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')

cols_to_convert = ['Time_spent_Alone', 'Social_event_attendance','Going_outside',  'Friends_circle_size','Post_frequency']
test_df[cols_to_convert] = test_df[cols_to_convert].astype('Int64')

test_df['Drained_after_socializing' + '_MISS'] = test_df['Drained_after_socializing'].notna().astype(int)

# Uzupełnienie danych Yes - Yes // No- No
test_df['Stage_fear'] = test_df['Stage_fear'].mask(test_df['Stage_fear'].isna() & test_df['Drained_after_socializing']
                                            .notna(), test_df['Drained_after_socializing'])
test_df['Drained_after_socializing'] = test_df['Drained_after_socializing'].mask(test_df['Drained_after_socializing']
                                            .isna() & test_df['Stage_fear'].notna(), test_df['Stage_fear'])

cat_cols = ['Stage_fear','Drained_after_socializing']  # <-- tu wpisz swoje kolumny kategoryczne
test_df['Drained_after_socializing']=test_df['Drained_after_socializing'].map({'Yes':2,'No':1})
test_df['Drained_after_socializing']=test_df['Drained_after_socializing'].fillna(0).astype('int')
test_df = test_df.drop('Stage_fear', axis=1)



In [41]:
num_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']   
num_imputer = IterativeImputer(estimator=LGBMRegressor(n_estimators=500, learning_rate=0.03, max_depth=6, subsample=0.8, colsample_bytree=0.8, verbosity=-1),
                               max_iter=10, random_state=42)
train_df[num_cols] = num_imputer.fit_transform(train_df[num_cols])
columns = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']
train_df[columns]=train_df[columns].round().astype(int)

In [42]:
num_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']   

test_df[num_cols] = num_imputer.transform(test_df[num_cols])
columns = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside', 'Friends_circle_size','Post_frequency']
test_df[columns]=test_df[columns].round().astype(int)

In [43]:
X = train_df.drop(['id','Personality'], axis=1).copy()
y = train_df['Personality'].copy()

X_test = test_df.drop('id', axis=1).copy()

In [44]:
model = XGBClassifier()  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'n_estimators': [200],#[100, 200, 300],
    'learning_rate': [0.02],#[0.01, 0.05, 0.1],
    'max_depth': [  3 ],#[3, 5, 7, 10],
    'subsample': [1.0],#, 1.0],
    'colsample_bytree': [ 0.6],#, 0.9, 1.0],
    'gamma': [0],
    'reg_alpha': [0],
    'reg_lambda': [0.5]
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_XGBC = grid_search.best_estimator_

y_proba_xgb = best_model_XGBC.predict_proba(X)[:, 1]
y_proba_xgb_test = best_model_XGBC.predict_proba(X_test)[:, 1]
# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
#eval_pool = Pool(X, y)
#importances = best_model.get_feature_importance(eval_pool)

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

Najlepsze parametry: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.02, 'max_depth': 3, 'n_estimators': 200, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 1.0}
Najlepszy wynik: 0.9703008761479728


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,std_test_score,params
0,1,0.970301,0.973203,0.003877,"{'colsample_bytree': 0.6, 'gamma': 0, 'learnin..."


# __________________________________ LIGHT XBGOOST ____________

In [45]:
model = LGBMClassifier(verbose=-1)  # verbose=0 żeby nie spamował

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 2. Zbiór parametrów do przeszukania
param_grid = {
    'n_estimators': [375],
    'learning_rate': [0.01],
    'max_depth': [ 3],  # -1 to bez limitu
    'num_leaves': [15],  # powiązane z max_depth
    'min_child_samples': [ 9 ],
    'subsample': [0.8],
    'colsample_bytree': [0.7],
    'reg_alpha': [0],
    'reg_lambda': [0.5]
}

# 3. GridSearchCV (tu możesz dodać scoring, cv, n_jobs itd.)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',       # lub 'roc_auc', 'f1', zależnie od problemu
    cv=cv,                      # 5-fold cross-validation
    n_jobs=-1,                 # pełne równoległe obciążenie CPU
    return_train_score=True
)

# 4. Dopasowanie do danych (X to features, y to target)
grid_search.fit(X, y)

# 5. Wyniki
print("Najlepsze parametry:", grid_search.best_params_)
print("Najlepszy wynik:", grid_search.best_score_)

best_model_LightGBMC = grid_search.best_estimator_

# Ponowne fitowanie z cat_features
#best_model.set_params(cat_features=cat_features)
#best_model.fit(train_df.drop(['id','Personality'], axis=1), train_df['Personality'])
#eval_pool = Pool(X, y)
#importances = best_model.get_feature_importance(eval_pool)

y_proba_lgb = best_model_LightGBMC.predict_proba(X)[:, 1]
y_proba_lgb_test = best_model_LightGBMC.predict_proba(X_test)[:, 1]

res_df = pd.DataFrame(grid_search.cv_results_)
res_df[['rank_test_score','mean_test_score','mean_train_score','std_test_score','params']]

Najlepsze parametry: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_samples': 9, 'n_estimators': 375, 'num_leaves': 15, 'reg_alpha': 0, 'reg_lambda': 0.5, 'subsample': 0.8}
Najlepszy wynik: 0.9702742998909762


Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,std_test_score,params
0,1,0.970274,0.972823,0.003967,"{'colsample_bytree': 0.7, 'learning_rate': 0.0..."


# __________________________________ KNN & Logistic regression ______________________________

In [46]:
scaler = StandardScaler()

In [47]:
X = train_df.drop(['id','Personality'], axis=1).copy()
y = train_df['Personality'].copy()

X_test = test_df.drop('id', axis=1).copy()

X = scaler.fit_transform(X)


In [48]:
X_test = scaler.transform(X_test)

In [49]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Parametry do testu
param_grid_lr = {
    'C': [1, ],
    'penalty': ['l2'],  # l1 dostępne tylko przy solver='liblinear'
    'solver': ['lbfgs'],  # stabilny solver dla multiclass
    'max_iter': [ 3000]
}

grid_lr = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid_lr,
    cv=cv,
    scoring='accuracy',  # lub 'roc_auc' dla binary
    n_jobs=-1,
    verbose=1
)

grid_lr.fit(X, y)

# Najlepszy model
best_lr = grid_lr.best_estimator_

# Prawdopodobieństwa (dla danych treningowych X)
y_proba_lr = best_lr.predict_proba(X)[:, 1]
y_proba_lr_test = best_lr.predict_proba(X_test)[:, 1]

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [50]:
res_df = pd.DataFrame(grid_lr.cv_results_)
res_df[['rank_test_score','mean_test_score','std_test_score','params']]

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,params
0,1,0.96761,0.002177,"{'C': 1, 'max_iter': 3000, 'penalty': 'l2', 's..."


In [51]:

# Parametry do testu
param_grid_knn = {
    'n_neighbors': [ 11],
    'weights': ['uniform'],
    'metric': ['euclidean']
}

grid_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid_knn,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_knn.fit(X, y)

# Najlepszy model
best_knn = grid_knn.best_estimator_

# Prawdopodobieństwa

res_df = pd.DataFrame(grid_knn.cv_results_)
res_df[['rank_test_score','mean_test_score','std_test_score','params']]
print("Najlepsze parametry:", grid_knn.best_params_)
print("Najlepszy wynik:", grid_knn.best_score_)

y_proba_knn = best_knn.predict_proba(X)[:, 1]
y_proba_knn_test = best_knn.predict_proba(X_test)[:, 1]

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Najlepsze parametry: {'metric': 'euclidean', 'n_neighbors': 11, 'weights': 'uniform'}
Najlepszy wynik: 0.9688513566687945


# __________________________________________ ENESEMBLE________________________________

In [52]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Stacking: tworzymy nową macierz X dla metamodelu
X_stack = np.column_stack([
    y_proba_lr,
    y_proba_knn,
    y_proba_cat,
    y_proba_xgb,
    y_proba_lgb
])

# Możesz też nazwać kolumny, jeśli chcesz później analizować feature importance

# Meta-model: Logistic Regression
meta_model = LogisticRegression()
meta_model.fit(X_stack, y)

X_stack_test = np.column_stack([
    y_proba_lr_test,
    y_proba_knn_test,
    y_proba_cat_test,
    y_proba_xgb_test,
    y_proba_lgb_test
])

y_proba_meta_test = meta_model.predict_proba(X_stack_test)[:, 1]

# Predykcja końcowa (prawdopodobieństwo klasy 1)
#y_proba_meta = meta_model.predict_proba(X_stack)[:, 1]



# Jeśli chcesz ocenić skuteczność:
#from sklearn.metrics import roc_auc_score
#print("Meta-model ROC AUC:", roc_auc_score(y, y_proba_meta))

In [53]:
# Zamień progiem 0.5
#y_pred_labels = (y_proba_meta >= 0.5).astype(int)

y_pred_test = (y_proba_meta_test >= 0.5).astype(int)


# Jeśli Twoje klasy są np. ['Extrovert', 'Introvert'], to:
label_map = {0: meta_model.classes_[0], 1: meta_model.classes_[1]}
y_pred_final = [label_map[i] for i in y_pred_test]

In [54]:


submission = pd.DataFrame({
    "Id": test_df["id"],        # lub np. range(len(y_pred_final))
    "Personality": y_pred_final
})

submission['Personality']=submission['Personality'].map({0:'Introvert',1:'Extrovert'})
submission.head()

Unnamed: 0,Id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert


In [55]:
submission.to_csv("submission.csv", index=False)

print('Sukcesss!!!')

Sukcesss!!!
