In [5]:
import numpy as np 
import pandas as pd 
import warnings

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.filterwarnings("ignore",  category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning,)
warnings.filterwarnings("ignore", category=UserWarning)


/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [6]:
# Read the training and test tables from the two CSV files.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e7/train.csv',
        '/kaggle/input/playground-series-s5e7/test.csv',
    )
)

In [7]:
# Columns that do not get a missing-value indicator.
excluded_cols = ['id', 'Personality']
# Keep a reference to the column index as it is now; the indicator columns
# added below end up in a new index and are therefore not iterated over.
all_columns = train_df.columns

# Process the remaining columns: add one `<col>_MISS` flag per column to both
# frames. The flag is 1 where the value is missing and 0 where it is present,
# so that the column name matches its content (the previous version used
# `notna()`, which produced the opposite polarity).
for col in all_columns:
    if col in excluded_cols:
        continue
    train_df[col + '_MISS'] = train_df[col].isna().astype(int)
    test_df[col + '_MISS'] = test_df[col].isna().astype(int)

In [8]:
# Where one of the two columns is missing and the other is present, copy the
# present value across. Stage_fear is filled first, so the second step reads
# the already-updated Stage_fear column.
drained = train_df['Drained_after_socializing']
stage_fear = train_df['Stage_fear']
keep_stage_fear = stage_fear.notna() | drained.isna()
train_df['Stage_fear'] = stage_fear.where(keep_stage_fear, drained)

stage_fear = train_df['Stage_fear']
keep_drained = drained.notna() | stage_fear.isna()
train_df['Drained_after_socializing'] = drained.where(keep_drained, stage_fear)

In [9]:
# Where one of the two columns is missing and the other is present, copy the
# present value across. Stage_fear is filled first, so the second step reads
# the already-updated Stage_fear column.
drained = test_df['Drained_after_socializing']
stage_fear = test_df['Stage_fear']
keep_stage_fear = stage_fear.notna() | drained.isna()
test_df['Stage_fear'] = stage_fear.where(keep_stage_fear, drained)

stage_fear = test_df['Stage_fear']
keep_drained = drained.notna() | stage_fear.isna()
test_df['Drained_after_socializing'] = drained.where(keep_drained, stage_fear)

In [10]:
# Display the current column index of the training frame.
train_df.columns

Index(['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality', 'Time_spent_Alone_MISS',
       'Stage_fear_MISS', 'Social_event_attendance_MISS', 'Going_outside_MISS',
       'Drained_after_socializing_MISS', 'Friends_circle_size_MISS',
       'Post_frequency_MISS'],
      dtype='object')

In [11]:
# Feature columns to cast to pandas' nullable integer dtype ('Int64'),
# which can hold missing values alongside integers.
cols_to_convert = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]
for frame in (train_df, test_df):
    frame[cols_to_convert] = frame[cols_to_convert].astype('Int64')




In [12]:
# Categorical columns; put your own categorical columns in this list.
cat_cols = ['Stage_fear', 'Drained_after_socializing']

# Replace remaining gaps with the literal label 'Missing', then store every
# value in these columns as a string.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].fillna('Missing').astype(str)


In [14]:
# Display the current column index of the training frame.
train_df.columns

Index(['id', 'Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality', 'Time_spent_Alone_MISS',
       'Stage_fear_MISS', 'Social_event_attendance_MISS', 'Going_outside_MISS',
       'Drained_after_socializing_MISS', 'Friends_circle_size_MISS',
       'Post_frequency_MISS'],
      dtype='object')


# Add a 'Sum' column to both frames: the row-wise total of the four columns
# below, added left to right. Because plain `+` is used, a row with a missing
# value in any of them gets a missing total.
sum_parts = ['Social_event_attendance', 'Going_outside', 'Friends_circle_size', 'Post_frequency']
for frame in (train_df, test_df):
    running_total = frame[sum_parts[0]]
    for part in sum_parts[1:]:
        running_total = running_total + frame[part]
    frame['Sum'] = running_total


In [15]:
from catboost import CatBoostClassifier

# Columns handed to CatBoost as categorical features: the two string columns
# plus every `<col>_MISS` indicator column.
cat_features = [
    'Stage_fear',
    'Drained_after_socializing',
    'Time_spent_Alone_MISS',
    'Stage_fear_MISS',
    'Social_event_attendance_MISS',
    'Going_outside_MISS',
    'Drained_after_socializing_MISS',
    'Friends_circle_size_MISS',
    'Post_frequency_MISS',
]

# Model settings collected in one place; verbose=100 prints a progress line
# every 100 iterations.
catboost_params = dict(
    cat_features=cat_features,
    iterations=2000,
    learning_rate=0.02,
    depth=6,
    l2_leaf_reg=5,
    eval_metric='Logloss',
    verbose=100,
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)

# Fit on every training column except the 'id' column and the target;
# the target column is 'Personality'.
train_features = train_df.drop(columns=['id', 'Personality'])
train_target = train_df['Personality']
model.fit(train_features, train_target)

# Prediction on the test frame, again without the 'id' column.
test_features = test_df.drop(columns='id')
preds = model.predict(test_features)

# Pair each test id with its predicted label and write the submission file.
submission = pd.DataFrame({'id': test_df['id'], 'Personality': preds})
submission.to_csv("submission.csv", index=False)

print('Sukcesss!!!!')

0:	learn: 0.6605268	total: 70.4ms	remaining: 2m 20s
100:	learn: 0.1310379	total: 1.53s	remaining: 28.8s
200:	learn: 0.1246264	total: 2.99s	remaining: 26.8s
300:	learn: 0.1226706	total: 4.46s	remaining: 25.2s
400:	learn: 0.1213308	total: 5.89s	remaining: 23.5s
500:	learn: 0.1200283	total: 7.35s	remaining: 22s
600:	learn: 0.1185022	total: 8.83s	remaining: 20.6s
700:	learn: 0.1165994	total: 10.4s	remaining: 19.3s
800:	learn: 0.1148847	total: 12s	remaining: 18s
900:	learn: 0.1135292	total: 14.2s	remaining: 17.3s
1000:	learn: 0.1122773	total: 15.8s	remaining: 15.8s
1100:	learn: 0.1111093	total: 17.4s	remaining: 14.2s
1200:	learn: 0.1099688	total: 18.9s	remaining: 12.6s
1300:	learn: 0.1085698	total: 20.6s	remaining: 11s
1400:	learn: 0.1073436	total: 22.2s	remaining: 9.47s
1500:	learn: 0.1060452	total: 23.7s	remaining: 7.89s
1600:	learn: 0.1051339	total: 25.3s	remaining: 6.29s
1700:	learn: 0.1040707	total: 26.8s	remaining: 4.72s
1800:	learn: 0.1030705	total: 28.4s	remaining: 3.14s
1900:	learn