# Notebook for the competition with the highest score on the private leaderboard

### This notebook was created after many experiments and grid searches for the best parameters.

In [2]:
import numpy as np 
import pandas as pd 
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.filterwarnings("ignore",  category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning,)
warnings.filterwarnings("ignore", category=UserWarning)

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


In [3]:
# Load the competition's train and test tables (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/playground-series-s5e7/train.csv',
        '/kaggle/input/playground-series-s5e7/test.csv',
    ),
)

### Train and test data are processed in the same way to ensure the same data structure for fitting and prediction. Numerical columns are converted from float to Int64 so that CatBoost can use them as categorical data

In [4]:
# Numeric survey columns that are cast to pandas' nullable integer dtype so
# that missing entries stay missing (pd.NA) instead of forcing a float column.
cols_to_convert = [
    'Time_spent_Alone',
    'Social_event_attendance',
    'Going_outside',
    'Friends_circle_size',
    'Post_frequency',
]

# Apply the identical cast to both tables.
for frame in (train_df, test_df):
    frame[cols_to_convert] = frame[cols_to_convert].astype('Int64')

### New column Drained_after_socializing_MISS is created as a flag for missing values in column Drained_after_socializing.

In [5]:
# Missing-value indicator for 'Drained_after_socializing': 1 where the value
# is missing, 0 where it is present. The previous version used `.notna()`,
# which produced the opposite encoding (1 = present) and contradicted the
# column name. The flag is still a two-level categorical, so it carries the
# same information for the model.
# It is computed here, before the missing values are filled in later cells.
for frame in (train_df, test_df):
    frame['Drained_after_socializing_MISS'] = (
        frame['Drained_after_socializing'].isna().astype(int)
    )

### Based on EDA observations, a missing value in one of the binary categorical columns is imputed from the other column: Yes if the other column is Yes, and No if the other column is No

In [6]:
# Cross-fill the two binary columns from each other, for both tables.
# Order matters: 'Stage_fear' is completed first, and the already-updated
# 'Stage_fear' is then used to complete 'Drained_after_socializing'.
# Rows where both columns are missing remain missing.
for frame in (train_df, test_df):
    stage_fear = frame['Stage_fear']
    frame['Stage_fear'] = stage_fear.where(
        stage_fear.notna(), frame['Drained_after_socializing']
    )

    drained = frame['Drained_after_socializing']
    frame['Drained_after_socializing'] = drained.where(
        drained.notna(), frame['Stage_fear']
    )

### Columns 'Stage_fear' and 'Drained_after_socializing' still have a small amount of missing data (when both columns are NaN) - these values will be filled with the string 'Missing'

In [7]:
# Remaining gaps in the two binary columns get an explicit 'Missing' label,
# and both columns are stored as plain strings.
cat_cols = ['Stage_fear', 'Drained_after_socializing']
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].fillna('Missing').astype(str)

### Below, two columns are created based on the multiplication and division of two different columns. This feature engineering improved the results obtained by the model on the train data with cross-validation

In [8]:
# Raw interaction feature: element-wise product of the two columns.
# It is bucketed into classes by Outside_mult_Friends() further down.
train_df['Outside_mult_Friends'] = train_df['Going_outside'].mul(
    train_df['Friends_circle_size']
)
def Outside_mult_Friends(x):
    """Bucket a ``Going_outside * Friends_circle_size`` product into a class.

    Parameters
    ----------
    x : number, numeric string or missing value
        The raw product. Anything ``float()`` can parse is accepted.

    Returns
    -------
    int
        0 if ``x <= 11``, 1 if ``11 < x <= 15``, otherwise 2.
        Missing or unparsable input (``None``, ``pd.NA``, ``NaN``,
        non-numeric strings) also maps to 2.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        # float(None) / float(pd.NA) raise TypeError, non-numeric strings raise
        # ValueError. Both are treated as "missing" and share the top bucket.
        return 2

    if value <= 11:
        return 0
    if value <= 15:
        return 1
    # Everything above 15 lands here, as does NaN (all comparisons are False).
    return 2

# Replace the raw train product with its bucket label (nullable integer).
train_df['Outside_mult_Friends'] = (
    train_df['Outside_mult_Friends']
    .apply(Outside_mult_Friends)
    .astype('Int64')
)

# Build the same feature for the test table: product first, then bucketing.
test_df['Outside_mult_Friends'] = test_df['Going_outside'].mul(
    test_df['Friends_circle_size']
)
test_df['Outside_mult_Friends'] = (
    test_df['Outside_mult_Friends']
    .apply(Outside_mult_Friends)
    .astype('Int64')
)

In [9]:
# Raw ratio feature: time spent alone divided by going outside, rounded to
# two decimals and stored as a plain float column.
# It is bucketed into classes by Time_Alone_dev_Outside() further down.
train_df['Time_Alone_dev_Outside'] = train_df['Time_spent_Alone'].div(
    train_df['Going_outside']
)
train_df['Time_Alone_dev_Outside'] = (
    train_df['Time_Alone_dev_Outside'].round(2).astype(float)
)
def Time_Alone_dev_Outside(x):
    """Bucket a ``Time_spent_Alone / Going_outside`` ratio into a class.

    Parameters
    ----------
    x : number, numeric string or missing value
        The raw ratio. Anything ``float()`` can parse is accepted.

    Returns
    -------
    int
        0 if ``x <= 1``, 1 if ``1 < x < 2``, 2 if ``2 <= x < 100``,
        otherwise 3. Missing or unparsable input (``None``, ``pd.NA``,
        ``NaN``, non-numeric strings) and values of 100 or more,
        including ``inf``, also map to 3.
    """
    try:
        ratio = float(x)
    except (TypeError, ValueError):
        # float(None) / float(pd.NA) raise TypeError, non-numeric strings raise
        # ValueError. Both are treated as "missing" and share the last bucket.
        return 3

    if ratio <= 1:
        return 0
    if ratio < 2:
        return 1
    if ratio < 100:
        return 2
    # Values >= 100, +inf and NaN (all comparisons False) end up here.
    return 3

# Replace the raw train ratio with its bucket label (nullable integer).
train_df['Time_Alone_dev_Outside'] = (
    train_df['Time_Alone_dev_Outside']
    .apply(Time_Alone_dev_Outside)
    .astype('Int64')
)

# Build the same feature for the test table: ratio, rounding, then bucketing.
test_df['Time_Alone_dev_Outside'] = test_df['Time_spent_Alone'].div(
    test_df['Going_outside']
)
test_df['Time_Alone_dev_Outside'] = (
    test_df['Time_Alone_dev_Outside'].round(2).astype(float)
)
test_df['Time_Alone_dev_Outside'] = (
    test_df['Time_Alone_dev_Outside']
    .apply(Time_Alone_dev_Outside)
    .astype('Int64')
)

### X and y are assigned without the Stage_fear column - using this column decreased the results achieved by the model

In [10]:
# Training features: everything except the row id, the target and 'Stage_fear'.
X = train_df.drop(columns=['id', 'Personality', 'Stage_fear']).copy()

# Training target.
y = train_df['Personality'].copy()

# NOTE: despite its name, `y_test` holds the test *features* (same columns as
# X); the test table has no target column to drop.
y_test = test_df.drop(columns=['id', 'Stage_fear']).copy()

In [11]:
# Columns that CatBoost should treat as categorical features.
cat_features = [
    'Drained_after_socializing',
    'Outside_mult_Friends',
    'Time_Alone_dev_Outside',
    'Drained_after_socializing_MISS',
]

# Hyper-parameters found during the experiments; training progress is logged
# every 100 iterations and the seed is fixed for reproducibility.
catboost_params = dict(
    cat_features=cat_features,
    iterations=250,
    learning_rate=0.04,
    depth=6,
    verbose=100,
    random_seed=42,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the full training set.
model.fit(X, y)

# Predict labels for the test features (stored under the name `y_test`).
preds = model.predict(y_test)

# Assemble the submission table: one predicted label per test id.
submission = pd.DataFrame(
    {
        'id': test_df['id'],
        'Personality': preds,
    }
)

# Write the submission file and show a short preview.
submission.to_csv("submission.csv", index=False)

print('OK!\n', submission.head(4))

0:	learn: 0.6293924	total: 81.5ms	remaining: 20.3s
100:	learn: 0.1250639	total: 1.62s	remaining: 2.39s
200:	learn: 0.1214883	total: 3.1s	remaining: 757ms
249:	learn: 0.1200442	total: 3.87s	remaining: 0us
OK!
       id Personality
0  18524   Extrovert
1  18525   Introvert
2  18526   Extrovert
3  18527   Extrovert
