In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import datetime
import csv

In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
train_ids = train_df['id']
train_df = train_df.loc[:, train_df.columns != 'id']
print(f"Train df shape: {train_df.shape}")
print(f"Train df columns: {train_df.columns}")

test_df = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
test_ids = test_df['id']
test_df = test_df.loc[:, test_df.columns != 'id']
print(f"Test df shape: {test_df.shape}")
print(f"Test df columns: {test_df.columns}")

Train df shape: (18524, 8)
Train df columns: Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency', 'Personality'],
      dtype='object')
Test df shape: (6175, 7)
Test df columns: Index(['Time_spent_Alone', 'Stage_fear', 'Social_event_attendance',
       'Going_outside', 'Drained_after_socializing', 'Friends_circle_size',
       'Post_frequency'],
      dtype='object')


In [3]:
column_name_map = {
    'Time_spent_Alone' : 'tsa',
    'Stage_fear' : 'sf',
    'Social_event_attendance': 'sea',
    'Going_outside' : 'go',
    'Drained_after_socializing' : 'das',
    'Friends_circle_size' : 'fcs',
    'Post_frequency' : 'pf',
    'Personality' : 'label'
}

train_df = train_df.rename(columns= column_name_map)
test_df = test_df.rename(columns= column_name_map)

In [4]:
train_df.head(5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,tsa,sf,sea,go,das,fcs,pf,label
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [5]:
trainset_rows_with_null = train_df.isnull().any(axis= 1).sum()
print(f"Train df rows w/ atleast one null val.: {trainset_rows_with_null}({trainset_rows_with_null / train_df.shape[0] * 100:0.2f}%)")

# testset_rows_with_null = test_df.isnull().any(axis= 1).sum()
# print(f"Test df rows w/ atleast one null val.: {testset_rows_with_null}({testset_rows_with_null / test_df.shape[0] * 100:0.2f}%)")

Train df rows w/ atleast one null val.: 8335(45.00%)


In [6]:
print(train_df.dtypes)
print("\n~~~~~~~~~~~~~~~~\n")
print(train_df.isna().sum())

tsa      float64
sf        object
sea      float64
go       float64
das       object
fcs      float64
pf       float64
label     object
dtype: object

~~~~~~~~~~~~~~~~

tsa      1190
sf       1893
sea      1180
go       1466
das      1149
fcs      1054
pf       1264
label       0
dtype: int64


In [7]:
categorical_cols = ['sf', 'das']
pd.Series({col: train_df[col].unique() for col in categorical_cols})

sf     [No, Yes, nan]
das    [No, nan, Yes]
dtype: object

In [8]:
for col in categorical_cols:
    train_df[col] = train_df[col].fillna('unk')
    test_df[col] = test_df[col].fillna('unk')

In [9]:
pd.Series({col: train_df[col].unique() for col in categorical_cols})

sf     [No, Yes, unk]
das    [No, unk, Yes]
dtype: object

In [10]:
encoder = OneHotEncoder(handle_unknown= 'ignore', sparse= False)
encoded_train_cols = encoder.fit_transform(train_df[categorical_cols])
encoded_train_cols = pd.DataFrame(encoded_train_cols, index= train_df.index)
train_df = train_df.drop(columns= categorical_cols)
train_df = pd.concat([encoded_train_cols, train_df], axis= 1)

train_df.shape



(18524, 12)

In [11]:
encoded_test_cols = encoder.fit_transform(test_df[categorical_cols])
encoded_test_cols = pd.DataFrame(encoded_test_cols, index= test_df.index)
test_df = test_df.drop(columns= categorical_cols)
test_df = pd.concat([encoded_test_cols, test_df], axis= 1)

test_df.shape



(6175, 11)

In [12]:
mapping = {
    'Introvert': 0,
    'Extrovert': 1
}

train_df['label'] = train_df['label'].replace(mapping)
train_df['label'].head(5)

  train_df['label'] = train_df['label'].replace(mapping)


0    1
1    1
2    0
3    1
4    1
Name: label, dtype: int64

In [13]:
y = train_df['label']
x = train_df.drop('label', axis= 1)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.35)

print(f"x_train size: {x_train.shape}; x_test.shape: {x_test.shape}")
print(f"y_train size: {y_train.shape}; y_test.shape: {y_test.shape}")

x_train size: (12040, 11); x_test.shape: (6484, 11)
y_train size: (12040,); y_test.shape: (6484,)


### Method 1: Do nothing and use XGBoost

In [15]:
params = {
    'nEstimators' : 1000,
    'maxDepth' : 7,
    'lr': 1e-3,
}

xgbc = XGBClassifier(n_estimators= params['nEstimators'],
                     max_depth= params['maxDepth'],
                     learning_rate= params['lr'])

In [16]:
xgbc.fit(x_train, y_train)
preds = xgbc.predict(x_test)

acc = accuracy_score(y_test, preds)

In [17]:
print(f"Accuracy: {acc * 100:0.3f}%")
print(f"Accuracy report\n~~~~~~~~~~~~~~~~~~~~~~~\n")
print(classification_report(y_test, preds))

Accuracy: 96.823%
Accuracy report
~~~~~~~~~~~~~~~~~~~~~~~

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1714
           1       0.98      0.98      0.98      4770

    accuracy                           0.97      6484
   macro avg       0.96      0.96      0.96      6484
weighted avg       0.97      0.97      0.97      6484



In [18]:
now = datetime.datetime.now()
out_file = "/kaggle/working/submission-method-1-" + str(now).split(' ')[0] + ".csv"

submission_preds = xgbc.predict(test_df)
preds_with_ids = []

for i in range(test_df.shape[0]):
    label = 'Introvert' if submission_preds[i] == 0 else 'Extrovert'
    pred = {
        'id': test_ids[i],
        'Personality': label
    }
    preds_with_ids.append(pred)

In [19]:
with open(out_file, 'w') as f:
    writer = csv.DictWriter(f, fieldnames= preds_with_ids[0].keys())
    writer.writeheader()
    writer.writerows(preds_with_ids)

### Method 2: Impute missing numeric values