In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv("../data/train.csv").drop(["id"], axis=1)
sub_df = pd.read_csv("../data/test.csv")
print("Raw DataFrame:")
display(df.head(6))

imputer = SimpleImputer(strategy="most_frequent")
le = LabelEncoder()

for feature in df.columns:
	if feature == "Stage_fear" or feature == "Drained_after_socializing":
		for i in range(len(df)):
			if pd.isna(df.loc[i, feature]):
				df.loc[i, feature] = "Unknown"
		df[feature] = le.fit_transform(df[feature])
	else:
		df[feature] = imputer.fit_transform(df[[feature]]).flatten()
		df[feature] = le.fit_transform(df[feature])

print("Cleaned df:")
display(df.head(6))

Raw DataFrame:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,2.0,No,8.0,5.0,No,,3.0,Extrovert


Cleaned df:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0,6,4,0,15,5,0
1,1,0,7,3,0,10,8,0
2,6,2,1,0,1,3,0,1
3,3,0,7,3,0,11,5,0
4,1,0,4,4,0,13,3,0
5,2,0,8,5,0,5,3,0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["Personality"], axis=1),
    df["Personality"],
    test_size=0.1,
    random_state=42
)

print("X_train:")
display(X_train)
print("X_test:")
display(X_test)
print("y_train:")
display(y_train)
print("y_test:")
display(y_test)

X_train:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
11881,3,0,4,5,0,10,7
5199,2,0,7,3,0,5,8
17115,2,0,7,5,0,12,4
14120,3,1,4,4,0,10,4
6202,0,0,3,3,0,6,5
...,...,...,...,...,...,...,...
11284,9,1,1,3,2,5,3
11964,3,0,6,6,0,5,3
5390,3,1,7,3,0,14,8
860,3,0,4,5,0,9,9


X_test:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
13209,0,0,9,7,0,8,5
2740,0,0,9,5,0,6,4
10249,3,0,8,7,0,10,10
7608,1,0,7,5,0,12,4
10528,0,0,6,4,0,6,10
...,...,...,...,...,...,...,...
5658,1,1,9,5,0,10,8
8883,0,0,5,7,0,5,4
9093,8,2,3,2,2,1,3
9001,5,2,3,2,2,0,2


y_train:


11881    0
5199     0
17115    0
14120    0
6202     0
        ..
11284    1
11964    0
5390     0
860      0
15795    0
Name: Personality, Length: 16671, dtype: int64

y_test:


13209    0
2740     0
10249    0
7608     0
10528    0
        ..
5658     0
8883     0
9093     1
9001     1
16550    0
Name: Personality, Length: 1853, dtype: int64

In [4]:
baysian_model = GaussianNB()
baysian_model.fit(X_train, y_train)
pred = baysian_model.predict(X_test)
baysian_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1356
           1       0.96      0.93      0.94       497

    accuracy                           0.97      1853
   macro avg       0.97      0.96      0.96      1853
weighted avg       0.97      0.97      0.97      1853



In [5]:
SGD_model = SGDClassifier(
    learning_rate="adaptive",
    eta0=0.01,
    early_stopping=True,
    n_iter_no_change=60,
    verbose=False
)
SGD_model.fit(X_train, y_train)
pred = SGD_model.predict(X_test)
SGD_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1356
           1       0.96      0.93      0.94       497

    accuracy                           0.97      1853
   macro avg       0.97      0.96      0.96      1853
weighted avg       0.97      0.97      0.97      1853



In [6]:
MLP_model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    learning_rate="adaptive",
    early_stopping=True,
    n_iter_no_change=20,
    verbose=False
)
MLP_model.fit(X_train, y_train)
pred = MLP_model.predict(X_test)
MLP_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1356
           1       0.96      0.93      0.94       497

    accuracy                           0.97      1853
   macro avg       0.97      0.96      0.96      1853
weighted avg       0.97      0.97      0.97      1853



In [7]:
print("Baysian F1:", baysian_f1)
print("SGD F1:", SGD_f1)
print("MLP F1:", MLP_f1)

if max(baysian_f1, SGD_f1, MLP_f1) == baysian_f1:
    model = baysian_model
    print("Baysian Classifier better")
if max(baysian_f1, SGD_f1, MLP_f1) == SGD_f1:
    model = SGD_model
    print("SGD Classifier better")
if max(baysian_f1, SGD_f1, MLP_f1) == MLP_f1:
    model = MLP_model
    print("MLP Classifier better")

Baysian F1: 0.9696404751431222
SGD F1: 0.9696404751431222
MLP F1: 0.970192535673754
MLP Classifier better


In [8]:
sub_df = pd.read_csv("../data/test.csv")
imputer = SimpleImputer(strategy="most_frequent")
le = LabelEncoder()
for feature in sub_df.columns:
    sub_df[feature] = imputer.fit_transform(sub_df[[feature]]).flatten()
    if feature == "Stage_fear" or feature == "Drained_after_socializing":
        sub_df[feature] = le.fit_transform(sub_df[feature])
display(sub_df)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,18524,3.0,0,7.0,4.0,0,6.0,3.0
1,18525,0.0,1,0.0,0.0,1,5.0,1.0
2,18526,3.0,0,5.0,6.0,0,15.0,9.0
3,18527,3.0,0,4.0,4.0,0,5.0,6.0
4,18528,9.0,1,1.0,2.0,1,1.0,1.0
...,...,...,...,...,...,...,...,...
6170,24694,3.0,0,5.0,5.0,0,9.0,6.0
6171,24695,8.0,1,2.0,1.0,1,0.0,0.0
6172,24696,2.0,0,4.0,3.0,0,9.0,7.0
6173,24697,3.0,0,4.0,4.0,0,11.0,9.0


In [9]:
sub_pred = model.predict(sub_df.drop(["id"], axis=1))
sub_pred = ["Extrovert" if e == 0 else "Introvert" for e in sub_pred]
submission = pd.DataFrame({"id": sub_df["id"], "Personality": sub_pred})
display(submission)
submission.to_csv("../data/submission.csv", index=False)

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert
