In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [22]:
train_df = pd.read_csv("../data/train.csv").drop(["id"], axis=1)
sub_df = pd.read_csv("../data/test.csv").drop(["id"], axis=1)
print("Raw Train DataFrame:")
display(train_df.head(6))
print("Raw Sub DataFrame:")
display(sub_df.head(6))

imputer = SimpleImputer(strategy="mean")
le = LabelEncoder()

for df in [train_df, sub_df]:
	for feature in df.columns:
		if feature == "Stage_fear" or feature == "Drained_after_socializing":
			df[feature] = df[feature].map({"No": 0, "Yes": 1})
			df[feature] = imputer.fit_transform(df[[feature]]).flatten()
		else:
			df[feature] = le.fit_transform(df[feature])
		df[feature] = imputer.fit_transform(df[[feature]]).flatten()

print("Cleaned Train DataFrame:")
display(train_df.head(6))
print("Cleaned Test DataFrame:")
display(sub_df.head(6))

Raw Train DataFrame:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,2.0,No,8.0,5.0,No,,3.0,Extrovert


Raw Sub DataFrame:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,3.0,No,7.0,4.0,No,6.0,
1,,Yes,0.0,0.0,Yes,5.0,1.0
2,3.0,No,5.0,6.0,No,15.0,9.0
3,3.0,No,4.0,4.0,No,5.0,6.0
4,9.0,Yes,1.0,2.0,Yes,1.0,1.0
5,2.0,No,5.0,3.0,No,10.0,4.0


Cleaned Train DataFrame:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,0.0
1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,0.0
2,6.0,1.0,1.0,0.0,0.233784,3.0,0.0,1.0
3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,0.0
4,1.0,0.0,4.0,4.0,0.0,13.0,11.0,0.0
5,2.0,0.0,8.0,5.0,0.0,16.0,3.0,0.0


Cleaned Test DataFrame:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
0,3.0,0.0,7.0,4.0,0.0,6.0,11.0
1,12.0,1.0,0.0,0.0,1.0,5.0,1.0
2,3.0,0.0,5.0,6.0,0.0,15.0,9.0
3,3.0,0.0,4.0,4.0,0.0,5.0,6.0
4,9.0,1.0,1.0,2.0,1.0,1.0,1.0
5,2.0,0.0,5.0,3.0,0.0,10.0,4.0


In [23]:
df = train_df
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(["Personality"], axis=1),
    df["Personality"],
    test_size=0.20,
    random_state=42
)

print("X_train:")
display(X_train)
print("X_test:")
display(X_test)
print("y_train:")
display(y_train)
print("y_test:")
display(y_test)

X_train:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
1799,1.0,0.241838,7.0,4.0,0.0,10.0,5.0
11931,2.0,0.000000,4.0,6.0,0.0,6.0,8.0
14307,4.0,0.000000,5.0,5.0,0.0,7.0,6.0
12157,3.0,0.000000,6.0,8.0,0.0,8.0,8.0
18124,2.0,0.000000,7.0,7.0,0.0,15.0,4.0
...,...,...,...,...,...,...,...
11284,9.0,0.241838,1.0,3.0,1.0,5.0,3.0
11964,3.0,0.000000,6.0,6.0,0.0,16.0,3.0
5390,3.0,0.241838,7.0,3.0,0.0,14.0,8.0
860,3.0,0.000000,4.0,8.0,0.0,9.0,9.0


X_test:


Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency
13209,12.0,0.0,9.0,7.0,0.000000,8.0,5.0
2740,0.0,0.0,9.0,5.0,0.000000,6.0,4.0
10249,3.0,0.0,8.0,7.0,0.000000,10.0,10.0
7608,1.0,0.0,7.0,8.0,0.000000,12.0,4.0
10528,0.0,0.0,6.0,4.0,0.000000,6.0,10.0
...,...,...,...,...,...,...,...
3269,0.0,0.0,8.0,3.0,0.000000,7.0,7.0
15214,11.0,1.0,2.0,0.0,0.233784,1.0,1.0
1127,1.0,0.0,4.0,5.0,0.000000,12.0,4.0
11368,0.0,0.0,4.0,6.0,0.000000,7.0,10.0


y_train:


1799     0.0
11931    0.0
14307    0.0
12157    0.0
18124    0.0
        ... 
11284    1.0
11964    0.0
5390     0.0
860      0.0
15795    0.0
Name: Personality, Length: 14819, dtype: float64

y_test:


13209    0.0
2740     0.0
10249    0.0
7608     0.0
10528    0.0
        ... 
3269     0.0
15214    1.0
1127     0.0
11368    0.0
17737    0.0
Name: Personality, Length: 3705, dtype: float64

In [24]:
baysian_model = GaussianNB()
baysian_model.fit(X_train, y_train)
pred = baysian_model.predict(X_test)
baysian_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2753
         1.0       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [25]:
SGD_model = SGDClassifier(
    learning_rate="adaptive",
    eta0=0.01,
    early_stopping=True,
    n_iter_no_change=60,
    verbose=False
)
SGD_model.fit(X_train, y_train)
pred = SGD_model.predict(X_test)
SGD_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2753
         1.0       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [26]:
random_forest_model = RandomForestClassifier()
random_forest_model.fit(X_train, y_train)
pred = random_forest_model.predict(X_test)
random_forest_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.97      0.98      0.98      2753
         1.0       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.95      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [27]:
MLP_model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    learning_rate="adaptive",
    early_stopping=True,
    n_iter_no_change=20,
    verbose=False
)
MLP_model.fit(X_train, y_train)
pred = MLP_model.predict(X_test)
MLP_f1 = f1_score(y_test, pred, average='weighted')
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2753
         1.0       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705



In [28]:
print("Baysian F1:", baysian_f1)
print("SGD F1:", SGD_f1)
print("Random Forest F1:", random_forest_f1)
print("MLP F1:", MLP_f1)

if max(baysian_f1, SGD_f1, random_forest_f1, MLP_f1) == baysian_f1:
    model = baysian_model
    print("Baysian Classifier better")
if max(baysian_f1, SGD_f1, random_forest_f1, MLP_f1) == SGD_f1:
    model = SGD_model
    print("SGD Classifier better")
if max(baysian_f1, SGD_f1, random_forest_f1, MLP_f1) == random_forest_f1:
    model = random_forest_model
    print("Random Forest Classifier better")
if max(baysian_f1, SGD_f1, random_forest_f1, MLP_f1) == MLP_f1:
    model = MLP_model
    print("MLP Classifier better")

Baysian F1: 0.9683274057098351
SGD F1: 0.9683274057098351
Random Forest F1: 0.9674980330879502
MLP F1: 0.9686036517216703
MLP Classifier better


In [29]:
sub_pred = model.predict(sub_df)
sub_pred = pd.Series(sub_pred).map({0: "Extrovert", 1: "Introvert"})
sub_ids = pd.read_csv("../data/test.csv")["id"]
submission = pd.DataFrame({"id": sub_ids, "Personality": sub_pred})
display(submission)
submission.to_csv("../data/submission.csv", index=False)

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
...,...,...
6170,24694,Extrovert
6171,24695,Introvert
6172,24696,Extrovert
6173,24697,Extrovert
