In [122]:
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the raw dataset from the CSV file in the working directory.
kd = pd.read_csv('data_missing.csv')
# Show, per column, whether it contains at least one missing value.
# (A bare `kd.columns` expression used to sit here; only the last expression
# of a cell is displayed, so it produced nothing and has been dropped.)
kd.isnull().any()

Age                   True
Gender                True
Education             True
Introversion Score    True
Sensing Score         True
Thinking Score        True
Judging Score         True
Interest              True
Personality           True
dtype: bool

In [123]:
# Keep only the rows that have a value in every column, then display the result.
# NOTE(review): this rebinds `kd`, so the cell is not idempotent with respect to
# the frame loaded earlier; re-run the loading cell to get the raw data back.
kd = kd.dropna(axis=0, how='any')
kd

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
4,31.0,Female,0.0,3.59804,6.189259,5.31347,3.677984,Others,ISFP
5,33.0,Female,0.0,1.06869,7.143507,3.84411,6.347241,Sports,ISFJ
7,27.0,Male,1.0,3.98957,4.406797,5.09055,5.556500,Technology,INFP
10,32.0,Male,0.0,3.98624,6.287163,1.83208,5.447141,Arts,ISFP
16,32.0,Female,0.0,5.61706,6.190431,0.77300,7.598795,Others,ESFJ
...,...,...,...,...,...,...,...,...,...
43722,24.0,Female,0.0,4.30484,6.673149,4.31227,6.573628,Technology,ESFJ
43725,28.0,Male,0.0,7.21415,2.405862,5.60943,8.164824,Arts,ENTJ
43730,21.0,Female,0.0,1.80978,5.889667,9.35024,6.066070,Others,INTJ
43739,26.0,Male,1.0,8.88656,5.118399,8.48784,5.331942,Arts,ENTP


In [124]:
def map_data(x):
    """Replace each distinct value of a Series with a positive integer code.

    Codes start at 1 and are handed out in the order returned by
    ``x.unique()``, so the same label can receive a different code when the
    data arrives in a different order.

    Parameters
    ----------
    x : pandas.Series
        Column whose values should be encoded.

    Returns
    -------
    pandas.Series
        Series with the same index as ``x`` holding the integer codes.
    """
    codes = {label: code for code, label in enumerate(x.unique(), start=1)}
    return x.map(codes)
# Integer-encode every column that still has the object dtype;
# columns of any other dtype are left untouched by the loop.
for column in kd.columns:
    if kd[column].dtype != 'object':
        continue
    kd[column] = map_data(kd[column])

# Cast the whole frame to int.
# NOTE(review): this also truncates the fractional part of the float columns
# (e.g. the score columns) - confirm that losing that precision is intended.
kd = kd.astype(int)
kd

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
4,31,1,0,3,6,5,3,1,1
5,33,1,0,1,7,3,6,2,2
7,27,2,1,3,4,5,5,3,3
10,32,2,0,3,6,1,5,4,1
16,32,1,0,5,6,0,7,1,4
...,...,...,...,...,...,...,...,...,...
43722,24,1,0,4,6,4,6,3,4
43725,28,2,0,7,2,5,8,4,13
43730,21,1,0,1,5,9,6,1,5
43739,26,2,1,8,5,8,5,4,10


In [125]:
# Separate the target column from the feature matrix.
rfcy = kd['Personality']
rfcX = kd.drop('Personality', axis=1)

# Min-max scale every feature column and rebuild a DataFrame with the
# original column names (the result gets a fresh positional index).
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split done in a later cell - confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(rfcX)
rfcX_scaled = pd.DataFrame(scaler.transform(rfcX), columns=rfcX.columns)
rfcX_scaled

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest
0,0.382353,0.0,0.0,0.333333,0.666667,0.555556,0.333333,0.00
1,0.441176,0.0,0.0,0.111111,0.777778,0.333333,0.666667,0.25
2,0.264706,1.0,1.0,0.333333,0.444444,0.555556,0.555556,0.50
3,0.411765,1.0,0.0,0.333333,0.666667,0.111111,0.555556,0.75
4,0.411765,0.0,0.0,0.555556,0.666667,0.000000,0.777778,0.00
...,...,...,...,...,...,...,...,...
7266,0.176471,0.0,0.0,0.444444,0.666667,0.444444,0.666667,0.50
7267,0.294118,1.0,0.0,0.777778,0.222222,0.555556,0.888889,0.75
7268,0.088235,0.0,0.0,0.111111,0.555556,1.000000,0.666667,0.00
7269,0.235294,1.0,1.0,0.888889,0.555556,0.888889,0.555556,0.75


In [126]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
rfcX_train, rfcX_test, rfcy_train, rfcy_test = train_test_split(rfcX_scaled, rfcy, test_size=0.2, random_state=42)

# Fit a 150-tree random forest with balanced class weights.
rfc_model = RandomForestClassifier(n_estimators=150, random_state=42, class_weight="balanced")
rfc_model.fit(rfcX_train, rfcy_train)

# Score the model on the held-out rows.
rfcy_pred = rfc_model.predict(rfcX_test)

rfc_acc = accuracy_score(rfcy_test, rfcy_pred)

print(f"Random Forest Classifier Accuracy: {rfc_acc:.2f}\n")

# Save the fitted model and the scaler used for its features.
# `with` closes each file deterministically so the pickle is fully flushed to
# disk; the previous bare open(...) calls left the handles unclosed.
with open('rfc_model.pkl', 'wb') as model_file:
    pickle.dump(rfc_model, model_file)
with open('rfc_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Random Forest Classifier Accuracy: 0.83



In [127]:
# Separate the target column from the feature matrix for the SVM experiment.
svmy = kd['Personality']
svmX = kd.drop('Personality', axis=1)

# Min-max scale every feature column and rebuild a DataFrame with the
# original column names. This rebinds `scaler` to a newly fitted instance.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/test split done in a later cell - confirm this is acceptable.
scaler = MinMaxScaler()
scaler.fit(svmX)
svmX_scaled = pd.DataFrame(scaler.transform(svmX), columns=svmX.columns)
svmX_scaled

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest
0,0.382353,0.0,0.0,0.333333,0.666667,0.555556,0.333333,0.00
1,0.441176,0.0,0.0,0.111111,0.777778,0.333333,0.666667,0.25
2,0.264706,1.0,1.0,0.333333,0.444444,0.555556,0.555556,0.50
3,0.411765,1.0,0.0,0.333333,0.666667,0.111111,0.555556,0.75
4,0.411765,0.0,0.0,0.555556,0.666667,0.000000,0.777778,0.00
...,...,...,...,...,...,...,...,...
7266,0.176471,0.0,0.0,0.444444,0.666667,0.444444,0.666667,0.50
7267,0.294118,1.0,0.0,0.777778,0.222222,0.555556,0.888889,0.75
7268,0.088235,0.0,0.0,0.111111,0.555556,1.000000,0.666667,0.00
7269,0.235294,1.0,1.0,0.888889,0.555556,0.888889,0.555556,0.75


In [128]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
svmX_train, svmX_test, svmy_train, svmy_test = train_test_split(svmX_scaled, svmy, test_size=0.2, random_state=42)

# Fit a polynomial-kernel SVM with balanced class weights.
svm_model = SVC(kernel="poly", random_state=42, class_weight="balanced")
svm_model.fit(svmX_train, svmy_train)

# Score the model on the held-out rows.
svmy_pred = svm_model.predict(svmX_test)

svm_acc = accuracy_score(svmy_test, svmy_pred)

print(f"SVM Accuracy: {svm_acc:.2f}\n")

# Save the fitted SVM and the scaler used for its features.
# Fix: this previously pickled `rfc_model` into 'svm_model.pkl', so the SVM was
# never saved and the file silently contained the random forest instead.
# `with` closes each file deterministically so the pickle is fully flushed.
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(svm_model, model_file)
with open('svm_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

SVM Accuracy: 0.80

