In [31]:
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the raw dataset and report, per column, whether it holds any missing value.
# The result is indexed by column name, so it also serves as a listing of the
# columns; a separate bare `kd.columns` statement in the middle of the cell would
# display nothing, because a notebook only renders a cell's last expression.
kd = pd.read_csv('data_missing.csv')
kd.isnull().any()

Age                   True
Gender                True
Education             True
Introversion Score    True
Sensing Score         True
Thinking Score        True
Judging Score         True
Interest              True
Personality           True
dtype: bool

In [32]:
# Discard every row that has at least one missing value, then list the
# distinct personality labels that remain in the cleaned frame.
kd = kd.dropna(axis=0, how='any')
kd.loc[:, "Personality"].unique()

array(['ISFP', 'ISFJ', 'INFP', 'ESFJ', 'INTJ', 'INFJ', 'ESTP', 'ISTJ',
       'ISTP', 'ENTP', 'ESFP', 'INTP', 'ENTJ', 'ESTJ', 'ENFJ', 'ENFP'],
      dtype=object)

In [33]:
# Integer codes for each categorical column. Built once at definition time
# rather than on every call, since map_data is invoked once per value of each
# categorical column.
_CATEGORY_CODES = {
    "Gender": {"Male": 0, "Female": 1},
    "Interest": {'Arts': 0, 'Technology': 1, 'Sports': 2, 'Others': 3, 'Unknown': 4},
    "Personality": {'ISFP': 0, 'ISFJ': 1, 'INFP': 2, 'ESFJ': 3, 'INTJ': 4, 'INFJ': 5, 'ESTP': 6, 'ISTJ': 7,
                    'ISTP': 8, 'ENTP': 9, 'ESFP': 10, 'INTP': 11, 'ENTJ': 12, 'ESTJ': 13, 'ENFJ': 14, 'ENFP': 15},
}


def map_data(column, value):
    """Return the integer code assigned to ``value`` within ``column``.

    Args:
        column: Name of the categorical column; one of "Gender", "Interest"
            or "Personality".
        value: The raw category label to encode.

    Returns:
        The integer code for ``value``, or None when ``value`` has no code
        in that column's mapping.

    Raises:
        KeyError: If ``column`` is not one of the known categorical columns.
    """
    return _CATEGORY_CODES[column].get(value)
# Replace the labels of every object-typed column with their integer codes,
# printing the name of each column that gets encoded.
for column in kd.columns:
    if kd[column].dtype != 'object':
        # Non-object columns are left as they are.
        continue
    print(column)
    kd[column] = kd[column].apply(lambda label: map_data(column, label))

# Cast the whole frame to int now that every column holds numbers.
kd = kd.astype(int)
kd

Gender
Interest
Personality


Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest,Personality
4,31,1,0,3,6,5,3,3,0
5,33,1,0,1,7,3,6,2,1
7,27,0,1,3,4,5,5,1,2
10,32,0,0,3,6,1,5,0,0
16,32,1,0,5,6,0,7,3,3
...,...,...,...,...,...,...,...,...,...
43722,24,1,0,4,6,4,6,1,3
43725,28,0,0,7,2,5,8,0,12
43730,21,1,0,1,5,9,6,3,4
43739,26,0,1,8,5,8,5,0,9


In [34]:
# Target is the encoded personality type; every other column is a feature.
rfcy = kd.loc[:, 'Personality']
rfcX = kd.drop('Personality', axis=1)

# Rescale each feature with MinMaxScaler and keep the original column names.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/test split performed in the training cell — confirm this is intended.
scaler = MinMaxScaler().fit(rfcX)
rfcX_scaled = pd.DataFrame(scaler.transform(rfcX), columns=rfcX.columns)
rfcX_scaled

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest
0,0.382353,1.0,0.0,0.333333,0.666667,0.555556,0.333333,0.75
1,0.441176,1.0,0.0,0.111111,0.777778,0.333333,0.666667,0.50
2,0.264706,0.0,1.0,0.333333,0.444444,0.555556,0.555556,0.25
3,0.411765,0.0,0.0,0.333333,0.666667,0.111111,0.555556,0.00
4,0.411765,1.0,0.0,0.555556,0.666667,0.000000,0.777778,0.75
...,...,...,...,...,...,...,...,...
7266,0.176471,1.0,0.0,0.444444,0.666667,0.444444,0.666667,0.25
7267,0.294118,0.0,0.0,0.777778,0.222222,0.555556,0.888889,0.00
7268,0.088235,1.0,0.0,0.111111,0.555556,1.000000,0.666667,0.75
7269,0.235294,0.0,1.0,0.888889,0.555556,0.888889,0.555556,0.00


In [35]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
rfcX_train, rfcX_test, rfcy_train, rfcy_test = train_test_split(rfcX_scaled, rfcy, test_size=0.2, random_state=42)

# 150-tree random forest; class_weight="balanced" asks scikit-learn to weight
# classes inversely to their frequency in the training labels.
rfc_model = RandomForestClassifier(n_estimators=150, random_state=42, class_weight="balanced")
rfc_model.fit(rfcX_train, rfcy_train)

# Score the model on the held-out rows.
rfcy_pred = rfc_model.predict(rfcX_test)

rfc_acc = accuracy_score(rfcy_test, rfcy_pred)

print(f"Random Forest Classifier Accuracy: {rfc_acc:.2f}\n")

# Save the fitted model and the scaler currently bound to `scaler`.
# The `with` blocks guarantee each file is flushed and closed, instead of
# leaving the handle open until garbage collection.
with open('rfc_model.pkl', 'wb') as model_file:
    pickle.dump(rfc_model, model_file)
with open('rfc_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Random Forest Classifier Accuracy: 0.83



In [36]:
# Target is the encoded personality type; every other column is a feature.
svmy = kd.loc[:, 'Personality']
svmX = kd.drop('Personality', axis=1)

# Rescale each feature with MinMaxScaler and keep the original column names.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/test split performed in the training cell — confirm this is intended.
scaler = MinMaxScaler().fit(svmX)
svmX_scaled = pd.DataFrame(scaler.transform(svmX), columns=svmX.columns)
svmX_scaled

Unnamed: 0,Age,Gender,Education,Introversion Score,Sensing Score,Thinking Score,Judging Score,Interest
0,0.382353,1.0,0.0,0.333333,0.666667,0.555556,0.333333,0.75
1,0.441176,1.0,0.0,0.111111,0.777778,0.333333,0.666667,0.50
2,0.264706,0.0,1.0,0.333333,0.444444,0.555556,0.555556,0.25
3,0.411765,0.0,0.0,0.333333,0.666667,0.111111,0.555556,0.00
4,0.411765,1.0,0.0,0.555556,0.666667,0.000000,0.777778,0.75
...,...,...,...,...,...,...,...,...
7266,0.176471,1.0,0.0,0.444444,0.666667,0.444444,0.666667,0.25
7267,0.294118,0.0,0.0,0.777778,0.222222,0.555556,0.888889,0.00
7268,0.088235,1.0,0.0,0.111111,0.555556,1.000000,0.666667,0.75
7269,0.235294,0.0,1.0,0.888889,0.555556,0.888889,0.555556,0.00


In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
svmX_train, svmX_test, svmy_train, svmy_test = train_test_split(svmX_scaled, svmy, test_size=0.2, random_state=42)

# Support-vector classifier with a polynomial kernel; class_weight="balanced"
# asks scikit-learn to weight classes inversely to their frequency.
svm_model = SVC(kernel="poly", random_state=42, class_weight="balanced")
svm_model.fit(svmX_train, svmy_train)

# Score the model on the held-out rows.
svmy_pred = svm_model.predict(svmX_test)

svm_acc = accuracy_score(svmy_test, svmy_pred)

print(f"SVM Accuracy: {svm_acc:.2f}\n")

# Save the fitted SVM and its scaler. The model written here must be
# `svm_model`: dumping `rfc_model` would store the random forest under the
# SVM file name. The `with` blocks guarantee each file is flushed and closed.
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(svm_model, model_file)
with open('svm_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

SVM Accuracy: 0.81

