In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [16]:
df = pd.read_csv('personality_synthetic_dataset.csv')
df.head()

Unnamed: 0,personality_type,social_energy,alone_time_preference,talkativeness,deep_reflection,group_comfort,party_liking,listening_skill,empathy,creativity,...,spontaneity,adventurousness,reading_habit,sports_interest,online_social_usage,travel_desire,gadget_usage,work_style_collaborative,decision_speed,stress_handling
0,Extrovert,6.794295,3.85467,8.725446,2.515151,7.097368,8.588762,6.774799,6.430132,6.142968,...,4.853313,8.257134,5.270555,10.0,9.154296,4.816422,9.191711,8.31359,8.032376,7.176905
1,Ambivert,6.378988,5.731157,7.029529,7.274493,4.111199,3.258248,5.550909,3.958179,6.149457,...,6.067201,6.289347,5.753165,5.334303,4.683781,4.725666,5.956141,5.890619,3.158988,3.423577
2,Ambivert,7.459421,6.322263,3.922269,4.622261,5.343276,7.452152,9.48399,6.127654,7.032017,...,5.524244,9.238784,5.250405,3.15354,5.000338,6.139166,6.033048,5.8075,4.571003,5.64748
3,Extrovert,6.159626,3.097837,6.019093,1.96544,7.83714,10.0,9.436733,8.949684,8.923875,...,4.327018,8.489791,5.312617,8.379936,7.601946,6.370056,5.410145,6.671781,6.600233,5.870088
4,Introvert,5.568462,6.986722,3.91324,9.926161,1.650483,0.362298,7.470387,6.756837,9.507803,...,5.187689,3.167217,7.060235,2.333388,7.771569,5.534336,5.704598,5.832968,5.813099,3.758084


In [17]:
df.isnull().sum()

personality_type            0
social_energy               0
alone_time_preference       0
talkativeness               0
deep_reflection             0
group_comfort               0
party_liking                0
listening_skill             0
empathy                     0
creativity                  0
organization                0
leadership                  0
risk_taking                 0
public_speaking_comfort     0
curiosity                   0
routine_preference          0
excitement_seeking          0
friendliness                0
emotional_stability         0
planning                    0
spontaneity                 0
adventurousness             0
reading_habit               0
sports_interest             0
online_social_usage         0
travel_desire               0
gadget_usage                0
work_style_collaborative    0
decision_speed              0
stress_handling             0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   personality_type          20000 non-null  object 
 1   social_energy             20000 non-null  float64
 2   alone_time_preference     20000 non-null  float64
 3   talkativeness             20000 non-null  float64
 4   deep_reflection           20000 non-null  float64
 5   group_comfort             20000 non-null  float64
 6   party_liking              20000 non-null  float64
 7   listening_skill           20000 non-null  float64
 8   empathy                   20000 non-null  float64
 9   creativity                20000 non-null  float64
 10  organization              20000 non-null  float64
 11  leadership                20000 non-null  float64
 12  risk_taking               20000 non-null  float64
 13  public_speaking_comfort   20000 non-null  float64
 14  curios

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,StandardScaler
X = df.drop(columns=['personality_type'],axis=1)
y=df['personality_type']
scaler = StandardScaler()
le = LabelEncoder()
X_scaled = scaler.fit_transform(X)
y_en = le.fit_transform(y)

X_train,X_test,y_train,y_test = train_test_split(X_scaled,y_en,test_size=0.2,random_state=42)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,accuracy_score,confusion_matrix,classification_report

In [21]:
models = {
    "Logistic Regression": LogisticRegression(multi_class='ovr'),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(probability=True),
}

In [22]:
results = {}
for name, model in models.items():
    try:
        m = model.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = np.mean(np.abs(y_test - y_pred))
        mse = mean_squared_error(y_test, y_pred)
        acc = accuracy_score(y_test, y_pred)
        results[name] = {
            'R2 Score': r2,
            'RMSE': rmse,
            'MAE': mae,
            'MSE': mse,
            'Accuracy': acc
        }
        print(f"{name} trained")
    except Exception as e:
        print(f"Error with {name}: {e}")

Logistic Regression trained
Decision Tree trained
Random Forest trained
K-Nearest Neighbors trained
Naive Bayes trained
SVM trained


In [23]:
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by='R2 Score', ascending=False)
print(results_df)

                     R2 Score      RMSE      MAE      MSE  Accuracy
SVM                  0.993005  0.067082  0.00300  0.00450   0.99775
Logistic Regression  0.991451  0.074162  0.00350  0.00550   0.99750
Naive Bayes          0.990674  0.077460  0.00400  0.00600   0.99700
K-Nearest Neighbors  0.988731  0.085147  0.00425  0.00725   0.99725
Random Forest        0.978627  0.117260  0.00825  0.01375   0.99450
Decision Tree        0.759066  0.393700  0.09350  0.15500   0.93725


In [None]:
best_model_name = results_df['R2 Score'].idxmax()
best_model_name = results_df['Accuracy'].idxmax()
best_model = models[best_model_name]
print(f"The best model is: {best_model_name} with R2 Score: {results_df['R2 Score'].max()}")
print(f"The best model is: {best_model_name} with Accuracy Score: {results_df['R2 Score'].max()}")

best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

The best model is: SVM with R2 Score: 0.9930051559772497
The best model is: SVM with Accuracy Score: 0.9930051559772497
