In [140]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [141]:
df = pd.read_csv("personality_dataset.csv")

In [142]:
df

Unnamed: 0,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,4.0,No,4.0,6.0,No,13.0,5.0,Extrovert
1,9.0,Yes,0.0,0.0,Yes,0.0,3.0,Introvert
2,9.0,Yes,1.0,2.0,Yes,5.0,2.0,Introvert
3,0.0,No,6.0,7.0,No,14.0,8.0,Extrovert
4,3.0,No,9.0,4.0,No,8.0,5.0,Extrovert
...,...,...,...,...,...,...,...,...
2895,3.0,No,7.0,6.0,No,6.0,6.0,Extrovert
2896,3.0,No,8.0,3.0,No,14.0,9.0,Extrovert
2897,4.0,Yes,1.0,1.0,Yes,4.0,0.0,Introvert
2898,11.0,Yes,1.0,,Yes,2.0,0.0,Introvert


In [143]:
df.shape

(2900, 8)

In [144]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2900 entries, 0 to 2899
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_spent_Alone           2837 non-null   float64
 1   Stage_fear                 2827 non-null   object 
 2   Social_event_attendance    2838 non-null   float64
 3   Going_outside              2834 non-null   float64
 4   Drained_after_socializing  2848 non-null   object 
 5   Friends_circle_size        2823 non-null   float64
 6   Post_frequency             2835 non-null   float64
 7   Personality                2900 non-null   object 
dtypes: float64(5), object(3)
memory usage: 181.4+ KB


In [145]:
df.describe()

Unnamed: 0,Time_spent_Alone,Social_event_attendance,Going_outside,Friends_circle_size,Post_frequency
count,2837.0,2838.0,2834.0,2823.0,2835.0
mean,4.505816,3.963354,3.0,6.268863,3.564727
std,3.479192,2.903827,2.247327,4.289693,2.926582
min,0.0,0.0,0.0,0.0,0.0
25%,2.0,2.0,1.0,3.0,1.0
50%,4.0,3.0,3.0,5.0,3.0
75%,8.0,6.0,5.0,10.0,6.0
max,11.0,10.0,7.0,15.0,10.0


In [146]:
X = df.drop("Personality", axis=1)
y = df["Personality"]

In [147]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [148]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoding", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
]) 

In [149]:
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scalar", StandardScaler())
])

In [150]:
from sklearn.compose import ColumnTransformer
cat_columns = ["Stage_fear", "Drained_after_socializing"]
num_columns = [col for col in X_train.columns if col not in cat_columns]

final_pipeline = ColumnTransformer([
    ("cat", cat_pipeline, cat_columns),
    ("num", num_pipeline, num_columns)
])

In [151]:
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)



In [152]:
from sklearn.preprocessing import LabelEncoder
X_train = final_pipeline.fit_transform(X_train)
le = LabelEncoder()
y_train = le.fit_transform(y_train)


  y = column_or_1d(y, warn=True)


In [153]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score


In [154]:
models = {
    "Logistic Regression": LogisticRegression(C=1.0, solver='liblinear', max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5, criterion='entropy'),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
    "SVM": SVC(C=1.0, kernel='rbf', gamma='scale', probability=True),
    "KNN": KNeighborsClassifier(n_neighbors=5, weights='distance', algorithm='auto'),
    "Naive Bayes": GaussianNB()
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: Mean Accuracy = {scores.mean():.4f}")

Logistic Regression: Mean Accuracy = 0.9250
Decision Tree: Mean Accuracy = 0.9228
Random Forest: Mean Accuracy = 0.9341
SVM: Mean Accuracy = 0.9358
KNN: Mean Accuracy = 0.9129
Naive Bayes: Mean Accuracy = 0.9353


In [155]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'criterion': ['gini', 'entropy']
}

grid_search_random_forest = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search_random_forest.fit(X_train, y_train)

print("Best Parameters:", grid_search_random_forest.best_params_)
print("Best CV Score:", grid_search_random_forest.best_score_)

best_rf = grid_search_random_forest.best_estimator_


Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 200}
Best CV Score: 0.9349137931034482


In [156]:
best_rf

In [157]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
}

grid_search_naive = GridSearchCV(GaussianNB(), param_grid, cv=5, scoring='accuracy')
grid_search_naive.fit(X_train, y_train)

print("Best Parameters:", grid_search_naive.best_params_)
print("Best CV Score:", grid_search_naive.best_score_)

best_rf = grid_search_naive.best_estimator_

Best Parameters: {'var_smoothing': 1e-09}
Best CV Score: 0.935344827586207


In [158]:
best_rf

In [159]:
from sklearn.svm import SVC

param_grid = {
    'C': [0.1, 1, 10, 100],              
    'kernel': ['linear', 'rbf', 'poly'], 
    'gamma': ['scale', 'auto'],          
    'degree': [2, 3, 4]                  
}

grid_search_SVC = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search_SVC.fit(X_train, y_train)

print("Best Parameters:", grid_search_SVC.best_params_)
print("Best CV Score:", grid_search_SVC.best_score_)

best_rf = grid_search_SVC.best_estimator_

Best Parameters: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
Best CV Score: 0.9357758620689655


In [160]:
best_rf

In [161]:
best_model = grid_search_SVC.best_estimator_

X_test = final_pipeline.transform(X_test)
y_test = le.transform(y_test)
y_pred = best_model.predict(X_test)

  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [162]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9293103448275862
Precision: 0.9157894736842105
Recall: 0.9388489208633094
Confusion Matrix:
 [[278  24]
 [ 17 261]]


## 📌 Subsampling the Training Data for Fast Hyperparameter Tuning
To speed up the hyperparameter tuning process (e.g., for GridSearchCV), we extract a small, representative subset (20%) from the full training set. This subset maintains the original class distribution by using stratify=y_train, ensuring the tuning process is still valid and fair.

X_sub, y_sub: 20% of training data used for model tuning

_: Remaining 80% is not used in this step (ignored)

random_state=42: Ensures the same split every time for reproducibility

This technique helps reduce computation time significantly, especially when working with large datasets.



In [163]:
from sklearn.model_selection import train_test_split

# Split 20% of training data for tuning
X_sub, _, y_sub, _ = train_test_split(X_train, y_train, train_size=0.2, stratify=y_train, random_state=42)
