In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
# Walk ./data recursively and print the path of every file found, so the
# available input files are visible before anything is loaded.
data_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('./data')
    for file_name in file_names
)
for file_path in data_file_paths:
    print(file_path)

./data\personality_dataset.csv
./data\sample_submission.csv
./data\test.csv
./data\train.csv


In [5]:
# Load the training frame, the test frame and the sample submission in one pass.
df, df_test, df_sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

In [6]:
# (rows, columns) of the training frame loaded from train.csv.
df.shape

(18524, 9)

In [7]:
# Count how many training rows fall into each target class.
personality_counts = df['Personality'].value_counts()
personality_counts

Personality
Extrovert    13699
Introvert     4825
Name: count, dtype: int64

In [8]:
# Preview the first five raw rows (head() defaults to n=5).
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [9]:
# Show the distinct raw values of the two Yes/No columns before encoding them.
for yes_no_column in ('Stage_fear', 'Drained_after_socializing'):
    print(df[yes_no_column].unique())

['No' 'Yes' nan]
['No' nan 'Yes']


In [10]:
# Label -> integer code tables for the two Yes/No columns and the target.
mapping = {'Yes': 1, 'No': 0}
mapping1 = {'Yes': 1, 'No': 0}
mapping2 = {'Extrovert': 1, 'Introvert': 0}


def _encode_labels(column, label_to_code):
    """Return ``column`` with its string labels replaced by integer codes.

    A column that is already numeric is returned unchanged, so re-running this
    cell does not wipe the previously encoded values (``Series.map`` turns
    every value that is not a key of the dict into NaN). Missing values stay
    NaN.

    Args:
        column: pandas Series holding the raw labels (or already-encoded codes).
        label_to_code: dict mapping each expected label to its integer code.

    Returns:
        The encoded Series.

    Raises:
        ValueError: if the column contains a non-null label that is absent
            from ``label_to_code``, instead of silently converting it to NaN.
    """
    if pd.api.types.is_numeric_dtype(column):
        # Already encoded by an earlier run of this cell.
        return column
    unexpected = set(column.dropna().unique()) - set(label_to_code)
    if unexpected:
        raise ValueError(
            f"Unmapped labels in {column.name!r}: {sorted(map(str, unexpected))}"
        )
    return column.map(label_to_code)


df['Stage_fear'] = _encode_labels(df['Stage_fear'], mapping)
df['Drained_after_socializing'] = _encode_labels(df['Drained_after_socializing'], mapping1)
df['Personality'] = _encode_labels(df['Personality'], mapping2)

In [11]:
# Re-check the first five rows now that the label columns hold numeric codes.
df.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,0.0,6.0,4.0,0.0,15.0,5.0,1
1,1,1.0,0.0,7.0,3.0,0.0,10.0,8.0,1
2,2,6.0,1.0,1.0,0.0,,3.0,0.0,0
3,3,3.0,0.0,7.0,3.0,0.0,11.0,5.0,1
4,4,1.0,0.0,4.0,4.0,0.0,13.0,,1


In [12]:
# Print per-column non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  float64
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  float64
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  int64  
dtypes: float64(7), int64(2)
memory usage: 1.3 MB


In [13]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64

In [14]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Feature columns that are imputed and standardised below.
num_cols = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]

# Fill each missing value from the 5 nearest rows and write the result back.
imputer = KNNImputer(n_neighbors=5).fit(df[num_cols])
df[num_cols] = imputer.transform(df[num_cols])

# Standardise the imputed features.
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

# NOTE(review): both transformers are fitted on every row of df, before the
# train/test split made further down, so the hold-out rows influence the
# preprocessing. Consider fitting on the training rows only.

In [15]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
import numpy as np

# Feature columns used by the model.
num_cols = [
    'Time_spent_Alone',
    'Stage_fear',
    'Social_event_attendance',
    'Going_outside',
    'Drained_after_socializing',
    'Friends_circle_size',
    'Post_frequency',
]

# Feature matrix and target vector.
X, y = df[num_cols], df['Personality']

# Hold out 20% of the rows, keeping the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

def objective(trial):
    """Optuna objective: mean 5-fold macro-F1 of a decision tree on the training split.

    Args:
        trial: Optuna trial used to sample the tree's hyperparameters.

    Returns:
        Mean ``f1_macro`` score over the stratified folds (the study maximises it).
    """
    # Sampled in a fixed order; the keys double as DecisionTreeClassifier kwargs.
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy', 'log_loss']),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_depth': trial.suggest_int('max_depth', 3, 150),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 50),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 25),
        'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2']),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        # Cost-complexity pruning strength, sampled on a 0.002 grid.
        'ccp_alpha': trial.suggest_float('ccp_alpha', 0.0, 0.03, step=0.002),
    }
    tree = DecisionTreeClassifier(random_state=42, **params)

    # Same shuffled stratified folds for every trial, so trials are comparable.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(tree, X_train, y_train, cv=folds, scoring='f1_macro')
    return fold_scores.mean()

# Seed the sampler so the hyperparameter search proposes the same trials on
# every run; TPESampler is the sampler create_study would use by default.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100, show_progress_bar=True)  # 100 trials

print("\n🎯 Best Hyperparameters Found:")
print(study.best_params)

# Refit a tree with the best hyperparameters on the whole training split,
# then score it once on the held-out split.
best_params = study.best_params
best_model = DecisionTreeClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

print("\n✅ Model Performance on Test Set:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

[I 2025-12-03 01:49:16,027] A new study created in memory with name: no-name-4c91c707-a584-4827-b777-a61b56accf17


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-12-03 01:49:16,424] Trial 0 finished with value: 0.9606308442756191 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 85, 'min_samples_split': 2, 'min_samples_leaf': 9, 'max_features': None, 'class_weight': None, 'ccp_alpha': 0.026000000000000002}. Best is trial 0 with value: 0.9606308442756191.
[I 2025-12-03 01:49:16,594] Trial 1 finished with value: 0.9609008640741779 and parameters: {'criterion': 'log_loss', 'splitter': 'best', 'max_depth': 6, 'min_samples_split': 37, 'min_samples_leaf': 19, 'max_features': None, 'class_weight': None, 'ccp_alpha': 0.0}. Best is trial 1 with value: 0.9609008640741779.
[I 2025-12-03 01:49:16,748] Trial 2 finished with value: 0.956034391331156 and parameters: {'criterion': 'gini', 'splitter': 'best', 'max_depth': 36, 'min_samples_split': 10, 'min_samples_leaf': 22, 'max_features': 'sqrt', 'class_weight': None, 'ccp_alpha': 0.012}. Best is trial 1 with value: 0.9609008640741779.
[I 2025-12-03 01:49:16,967] Trial 3 finished w

In [16]:
from sklearn.metrics import roc_auc_score
# Scores taken from column 1 of predict_proba, as roc_auc_score expects one score per row.
positive_class_proba = best_model.predict_proba(X_test)[:, 1]
print("ROC-AUC:", roc_auc_score(y_test, positive_class_proba))

ROC-AUC: 0.9607789039748875
