In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score

In [3]:
!kaggle competitions download -c playground-series-s5e7

playground-series-s5e7.zip: Skipping, found more recently modified local copy (use --force to force download)


In [4]:
df_train = pd.read_csv('../Data/train.csv')
df_train.head()

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert


In [49]:
df_misnoneTrain = df_train.fillna("no value")
df_misnoneTrain

In [5]:
df_cleanedTrain = df_train.dropna(axis=0)
df_cleanedTrain

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
7,7,2.0,No,8.0,3.0,No,4.0,5.0,Extrovert
9,9,1.0,No,8.0,6.0,No,14.0,9.0,Extrovert
...,...,...,...,...,...,...,...,...,...
18509,18509,1.0,No,3.0,4.0,No,15.0,4.0,Extrovert
18511,18511,0.0,No,4.0,5.0,No,11.0,4.0,Extrovert
18514,18514,6.0,No,5.0,3.0,No,10.0,4.0,Extrovert
18519,18519,3.0,No,7.0,3.0,No,9.0,7.0,Extrovert


In [16]:
X_encoded = pd.get_dummies(df_cleanedTrain.drop('Personality', axis=1), drop_first=True)

y = df_cleanedTrain['Personality']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9661432777232581


In [28]:
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0


In [29]:
def objective(trial):
    param = {
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'max_depth': trial.suggest_int('max_depth', 3, 10),           
    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
    'subsample': trial.suggest_float('subsample', 0.6, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    'gamma': trial.suggest_float('gamma', 0, 5),
    'reg_alpha': trial.suggest_float('reg_alpha', 0, 3),         
    'reg_lambda': trial.suggest_float('reg_lambda', 0, 3),
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'random_state': 42
}

    model = XGBClassifier(**param)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    return scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  

print("Best Params:", study.best_params)
print("Best CV Score:", study.best_value)

[I 2025-07-04 13:18:36,735] A new study created in memory with name: no-name-21bd89d5-069c-4e9d-9db0-1401f83030f5
[I 2025-07-04 13:18:39,713] Trial 0 finished with value: 0.9623364791821045 and parameters: {'n_estimators': 736, 'max_depth': 7, 'learning_rate': 0.27516260801045667, 'subsample': 0.7462933155271944, 'colsample_bytree': 0.9551244698695116, 'gamma': 0.8463799103057074, 'reg_alpha': 2.9267925221330997, 'reg_lambda': 0.23127097039662203}. Best is trial 0 with value: 0.9623364791821045.
[I 2025-07-04 13:18:42,045] Trial 1 finished with value: 0.9636862476631822 and parameters: {'n_estimators': 572, 'max_depth': 8, 'learning_rate': 0.10428623824891824, 'subsample': 0.8767291285288485, 'colsample_bytree': 0.8831925326750574, 'gamma': 1.1910860676813262, 'reg_alpha': 2.9502023921064278, 'reg_lambda': 2.171888456435834}. Best is trial 1 with value: 0.9636862476631822.
[I 2025-07-04 13:18:44,659] Trial 2 finished with value: 0.9633180742741289 and parameters: {'n_estimators': 778, 

Best Params: {'n_estimators': 299, 'max_depth': 5, 'learning_rate': 0.15841262137302178, 'subsample': 0.8519152889164038, 'colsample_bytree': 0.6808885474211932, 'gamma': 2.0070959113867732, 'reg_alpha': 1.2522715414146957, 'reg_lambda': 2.5895571241033593}
Best CV Score: 0.9639316464361883


In [46]:
final_model = XGBClassifier(
    n_estimators=299,
    max_depth=5,
    learning_rate=0.1584,
    subsample=0.8519,
    colsample_bytree=0.6809,
    gamma=2.007,
    reg_alpha=1.252,
    reg_lambda=2.589,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

final_model.fit(X_scaled, y)
y_pred_log = final_model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

rmse = mean_squared_error(y_test_actual, y_pred, squared=False)
r2_train = final_model.score(X_train, y_train)
r2_test = r2_score(y_test_actual, y_pred)

print("RMSE:", rmse)
print("R2 Train:", r2_train)

RMSE: 0.30210824789781193
R2 Train: 0.9640534903692799




In [38]:
import joblib
joblib.dump(final_model, 'final_model.pkl')

['final_model.pkl']

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import joblib

df = df_train.copy()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'Personality' in categorical_cols:
    categorical_cols.remove('Personality')
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

le = LabelEncoder()
y_encoded = le.fit_transform(df['Personality'])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=299,
        max_depth=5,
        learning_rate=0.1584,
        subsample=0.8519,
        colsample_bytree=0.6809,
        gamma=2.007,
        reg_alpha=1.252,
        reg_lambda=2.589,
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42
    ))
])

pipeline.fit(df.drop('Personality', axis=1), y_encoded)

joblib.dump({'pipeline': pipeline, 'label_encoder': le}, 'final_model_pipeline.pkl')

['final_model_pipeline.pkl']

In [62]:
df_submission2 = pd.read_csv('../Data/test.csv')

import joblib

model_bundle = joblib.load('final_model_pipeline.pkl')
pipeline = model_bundle['pipeline']
le = model_bundle['label_encoder']

preds_numeric = pipeline.predict(df_submission2)
preds_label = le.inverse_transform(preds_numeric)

submission = pd.DataFrame({
    'id': df_submission2['id'],
    'Personality': preds_label
})

submission.to_csv('submission2.csv', index=False)
submission.head()

Unnamed: 0,id,Personality
0,18524,Extrovert
1,18525,Introvert
2,18526,Extrovert
3,18527,Extrovert
4,18528,Introvert
