## Project Overview

This notebook builds a classifier to predict personality (Introvert/Extrovert) from behavioral traits.
I will load the dataset, preprocess it, perform exploratory analysis, train a machine learning model,
and save it for integration into a Streamlit application.

In [None]:
# importing all the necessary libraries
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

import joblib

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

## Data loading and viewing


In [None]:
# Load the personality dataset into a pandas DataFrame.
#
# The default location is written as a raw string: in a normal string literal the
# backslashes of this Windows path form invalid escape sequences (\d, \p, \i),
# which newer Python versions warn about. The resulting path text is unchanged.
#
# The PERSONALITY_DATASET_PATH environment variable can override the location so
# the notebook also runs on machines that do not have this exact folder layout;
# when it is unset, the original path is used.
DEFAULT_DATASET_PATH = r'D:\data_cleaning_using_python\personality_bot\dataset\intro_vs_extro_behave_dataset\personality_dataset.csv'
dataset_path = os.environ.get('PERSONALITY_DATASET_PATH', DEFAULT_DATASET_PATH)

df = pd.read_csv(dataset_path)
# displaying the first 5 rows of the dataset
print("First 5 rows of the dataset:")
print(df.head(5))

In [None]:
# Summarize the dataset: schema, descriptive statistics, column names and size.
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" after the report.
df.info()
print("\nDataset Description:")
print(df.describe())
# display the columns of the dataset and the number of columns
print("\nDataset Columns:")
print(df.columns)
print("\nNumber of Columns in the dataset:")
print(len(df.columns))
print(len(df) , "rows in the dataset")

In [None]:
# Count the missing values in every column
print("\nChecking for null values:")
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Show every row that has at least one missing value, and how many such rows exist
has_missing = df.isna().any(axis=1)
rows_with_missing = df[has_missing]
print("\nRows with null values:")
print(rows_with_missing)
print(len(rows_with_missing), "rows with null values found.")


In [None]:
# Per-column tally of missing values
null_counts = df.isna().sum()
print(null_counts)

## Data preprocessing

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

# Confirm that no column has missing values left
print("\nAfter dropping null values, the dataset now has:")
print(df.isna().sum())


In [None]:
# List the distinct values found in each column
print("\nUnique values in each column:")
for column_name, column_values in df.items():
    distinct = column_values.unique()
    print(f"{column_name}: {len(distinct)} unique values")
    print(distinct)  # every distinct value is printed, not just a sample
    print()  # blank line between columns for readability

In [None]:
# Encode the two Yes/No columns as integers
yes_no_cols = ['Stage_fear','Drained_after_socializing']
label_encoder = LabelEncoder()
for binary_col in yes_no_cols:
    # The same encoder object is refit on each column, so afterwards it only
    # remembers the classes of the last one.
    # Presumably 'No' -> 0 and 'Yes' -> 1 (LabelEncoder codes follow sorted label
    # order) -- confirm against the unique values printed above.
    encoded_values = label_encoder.fit_transform(df[binary_col])
    df[binary_col] = encoded_values

df.head(5)


In [None]:
# Numeric code for each personality label used as the model target
personality_map = {
    "Introvert": 0,
    "Extrovert": 1
}

# Map personality types to numerical values.
# The mapping runs only while the column still holds the text labels: Series.map
# turns every value that is not a key of the dict into NaN, so re-running this
# cell on the already-encoded 0/1 column would wipe out the whole target.
if not pd.api.types.is_numeric_dtype(df['Personality']):
    encoded_personality = df['Personality'].map(personality_map)
    # Fail loudly on labels missing from the map instead of silently training on NaN
    unmapped_labels = df.loc[encoded_personality.isna(), 'Personality'].unique()
    if len(unmapped_labels) > 0:
        raise ValueError(
            f"Unexpected Personality labels (not in personality_map): {list(unmapped_labels)}"
        )
    df['Personality'] = encoded_personality

df.head(5)

## Data Visualization

In [None]:
# Bar chart of how many rows fall into each personality class
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Personality', data=df, palette='viridis', ax=ax)
ax.set_title('Personality Distribution')
ax.set_xlabel('Personality Type')
ax.set_ylabel('Count')
# Replace the encoded 0/1 tick labels with the class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Introvert', 'Extrovert'])
plt.show()

In [None]:
# Histograms for the continuous features.
#
# The Yes/No flags and the target were encoded to integers earlier, so a plain
# numeric-dtype selection also picks them up. They are excluded here so that the
# per-feature plots never chart 'Personality' against itself and the binary flags
# are not treated as continuous measurements.
non_continuous = set(yes_no_cols) | {'Personality'}
continuous_features = [
    col_name
    for col_name in df.select_dtypes(include=[np.number]).columns
    if col_name not in non_continuous
]
for feature in continuous_features:
    plt.figure(figsize=(10, 6))
    sns.histplot(df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Histogram of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# One boxplot per feature, split by personality class
for feature in continuous_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Personality', y=feature, data=df, palette='viridis', ax=ax)
    ax.set_title(f'Boxplot of {feature} by Personality Type')
    ax.set_xlabel('Personality Type')
    ax.set_ylabel(feature)
    # Show class names instead of the encoded 0/1 values
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Introvert', 'Extrovert'])
    plt.show()

In [None]:
# Violin plot of each feature, split by personality class
for feature in continuous_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.violinplot(x='Personality', y=feature, data=df, palette='viridis', ax=ax)
    ax.set(
        title=f'Violin Plot of {feature} by Personality Type',
        xlabel='Personality Type',
        ylabel=feature,
    )
    # Show class names instead of the encoded 0/1 values
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Introvert', 'Extrovert'])
    plt.show()

In [None]:
# Pairplot of the features in `continuous_features`, coloured by personality type.
# sns.pairplot builds a grid of subplots on its own figure; plt.title would only
# title a single subplot (the current axes), so the heading is set on the whole
# figure instead. y=1.02 lifts it slightly above the top row of plots.
pair_grid = sns.pairplot(df, hue='Personality', vars=continuous_features, palette='viridis')
pair_grid.fig.suptitle('Pairplot of Continuous Features by Personality Type', y=1.02)
plt.show()


In [None]:
# Heatmap of the pairwise correlations between all columns of df
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,      # write the coefficient inside each cell
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Jittered strip plot of each feature, split by personality class
for feature in continuous_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.stripplot(x='Personality', y=feature, data=df, palette='viridis', jitter=True, ax=ax)
    ax.set_title(f'Strip Plot of {feature} by Personality Type')
    ax.set_xlabel('Personality Type')
    ax.set_ylabel(feature)
    # Show class names instead of the encoded 0/1 values
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Introvert', 'Extrovert'])
    plt.show()

## Model training

#### With RandomForestClassifier

In [None]:
# Prepare the data and the model for training

# Target is the 'Personality' column; features are every other column
y = df['Personality']
X = df.drop(columns='Personality')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with a fixed seed; it is fitted in the next cell
rfc_model = RandomForestClassifier(random_state=42)


In [None]:
# Train the random forest and predict the held-out rows
rfc_model.fit(X_train, y_train)
y_pred_rfc = rfc_model.predict(X_test)

# Accuracy on the test set
accuracy = accuracy_score(y_test, y_pred_rfc)
print(f"Random Forest Classifier Accuracy: {accuracy:.2f}")

# Confusion matrix drawn as an annotated heatmap (rows: actual, columns: predicted)
conf_matrix = confusion_matrix(y_test, y_pred_rfc)
class_names = ['Introvert', 'Extrovert']
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Model Evaluation: text confusion matrix and per-class precision/recall/F1 for
# the random forest on the test set.
# The report is built from the predictions computed in this cell; previously
# `y_pred` was computed and then ignored in favour of `y_pred_rfc` left over from
# the training cell.
y_pred = rfc_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print('')
print(classification_report(y_test, y_pred))

In [None]:
# Cross-validation of the random forest over the full X / y with cv=5,
# giving one score per fold plus their mean.
cross_val_scores = cross_val_score(rfc_model, X, y, cv=5)
print(f"Cross-validation scores: {cross_val_scores}")
print(f"Mean cross-validation score: {np.mean(cross_val_scores):.2f}")


#### With XGBoost Model

In [None]:
# XGBoost classifier evaluated with log-loss.
# `use_label_encoder` is no longer passed: it is deprecated in current XGBoost
# releases and only produces an "unused parameter" warning there. The target is
# already encoded as 0/1 integers, so no label encoding is needed.
# NOTE(review): on XGBoost older than 1.6, omitting it may emit a deprecation
# warning instead -- confirm the installed version.
# random_state is pinned to match the seed used for the random forest.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
# Fitting the model
xgb_model.fit(X_train, y_train)
# Making predictions on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Evaluating the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Classifier Accuracy: {accuracy_xgb:.2f}")
# Displaying the confusion matrix (rows: actual, columns: predicted)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_xgb, annot=True, fmt='d', cmap='Blues', xticklabels=['Introvert', 'Extrovert'], yticklabels=['Introvert', 'Extrovert'])
plt.title('Confusion Matrix for XGBoost')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Text summary of the XGBoost test-set performance
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)
print(xgb_confusion)
print('')
print(xgb_report)


In [None]:
# Cross-validation of the XGBoost model over the full X / y with cv=5,
# giving one score per fold plus their mean.
cross_val_scores_xgb = cross_val_score(xgb_model, X, y, cv=5)
print(f"Cross-validation scores for XGBoost: {cross_val_scores_xgb}")
print(f"Mean cross-validation score for XGBoost: {np.mean(cross_val_scores_xgb):.2f}")


In [None]:
# Two hand-written feature rows; the values must follow the column order of X
sample_data = [9.0,1,0.0,0.0,1,0.0,0.0]
sample_data2 = [1.0,0,6.0,7.0,0,14.0,9.0]

# Wrap the first sample in a one-row DataFrame that carries the training column names
sample_df = pd.DataFrame([sample_data], columns=X.columns)

# Ask both trained models for a prediction
predicted_personality_rfc = rfc_model.predict(sample_df)
predicted_personality_xgb = xgb_model.predict(sample_df)

# Class 0 is reported as Introvert, anything else as Extrovert
rfc_label = 'Introvert' if predicted_personality_rfc[0] == 0 else 'Extrovert'
xgb_label = 'Introvert' if predicted_personality_xgb[0] == 0 else 'Extrovert'
print(f"Predicted Personality (Random Forest): {rfc_label}")
print(f"Predicted Personality (XGBoost): {xgb_label}")


In [None]:
# Second sample (sample_data2 is defined in the previous cell), as a one-row DataFrame
sample_df2 = pd.DataFrame([sample_data2], columns=X.columns)

# Predictions from both trained models
predicted_personality_rfc2 = rfc_model.predict(sample_df2)
predicted_personality_xgb2 = xgb_model.predict(sample_df2)

# Class 0 is reported as Introvert, anything else as Extrovert
rfc_label2 = 'Introvert' if predicted_personality_rfc2[0] == 0 else 'Extrovert'
xgb_label2 = 'Introvert' if predicted_personality_xgb2[0] == 0 else 'Extrovert'
print(f"Predicted Personality for second sample (Random Forest): {rfc_label2}")
print(f"Predicted Personality for second sample (XGBoost): {xgb_label2}")


## Saving the Model


In [None]:
# Persist both trained models to disk with joblib (files land in the working directory)
rfc_model_file = 'random_forest_model.pkl'
xgb_model_file = 'xgboost_model.pkl'
joblib.dump(rfc_model, rfc_model_file)
joblib.dump(xgb_model, xgb_model_file)