<a href="https://colab.research.google.com/github/Tarunika-R/Personality_Predict_Psychology/blob/main/ML_personality_prediction_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project: Personality Prediction - Psychology

Import necessary libraries

In [None]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder


Exploratory Data Analysis

In [None]:
# Load the tab-separated survey export
data = pd.read_csv('/content/drive/MyDrive/data-final.csv', sep='\t')

# Everything from the 51st column onward is discarded
columns_to_remove = data.columns[50:]

# Questionnaire items that are left out of the feature set
columns_to_drop = [
    'EXT1', 'EXT4', 'EXT7', 'EXT10',
    'EST3', 'EST5', 'EST9', 'EST10',
    'AGR1', 'AGR2', 'AGR6', 'AGR8',
    'CSN3', 'CSN7', 'CSN8', 'CSN10',
    'OPN1', 'OPN8', 'OPN9', 'OPN10',
]

# Apply both removals in one chain instead of mutating the frame in place
data = data.drop(columns=columns_to_remove).drop(columns=columns_to_drop)

# Missing-value count per column before imputation
null_values = data.isna().sum()

# Impute each missing answer with the mean of its own column
column_means = data.mean()
data_filled = data.fillna(column_means)

# Missing-value count per column after imputation
null_values = data_filled.isna().sum()
data_filled.head()

Unnamed: 0,EXT2,EXT3,EXT5,EXT6,EXT8,EXT9,EST1,EST2,EST4,EST6,...,CSN4,CSN5,CSN6,CSN9,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7
0,1.0,5.0,5.0,1.0,2.0,4.0,1.0,4.0,2.0,2.0,...,2.0,2.0,4.0,4.0,1.0,4.0,1.0,4.0,1.0,5.0
1,5.0,3.0,3.0,3.0,5.0,1.0,2.0,3.0,1.0,1.0,...,3.0,3.0,1.0,5.0,2.0,4.0,2.0,3.0,1.0,4.0
2,3.0,4.0,3.0,2.0,3.0,2.0,4.0,4.0,2.0,2.0,...,2.0,3.0,3.0,4.0,1.0,2.0,1.0,4.0,2.0,5.0
3,2.0,2.0,4.0,2.0,4.0,1.0,3.0,3.0,2.0,2.0,...,4.0,1.0,2.0,1.0,2.0,5.0,2.0,3.0,1.0,4.0
4,3.0,3.0,5.0,3.0,5.0,3.0,1.0,5.0,3.0,1.0,...,1.0,3.0,1.0,5.0,1.0,5.0,1.0,5.0,1.0,5.0


Clustering Data using K-means algorithm

In [None]:
# Fit K-means with 5 clusters on the questionnaire items only.
# This cell adds a 'Clusters' column to data_filled further down; excluding it here
# keeps a re-run of the cell from clustering on its own previous output.
cluster_features = data_filled.drop(columns='Clusters', errors='ignore')

# A fixed seed makes the fit repeatable. Cluster numbers are otherwise arbitrary,
# and the notebook later attaches a personality name to each number, so the
# numbering has to stay the same from run to run.
kmeans = KMeans(n_clusters=5, random_state=42)
k_fit = kmeans.fit(cluster_features)

# Limit how many columns are shown when a frame is displayed
pd.options.display.max_columns = 10

# labels_ holds the cluster assigned to each fitted row
predictions = k_fit.labels_
data_filled['Clusters'] = predictions
data_filled.head(10)



Unnamed: 0,EXT2,EXT3,EXT5,EXT6,EXT8,...,OPN4,OPN5,OPN6,OPN7,Clusters
0,1.0,5.0,5.0,1.0,2.0,...,1.0,4.0,1.0,5.0,3
1,5.0,3.0,3.0,3.0,5.0,...,2.0,3.0,1.0,4.0,0
2,3.0,4.0,3.0,2.0,3.0,...,1.0,4.0,2.0,5.0,3
3,2.0,2.0,4.0,2.0,4.0,...,2.0,3.0,1.0,4.0,0
4,3.0,3.0,5.0,3.0,5.0,...,1.0,5.0,1.0,5.0,3
5,3.0,4.0,4.0,2.0,3.0,...,1.0,3.0,1.0,5.0,3
6,3.0,4.0,3.0,3.0,3.0,...,3.0,4.0,1.0,5.0,3
7,1.0,5.0,5.0,2.0,2.0,...,1.0,5.0,1.0,4.0,3
8,2.0,3.0,4.0,2.0,2.0,...,1.0,4.0,1.0,5.0,3
9,5.0,3.0,2.0,3.0,4.0,...,1.0,3.0,3.0,4.0,1


Split Dataset into Train and Test sets

In [None]:
# Separate the questionnaire features from the cluster label.
# The label is removed by name rather than by position ("all but the last column"),
# so the feature set stays correct even if another column is appended to data_filled.
X = data_filled.drop(columns='Clusters')
y = data_filled['Clusters']

# Hold out 20% of the rows for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the resulting datasets
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (812272, 30) (812272,)
Test set shape: (203069, 30) (203069,)


Train model using KNN algorithm

In [None]:
# Train a k-nearest-neighbours classifier to predict the K-means cluster label
# from the questionnaire answers. KNeighborsClassifier and joblib come from the
# import cell at the top of the notebook.
knn = KNeighborsClassifier(n_neighbors=5)  # n_neighbors can be adjusted as needed
knn.fit(X_train, y_train)

# Save the fitted model to disk; the cell displays the list returned by joblib.dump
filename = "model"
joblib.dump(knn, filename)


['model']

In [None]:
# Predict the cluster label for every row of the held-out test set
y_pred = knn.predict(X_test)

# Show the predicted labels (this only prints them; accuracy is computed separately)
print(y_pred)

[3 3 4 ... 3 2 0]


Calculate the accuracy of the trained model

In [None]:
# Share of test rows whose predicted cluster equals the K-means label.
# accuracy_score comes from the import cell at the top of the notebook.
print(accuracy_score(y_test, y_pred))

0.8799373611925011


## Predict personality with user inputs
### Questions to ask user

In [None]:
# Prompt text for every questionnaire item used as a model feature.
# Keys are the dataset column names; values are the statements shown to the user.
# Insertion order is the order in which the questions are asked.

# EXT* items
ext_questions = dict(
    EXT2="I dont talk a lot",
    EXT3="I feel comfortable around people",
    EXT5="I start conversations",
    EXT6="I have little to say",
    EXT8="I dont like to draw attention to myself",
    EXT9="I dont mind being the center of attention",
)

# EST* items
est_questions = dict(
    EST1="I get stressed out easily",
    EST2="I am relaxed most of the time",
    EST4="I seldom feel blue",
    EST6="I get upset easily",
    EST7="I change my mood a lot",
    EST8="I have frequent mood swings",
)

# AGR* items
agr_questions = dict(
    AGR3="I insult people",
    AGR4="I sympathize with others feelings",
    AGR5="I am not interested in other peoples problems",
    AGR7="I am not really interested in others",
    AGR9="I feel others emotions",
    AGR10="I make people feel at ease",
)

# CSN* items
csn_questions = dict(
    CSN1="I am always prepared",
    CSN2="I leave my belongings around",
    CSN4="I make a mess of things",
    CSN5="I get chores done right away",
    CSN6="I often forget to put things back in their proper place",
    CSN9="I follow a schedule",
)

# OPN* items
opn_questions = dict(
    OPN2="I have difficulty understanding abstract ideas",
    OPN3="I have a vivid imagination",
    OPN4="I am not interested in abstract ideas",
    OPN5="I have excellent ideas",
    OPN6="I do not have a good imagination",
    OPN7="I am quick to understand things",
)




In [None]:

# Example of a complete answer set, handy for running the notebook without prompts:
# user_answers = {'EXT2': 5, 'EXT3': 3, 'EXT5': 3, 'EXT6': 3, 'EXT8': 5, 'EXT9': 1,
#                 'EST1': 2, 'EST2': 3, 'EST4': 1, 'EST6': 1, 'EST7': 2, 'EST8': 1,
#                 'AGR3': 1, 'AGR4': 5, 'AGR5': 1, 'AGR7': 3, 'AGR9': 5, 'AGR10': 3,
#                 'CSN1': 3, 'CSN2': 2, 'CSN4': 3, 'CSN5': 3, 'CSN6': 1, 'CSN9': 5,
#                 'OPN2': 2, 'OPN3': 4, 'OPN4': 2, 'OPN5': 3, 'OPN6': 1, 'OPN7': 4}

# Allowed answer range, matching the "from 1 to 5" wording of the prompt
ANSWER_MIN, ANSWER_MAX = 1, 5


def _ask_rating(prompt, low=ANSWER_MIN, high=ANSWER_MAX):
    """Prompt until the user enters a whole number between low and high.

    Args:
        prompt: Text shown to the user by input().
        low: Smallest accepted answer (inclusive).
        high: Largest accepted answer (inclusive).

    Returns:
        The accepted answer as an int.
    """
    while True:
        raw = input(prompt)
        try:
            rating = int(raw)
        except ValueError:
            # A typo should not abort the whole questionnaire
            print(f"'{raw}' is not a whole number, please try again.")
            continue
        if low <= rating <= high:
            return rating
        print(f"Please enter a number from {low} to {high}.")


user_answers = {}
# Ask every question set in turn: EXT, EST, AGR, CSN, then OPN
for question_set in (ext_questions, est_questions, agr_questions, csn_questions, opn_questions):
    for trait, question in question_set.items():
        print(f"{trait} Trait:")
        user_answers[trait] = _ask_rating(f"{question} (Enter your answer from 1 to 5): ")

# Create a one-row DataFrame using the user's answers
user_data = pd.DataFrame(user_answers, index=[0])

print(user_answers)

EXT2 Trait:
I dont talk a lot (Enter your answer from 1 to 5): 2
EXT3 Trait:
I feel comfortable around people (Enter your answer from 1 to 5): 4
EXT5 Trait:
I start conversations (Enter your answer from 1 to 5): 1
EXT6 Trait:
I have little to say (Enter your answer from 1 to 5): 3
EXT8 Trait:
I dont like to draw attention to myself (Enter your answer from 1 to 5): 5
EXT9 Trait:
I dont mind being the center of attention (Enter your answer from 1 to 5): 1
EST1 Trait:
I get stressed out easily (Enter your answer from 1 to 5): 2
EST2 Trait:
I am relaxed most of the time (Enter your answer from 1 to 5): 4
EST4 Trait:
I seldom feel blue (Enter your answer from 1 to 5): 3
EST6 Trait:
I get upset easily (Enter your answer from 1 to 5): 5
EST7 Trait:
I change my mood a lot (Enter your answer from 1 to 5): 1
EST8 Trait:
I have frequent mood swings (Enter your answer from 1 to 5): 3
AGR3 Trait:
I insult people (Enter your answer from 1 to 5): 4
AGR4 Trait:
I sympathize with others feelings (Enter

Based on answers classify the user's personality

In [None]:
# Column names, in order, that the classifier was trained on
feature_columns = X_train.columns

# One-row frame holding the user's answers
user_data = pd.DataFrame(user_answers, index=[0])

# Align the answers to the training columns. Unlike plain column selection,
# reindex inserts NaN for an item that has no answer instead of raising KeyError,
# which is what allows the mean-fill on the next step to do anything.
user_data_reordered = user_data.reindex(columns=feature_columns)

# Replace any unanswered item with that item's mean in the survey data
user_data_filled = user_data_reordered.fillna(data.mean())

# Predict the personality cluster for the user
predicted_cluster = knn.predict(user_data_filled)
print("Predicted Personality Cluster:", predicted_cluster)


Predicted Personality Cluster: [1]


In [None]:
# Human-readable name for each cluster number; a name's position in the list
# is the cluster number it belongs to.
# NOTE(review): these names only make sense for the specific clustering they were
# chosen for - confirm they still match whenever the clustering is refitted.
personality_names = dict(enumerate([
    "Introverted and Reserved",
    "Friendly and Outgoing",
    "Emotionally Stable",
    "Sympathetic and Caring",
    "Organized and Disciplined",
]))

# Classify the user's answers
predicted_cluster = knn.predict(user_data_filled)

# The user is the only row, so their cluster is the first element of the result
user_cluster = predicted_cluster[0]
predicted_personality = personality_names[user_cluster]

print("Predicted Personality Cluster:", predicted_cluster)
print("Predicted Personality:", predicted_personality)


Predicted Personality Cluster: [1]
Predicted Personality: Friendly and Outgoing


In [None]:
# Class-membership estimates for the single user row. The entries follow the
# order of knn.classes_, which is not guaranteed to be 0, 1, 2, ... by position.
cluster_probabilities = knn.predict_proba(user_data_filled)[0]

# predicted_cluster is a one-element array; take the scalar label so the
# comparison below is a plain value-to-value check
user_cluster = predicted_cluster[0]

# Report every cluster other than the predicted one, as a percentage
for cluster_num, similarity in zip(knn.classes_, cluster_probabilities):
    if cluster_num != user_cluster:
        personality_name = personality_names[int(cluster_num)]
        percentage_similarity = similarity * 100
        print(f"Similarity with {personality_name}: {percentage_similarity:.2f}%")



Similarity with Introverted and Reserved: 20.00%
Similarity with Emotionally Stable: 0.00%
Similarity with Sympathetic and Caring: 0.00%
Similarity with Organized and Disciplined: 40.00%
