<a href="https://colab.research.google.com/github/Omshree-16/Personality-Prediction-Model/blob/main/Personality_Prediction_Model_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Downloading datasets


In [2]:
pip install opendatasets



In [3]:
import opendatasets as od
# Fetch the MBTI dataset into ./mbti-type. Per the recorded output, the call is
# skipped when the files already exist (pass force=True to re-download).
od.download('https://www.kaggle.com/datasnaek/mbti-type')

Skipping, found downloaded files in "./mbti-type" (use force=True to force download)


In [None]:
# Fetch the Meta Kaggle dataset; the recorded output shows that this prompts for
# Kaggle credentials. Of its files, the visible cells only read ForumMessages.csv.
od.download('https://www.kaggle.com/kaggle/meta-kaggle')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: omshree16
Your Kaggle Key: ··········
Dataset URL: https://www.kaggle.com/datasets/kaggle/meta-kaggle


### Importing all required libraries


In [None]:
import re
from time import time
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.stem.snowball import SnowballStemmer

In [None]:
import os
# Sanity check: list the contents of both download directories so a missing
# or misnamed file is noticed before pd.read_csv is called.
print('mbti-type : ',os.listdir('mbti-type'))
print('meta-kaggle : ',os.listdir('meta-kaggle'))

In [None]:
# Labelled MBTI posts; later cells read its 'type' and 'posts' columns.
train_data = pd.read_csv('mbti-type/mbti_1.csv')
# Kaggle forum messages; later cells read its 'PostUserId' and 'Message' columns.
forum_data = pd.read_csv('meta-kaggle/ForumMessages.csv')
# Lookup from each single-letter MBTI indicator to its full name; used at the
# end of the notebook to spell out the predicted type codes.
mbti = {'I':'Introversion', 'E':'Extroversion', 'N':'Intuition',
        'S':'Sensing', 'T':'Thinking', 'F': 'Feeling',
        'J':'Judging', 'P': 'Perceiving'}

In [None]:
# Rich preview of the first rows of the training frame.
train_data.head()

### Let's view all datasets

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table; print() would fall back to truncated plain text.
train_data.head(10)

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table; print() would fall back to truncated plain text.
forum_data.head(10)

### Let's view some info about our training dataset

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# Number of training rows per personality type.
type_count = train_data['type'].value_counts()
colors = sns.color_palette("pastel")

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(
    x=type_count.index,
    y=type_count.values,
    alpha=0.8,
    palette=colors,
    ax=ax,
)
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('Types', fontsize=12)
plt.show()

### Handle missing values

In [None]:
## ForumMessages.csv: null count per column, before any filling
print('Forum Missing Values:')
print(forum_data.isnull().sum())

## mbti_1.csv: null count per column
print('Training Missing Values:')
print(train_data.isnull().sum())

# Replace missing messages with empty strings so the ' '.join below only
# receives strings (assumes the non-missing values are already str).
forum_data['Message'] = forum_data['Message'].fillna('')

# Re-check the forum frame after filling; the same label is printed a second time.
print('Forum Missing Values:')
print(forum_data.isnull().sum())

# Number of messages per posting user.
print(forum_data['PostUserId'].value_counts())

# Collapse to one row per user by joining all of that user's messages with single spaces.
forum_data_agg = forum_data.groupby('PostUserId')['Message'].agg(lambda col: ' '.join(col)).reset_index()
# After the group-by, each PostUserId is expected to appear exactly once.
print(forum_data_agg['PostUserId'].value_counts())

### Cleaning data

In [None]:
def clean_text(text):
    """Return ``text`` stripped of markup, links, punctuation and digits.

    Steps, in order:
      1. Parse with BeautifulSoup (lxml) and keep only the visible text.
      2. Replace ``|||`` separators, then ``http...`` links, with two spaces.
         Separators go first so a link cannot swallow the text after ``|||``.
      3. Replace full stops with two spaces; delete all other ASCII punctuation.
      4. Delete every character for which ``str.isdigit()`` is true.
    """
    stripped = BeautifulSoup(text, "lxml").text

    # separators first, links second -- the order matters (see docstring)
    for pattern in (r'\|\|\|', r'http\S+'):
        stripped = re.sub(pattern, r'  ', stripped)

    # full stops become blanks; remaining punctuation is removed outright
    stripped = stripped.replace('.', '  ')
    stripped = stripped.translate(str.maketrans('', '', string.punctuation))

    # drop digits
    return ''.join(filter(lambda ch: not ch.isdigit(), stripped))

# Store the cleaned text next to the raw 'posts' column.
train_data['clean_posts'] = train_data['posts'].map(clean_text)
# Spot-check the row labelled 1.
train_data.loc[1, 'clean_posts']

In [None]:
# Same cleaning for the per-user forum text.
forum_data_agg['clean_messages'] = forum_data_agg['Message'].map(clean_text)
# Spot-check the row labelled 1.
forum_data_agg.loc[1, 'clean_messages']

In [None]:
def split_uppercase(text):
    """Insert a space before each ASCII capital inside mixed-case words.

    ``text`` is split on whitespace. A token for which ``str.isupper()`` is
    true (e.g. an acronym) is kept as-is; any other token gets a space put in
    front of each ``A-Z`` letter. Tokens are re-joined with single spaces, so a
    token that starts with a capital yields a doubled (or leading) space.
    """
    pieces = [
        token if token.isupper() else re.sub(r'([A-Z])', r' \1', token)
        for token in text.split()
    ]
    return ' '.join(pieces)

# Break camelCase-style tokens apart in the forum text (column is overwritten in place).
forum_data_agg['clean_messages'] = forum_data_agg['clean_messages'].map(split_uppercase)
# Spot-check the row labelled 1.
forum_data_agg.loc[1, 'clean_messages']

In [None]:
def stem_text(text, stemmer=None):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to use. When omitted, a single English ``SnowballStemmer`` is
        built on first use and reused on every later call, instead of being
        rebuilt for each document as before.

    Returns
    -------
    str
        The stemmed words joined by single spaces ('' for empty input).
    """
    if stemmer is None:
        # Lazily create the default stemmer once and cache it on the function
        # object, so `.apply(stem_text)` over a whole column builds it once.
        stemmer = getattr(stem_text, '_stemmer', None)
        if stemmer is None:
            stemmer = stem_text._stemmer = SnowballStemmer('english')
    return ' '.join(stemmer.stem(word) for word in text.split())
# Stem the cleaned training posts (column is overwritten in place).
train_data['clean_posts'] = train_data['clean_posts'].map(stem_text)
# Spot-check the row labelled 1.
train_data.loc[1, 'clean_posts']

In [None]:
# Stem the cleaned forum text (column is overwritten in place).
forum_data_agg['clean_messages'] = forum_data_agg['clean_messages'].map(stem_text)
# Spot-check the row labelled 1.
forum_data_agg.loc[1, 'clean_messages']

# **Classification (Classifier Model)**

### Importing libraries

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD

# Shared cross-validation splitter: 5 stratified, shuffled folds with a fixed seed.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Metrics passed to cross_validate; each key K appears in the results as 'test_K'.
# NOTE(review): for single-label multiclass targets, micro-averaged F1 should
# equal accuracy, so 'acc' and 'f1_micro' are likely redundant -- confirm.
scoring = {'acc': 'accuracy',
           'neg_log_loss': 'neg_log_loss',
           'f1_micro': 'f1_micro'}

## ExtraTreesClassifier with SVD (singular value decomposition)

In [None]:
# Pipeline: TF-IDF unigrams -> 10-component truncated SVD -> shallow extra-trees ensemble.
# Both stochastic steps get an explicit random_state. Seeding numpy's global RNG
# in this process is not enough, because cross_validate runs the folds with
# n_jobs=-1 and worker processes do not share the parent's global RNG state.
etc = ExtraTreesClassifier(n_estimators=20, max_depth=4, n_jobs=-1, random_state=1)
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
tsvd = TruncatedSVD(n_components=10, random_state=1)
model = Pipeline([('tfidf1', tfidf), ('tsvd1', tsvd), ('etc', etc)])

# Same splitter configuration as the shared one; redefined so this cell is self-contained.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Kept for any code that still draws from numpy's global RNG.
np.random.seed(1)

results = cross_validate(model, train_data['clean_posts'], train_data['type'], cv=kfolds,
                         scoring=scoring, n_jobs=-1)

In [None]:
# (format string, per-fold scores) for each reported metric; the log-loss
# scores are negated because the scorer reports the negative log loss.
report = (
    ("CV Accuracy: {:0.4f} (+/- {:0.4f})", results['test_acc']),
    ("CV F1: {:0.4f} (+/- {:0.4f})", results['test_f1_micro']),
    ("CV Logloss: {:0.4f} (+/- {:0.4f})", -1*results['test_neg_log_loss']),
)
for template, scores in report:
    # mean across folds, followed by the standard deviation across folds
    print(template.format(np.mean(scores), np.std(scores)))

### Naive Bayes

In [None]:
# Seed numpy's global RNG before building the model.
np.random.seed(1)

# NOTE: despite the name, `tfidf2` is a CountVectorizer (raw term counts, not
# TF-IDF): unigrams, English stop words removed, lower-cased, limited to
# 5000 features.
tfidf2 = CountVectorizer(ngram_range=(1, 1),
                         stop_words='english',
                         lowercase = True,
                         max_features = 5000)

# The step is named 'tfidf1' although it holds the count vectorizer above.
model_nb = Pipeline([('tfidf1', tfidf2), ('nb', MultinomialNB())])

# Cross-validate on the cleaned posts with the shared `kfolds` splitter and `scoring` metrics.
results_nb = cross_validate(model_nb, train_data['clean_posts'], train_data['type'], cv=kfolds,
                          scoring=scoring, n_jobs=-1)

In [None]:
# (format string, per-fold scores) for each reported metric; the log-loss
# scores are negated because the scorer reports the negative log loss.
report = (
    ("CV Accuracy: {:0.4f} (+/- {:0.4f})", results_nb['test_acc']),
    ("CV F1: {:0.4f} (+/- {:0.4f})", results_nb['test_f1_micro']),
    ("CV Logloss: {:0.4f} (+/- {:0.4f})", -1*results_nb['test_neg_log_loss']),
)
for template, scores in report:
    # mean across folds, followed by the standard deviation across folds
    print(template.format(np.mean(scores), np.std(scores)))

### Logistic Regression

In [None]:
# Seed numpy's global RNG before building the model.
np.random.seed(1)

# NOTE(review): LogisticRegression is already imported in the classification
# imports cell above; this repeated import is redundant.
from sklearn.linear_model import LogisticRegression

# NOTE: despite the name, `tfidf2` is a CountVectorizer (raw term counts), configured
# exactly like the one in the Naive Bayes cell.
tfidf2 = CountVectorizer(ngram_range=(1, 1), stop_words='english', lowercase = True, max_features = 5000)

# Class-weight-balanced logistic regression with C=0.005; the step name 'tfidf1'
# again refers to the count vectorizer.
model_lr = Pipeline([('tfidf1', tfidf2), ('lr', LogisticRegression(class_weight="balanced", C=0.005))])

# Cross-validate on the cleaned posts with the shared `kfolds` splitter and `scoring` metrics.
results_lr = cross_validate(model_lr, train_data['clean_posts'], train_data['type'], cv=kfolds,
                          scoring=scoring, n_jobs=-1)

In [None]:
# (format string, per-fold scores) for each reported metric; the log-loss
# scores are negated because the scorer reports the negative log loss.
report = (
    ("CV Accuracy: {:0.4f} (+/- {:0.4f})", results_lr['test_acc']),
    ("CV F1: {:0.4f} (+/- {:0.4f})", results_lr['test_f1_micro']),
    ("CV Logloss: {:0.4f} (+/- {:0.4f})", -1*results_lr['test_neg_log_loss']),
)
for template, scores in report:
    # mean across folds, followed by the standard deviation across folds
    print(template.format(np.mean(scores), np.std(scores)))

# **Visualization**
Since our last model (`Logistic Regression`) gives high accuracy, we will apply it to all users' comments.

Let's see which user personalities are the most common.

In [None]:
# Fit the logistic-regression pipeline on the full labelled MBTI set, then
# predict a type for every aggregated forum user.
# NOTE(review): the forum text was passed through split_uppercase before
# stemming, but the training text was not -- confirm that this difference in
# preprocessing is intended.
model_lr.fit(train_data['clean_posts'], train_data['type'])
pred_all = model_lr.predict(forum_data_agg['clean_messages'])

In [None]:
# Distinct predicted types and how many users received each one.
cnt_all = np.unique(pred_all, return_counts=True)
personalities, counts = cnt_all

# One row per predicted type, most frequent first (built as a chain, no inplace sort).
pred_df = (
    pd.DataFrame({'personality': personalities, 'count': counts})
    .sort_values('count', ascending=False)
)

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(12, 4))
sns.barplot(x=pred_df['personality'], y=pred_df['count'], alpha=0.8, ax=ax)
ax.set_ylabel('Number of Occurrences', fontsize=12)
ax.set_xlabel('Personality', fontsize=12)
plt.show()

In [None]:
# Fraction of all predictions that fall into each type.
pred_df['percent'] = pred_df['count'].div(pred_df['count'].sum())
# Spell out each type code letter by letter through the `mbti` lookup.
pred_df['description'] = pred_df['personality'].apply(
    lambda code: ' '.join(mbti[letter] for letter in code)
)
pred_df

In [None]:
import plotly.graph_objs as go
import plotly.offline as py

# Pie chart of the predicted type distribution: one slice per spelled-out
# type, sized by its percentage share.
fig = go.Figure(
    data=[go.Pie(labels=pred_df['description'], values=pred_df['percent']*100)],
    layout=go.Layout(title='Kaggle Personality Distribution'),
)
py.iplot(fig)