<a href="https://colab.research.google.com/github/Severus-Reddy/Personality-Classification-using-MBTI/blob/main/Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score
import pickle
import os.path
import plotly.offline as pyo
import plotly.graph_objs as go
import spacy
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Load one of the availables trained pipelines for English
# English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.
nlp = spacy.load('en_core_web_sm')


# stop words built in spacy (english)
print(nlp.Defaults.stop_words)

{'meanwhile', 'except', 'nine', 'becomes', 'five', 'but', 'keep', 'which', 'yourselves', 'sixty', 'sometimes', "'re", 'from', 'our', 'thereupon', 'neither', 'been', "'ll", 'so', 'here', "'d", 'even', 'nobody', 'ten', "'s", 'many', 'with', 'amount', 'out', 'moreover', 'latter', 'are', 'across', 'very', 'once', 'there', '’ve', 'ever', 'became', 'already', 'nothing', 'see', 'being', 'off', 'her', 'move', 'really', 'twelve', 'until', 'nevertheless', 'those', 'under', 'fifteen', 'throughout', 'to', 're', 'before', 'during', 'is', 'we', 'always', 'anyone', 'than', 'first', 'who', 'can', 'each', 'own', 'none', 'besides', 'towards', 'about', 'through', 'hers', 'might', 'now', 'their', 'front', 'do', 'four', 'over', 'after', 'whose', 'every', 'mostly', 'against', 'its', 'enough', 'least', 'all', 'indeed', 'beside', 'along', 'somewhere', 'namely', 'whereupon', 'him', 'rather', 'among', 'above', 'further', 'cannot', 'ca', 'six', 'empty', 'together', 'thereby', 'much', 'seeming', 'other', '’ll', '

In [7]:
print(f"Number of default stop words : {len(nlp.Defaults.stop_words)}")

Number of default stop words : 326


In [8]:
# Checking if a word is a stop word
nlp.vocab['is'].is_stop

True

In [9]:
nlp.vocab['below'].is_stop

True

In [10]:
nlp.vocab['btw'].is_stop

False

In [11]:
s_stemmer = SnowballStemmer(language='english')

In [12]:
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly', 'fairness','boats','boating']

In [13]:
for word in words:
    print(word+ ' ------> ' + s_stemmer.stem(word))

run ------> run
runner ------> runner
ran ------> ran
runs ------> run
easily ------> easili
fairly ------> fair
fairness ------> fair
boats ------> boat
boating ------> boat


In [14]:
# Function to display lemmas
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [15]:
# Function to display lemmas
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')

In [16]:
doc = nlp(u"I am meeting him tomorrow at the meeting.")
show_lemmas(doc)

I            PRON   4690420944186131903    I
am           AUX    10382539506755952630   be
meeting      VERB   6880656908171229526    meet
him          PRON   1655312771067108281    he
tomorrow     NOUN   3573583789758258062    tomorrow
at           ADP    11667289587015813222   at
the          DET    7425985699627899538    the
meeting      NOUN   14798207169164081740   meeting
.            PUNCT  12646065887601541794   .


In [17]:
count_vect = CountVectorizer()

In [18]:
phrase = ["I'd like to have a glass of water please"]

In [19]:
# Fit Vectorizer to the Data (build a vocab, count the number of words...)
# Learn a vocabulary dictionary of all tokens in the raw documents
count_vect.fit(phrase)

CountVectorizer()

In [20]:
# Show features
count_vect.get_feature_names()


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



['glass', 'have', 'like', 'of', 'please', 'to', 'water']

In [21]:
# Learn the vocabulary dictionary and return document-term matrix
count_vect.fit_transform(phrase)

<1x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [22]:
# shows a mapping of terms to feature indices.
count_vect.vocabulary_

{'like': 2, 'to': 5, 'have': 1, 'glass': 0, 'of': 3, 'water': 6, 'please': 4}

In [23]:
df = pd.read_csv('MBTI 500.csv')

In [24]:
df.head()

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [25]:
df['posts'][0]

'know intj tool use interaction people excuse antisocial truly enlighten mastermind know would count pet peeze something time matter people either whether group people mall never see best friend sit outside conversation jsut listen want interject sit formulate say wait inject argument thought find fascinate sit watch people talk people fascinate sit class watch different people find intrigue dad intj u stand look like line safeway watch people home talk people like think military job people voluntarily go job important show deference endanger live glorify way civilian think pretty ignorant general think military necessary defense mechanism political tactic feel like u specifically invest much money could put money education whatnot though personally sound budget aernative really comment one way base two politician eye year ago come name somewhat important kinda role model nowadays pick keep score individual level mean little vary accord number condition day may score high others low sw

In [26]:
df['type'][0]

'INTJ'

In [27]:
df['type'].unique()

array(['INTJ', 'INTP', 'ISFJ', 'ISFP', 'ISTJ', 'ISTP', 'ENFJ', 'ENFP',
       'ENTJ', 'ENTP', 'ESFJ', 'ESFP', 'ESTJ', 'ESTP', 'INFJ', nan],
      dtype=object)

In [28]:
print(f"Total of {len(df['type'].unique())} types of classified MBTI posts")

Total of 16 types of classified MBTI posts


In [29]:
df.isnull().sum()

posts    0
type     1
dtype: int64

In [30]:
df_bar_chart=df.groupby('type').count()


trace1 = go.Bar(x=df_bar_chart.index, y=df_bar_chart['posts'])

data = [trace1]
layout = go.Layout(title='MBTI # Classified Posts per Type')

fig = go.Figure(data=data, layout=layout)

fig.show()

In [31]:
# Flag to re-create or not the machine learning model
recreate_model=False

In [32]:
# We'll save the model into a file:
filename = 'mbti_svm_v2.sav'

In [33]:
# If the model file doesn't exists
if not os.path.isfile(filename):
    recreate_model=True

In [44]:
X = df['posts'] # features
y = df['type']  # labels
X_train, X_test, y_train, y_test = train_test_split(X.astype(str), y.astype(str), test_size=0.2, random_state=42)

In [45]:
# Check if need to recreate the model
if recreate_model:    
    
    # Creating an instance to vectorizer:
    vectorizer = TfidfVectorizer()
    
    # Training the vectorizer:
    X_train_tfidf = vectorizer.fit_transform(X_train.astype(str))
    
    # Training the classifier:
    clf = LinearSVC()
    clf.fit(X_train_tfidf, y_train)
    
    # Pipelining the vectorizer and the classifier
    text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
    text_clf.fit(X_train, y_train)
    
    # saving the model to disk
    pickle.dump(text_clf, open(filename, 'wb'))

# If there is no need to recreate the model, just open the file from the disk    
else:
    # loading the model from disk
    text_clf = pickle.load(open(filename, 'rb'))

In [46]:
predictions = text_clf.predict(X_test)

In [47]:
print(classification_report(y_test, predictions))


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

        ENFJ       0.83      0.69      0.76       312
        ENFP       0.85      0.84      0.85      1232
        ENTJ       0.89      0.75      0.82       569
        ENTP       0.87      0.85      0.86      2359
        ESFJ       0.82      0.30      0.44        46
        ESFP       0.83      0.50      0.62        80
        ESTJ       0.96      0.83      0.89        86
        ESTP       0.95      0.92      0.94       370
        INFJ       0.85      0.86      0.85      1859
        INTJ       0.84      0.88      0.86      4443
        INTP       0.86      0.90      0.88      5079
        ISFJ       0.80      0.58      0.68       130
        ISFP       0.77      0.68      0.72       163
        ISTJ       0.85      0.66      0.75       259
        ISTP       0.88      0.81      0.85       653
         nan       0.00      0.00      0.00         1

    accuracy                           0.86     17641
   macro avg       0.80   


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [48]:
print(f"Overall accuracy of the model: {round(metrics.accuracy_score(y_test, predictions),2)}")

Overall accuracy of the model: 0.86
