In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.stem.porter import PorterStemmer

In [4]:
#importing all necessary modules
import pandas as pd
import numpy as np
import re as regex
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sidharthaverma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
#storing it in a dataframe
dataf = pd.read_csv('mbti_1.csv')
#checking whether the data frame loads or not
dataf.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [6]:
#Preprocessing the data to store it in a optimized way in DB

#Copy the original data-set
datafcopy = dataf.copy();

#Lowercasing
datafcopy["posts"] = datafcopy["posts"].apply(lambda dataflowercase: dataflowercase.lower())

#Remove links 
datafcopy["posts"] = datafcopy["posts"].apply(lambda LinksRemoval: regex.sub(r'https?:\/\/.*?[\s+]', '', LinksRemoval))
    

#Remove stop-words
stopWords = nltk.corpus.stopwords.words('english')
stopWords.extend(['enfj', 'enfp', 'entj', 'entp', 'esfj', 'esfp', 'estj', 'estp', 'infj', 'infp', 'intj', 'intp', 'isfj', 'isfp', 'istj', 'istp'])
regexPatStopWords = r'\b(?:{})\b'.format('|'.join(stopWords))
datafcopy["posts"] = datafcopy["posts"].str.replace(regexPatStopWords, '')


#Remove non words
datafcopy["posts"] = datafcopy["posts"].apply(lambda nonWordsRemoval: regex.sub(r'[^a-zA-Z\s]','', nonWordsRemoval))

(datafcopy.head())

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

  datafcopy["posts"] = datafcopy["posts"].str.replace(regexPatStopWords, '')


Unnamed: 0,type,posts
0,INFJ,moments sportscenter top ten plays pran...
1,ENTP,finding lack posts alarmingsex boring...
2,INTP,good one course say know blessing ...
3,INTJ,dear enjoyed conversation day esoteric...
4,ENTJ,fired another silly misconception approachin...


In [7]:
# Stemming
def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

datafcopy['posts'] = get_stemmed_text(datafcopy['posts'])
(datafcopy.head())

Unnamed: 0,type,posts
0,INFJ,moment sportscent top ten play prank lifechang...
1,ENTP,find lack post alarmingsex bore posit often ex...
2,INTP,good one cours say know bless curs absolut pos...
3,INTJ,dear enjoy convers day esoter gab natur univer...
4,ENTJ,fire anoth silli misconcept approach logic go ...


In [8]:
#Exporting Data
datafcopy.to_csv(index=False)
compression_output = dict(method='zip', archive_name='out.csv')  
datafcopy.to_csv('out.zip', index=False, compression=compression_output) 

In [29]:
# Converting MBTI personality (or target or Y feature) into numerical form using Label Encoding
# encoding personality type
#Define target and train to split training and testing data sets
enc = LabelEncoder()
datafcopy['type of encoding'] = enc.fit_transform(datafcopy['type'])
target = datafcopy['type of encoding']

In [9]:
# Vectorizing the posts for the model and filtering Stop-words
vect = CountVectorizer(stop_words='english') 

#Converting posts (or training or X feature) into numerical form by count vectorization
train =  vect.fit_transform(datafcopy["posts"])
print(train.shape)

(8675, 170083)


In [10]:
#Dividing the model into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.20, stratify=target, random_state=42)
print ((X_train.shape),(y_train.shape),(X_test.shape),(y_test.shape))

(6940, 170083) (6940,) (1735, 170083) (1735,)


In [40]:
#Using Gradient Descent Algorithm
sgd = SGDClassifier(max_iter=10, tol=None)
sgd.fit(X_train, y_train)

Y_pred = sgd.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuraciesSGD = {}
accuracySGD = accuracy_score(y_test, predictions)
accuraciesSGD['Gradient Descent'] = accuracySGD * 100.0
print("Accuracy: %.2f%%" % (accuracySGD * 100.0))

Accuracy: 41.79%


In [12]:
#Random Forest
accuraciesRF = {}
random_forest = RandomForestClassifier(n_estimators=100, random_state = 1)
random_forest.fit(X_train, y_train)

# make predictions for test data
Y_pred = random_forest.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuracyRF = accuracy_score(y_test, predictions)
accuraciesRF['Random Forest'] = accuracyRF * 100.0 
print("Accuracy: %.2f%%" % (accuracyRF * 100.0))

Accuracy: 31.99%


In [27]:
#Linear Regression
accuraciesLR = {}
lr = LinearRegression()
random_forest.fit(X_train, y_train)

# make predictions for test data
Y_pred = random_forest.predict(X_test)
predictions = [round(value) for value in Y_pred]

# evaluate predictions
accuracyLR = accuracy_score(y_test, predictions)
accuraciesLR['Linear Regression'] = accuracyLR * 100.0 
print("Accuracy: %.2f%%" % (accuracyLR * 100.0))

Accuracy: 31.99%


In [30]:
#Prediction of personality type

cntizer = CountVectorizer(analyzer="word", max_features=1000) 
X_trainExample =  cntizer.fit_transform(datafcopy['posts'])
tfizer = TfidfTransformer()
X_tfidf =  tfizer.fit_transform(X_trainExample).toarray()
print(X_tfidf)

X = X_tfidf
Y = datafcopy['type of encoding']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, stratify=target, random_state=42)


sgd = SGDClassifier(max_iter=10, tol=None)
sgd.fit(X_train, y_train)


my_post_text = "Open minded and introverted"
my_post = pd.DataFrame([[my_post_text]])
my_post = [str (item) for item in my_post]

my_X_cnt = cntizer.transform(my_post)
my_X_tfidf =  tfizer.transform(my_X_cnt).toarray()

Y_pred = sgd.predict(my_X_tfidf)

print(Y_pred[0])

result = "Default"    
if (Y_pred[0]) == 0:
  result = 'Personality type: ENFJ'
elif (Y_pred[0]) == 1:
  result = 'Personality type: ENFP'
elif (Y_pred[0]) == 2:
  result = 'Personality type: ENTJ'
elif (Y_pred[0]) == 3:
  result = 'Personality type: ENTP'
elif (Y_pred[0]) == 4:
  result = 'Personality type: ESFJ'
elif (Y_pred[0]) == 5:
  result = 'Personality type: ESFP'
elif (Y_pred[0]) == 6:
  result = 'Personality type: ESTJ'
elif (Y_pred[0]) == 7:
  result = 'Personality type: ESTP'
elif (Y_pred[0]) == 8:
  result = 'Personality type: INFJ'
elif (Y_pred[0]) == 9:
  result = 'Personality type: INFP'
elif (Y_pred[0]) == 10:
  result = 'Personality type: INTJ'
elif (Y_pred[0]) == 11:
  result = 'Personality type: INTP'
elif (Y_pred[0]) == 12:
  result = 'Personality type: ISFJ'
elif (Y_pred[0]) == 13:
  result = 'Personality type: ISFP'
elif (Y_pred[0]) == 14:
  result = 'Personality type: ISTJ'
elif (Y_pred[0]) == 15:
  result = 'Personality type: ISTP'
else:
  result = 'UNKNOWN'

print(result)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.037542   0.         ... 0.         0.         0.        ]
 [0.1355488  0.04902924 0.11270733 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.05489534 0.         ... 0.         0.         0.        ]
 [0.         0.03220009 0.         ... 0.04340406 0.         0.        ]]
11
Personality type: INTP
