In [2]:
%matplotlib inline
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
from bs4 import BeautifulSoup
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
py.init_notebook_mode(connected = True)

In [4]:
train = pd.read_csv("mbti_1.csv")
us = pd.read_csv('Users.csv')
ps = pd.read_csv('ForumMessages.csv')
mbti = {'I':"Introversion","E":"Extroversion", "N":"Intutions","S":"Sensing","T":"Thinking","F":"Feeling","J":"Judging","P":"Perceiving"}
print(train.shape)

(8675, 2)


In [5]:
print(us)

                Id      UserName        DisplayName RegisterDate   
0                1    kaggleteam        Kaggle Team   03/24/2011  \
1              368  antgoldbloom  Anthony Goldbloom   01/20/2010   
2              381        iguyon           Isabelle   01/29/2010   
3              383  davidstephan      David Stephan   02/01/2010   
4              384    gabewarren        Gabe Warren   02/02/2010   
...            ...           ...                ...          ...   
15374507  17262686      okejonah          Oke Jonah   10/17/2023   
15374508  17262688       fuewill           fue.will   10/17/2023   
15374509  17262689  maxwellevans       Maxwellevans   10/17/2023   
15374510  17262690     dwiyantra          Dwiyantra   10/17/2023   
15374511  17262691   qwerty10245        qwerty10245   10/17/2023   

          PerformanceTier  
0                       5  
1                       2  
2                       2  
3                       0  
4                       0  
...            

In [44]:
# cnt_srs = train['type'].value_counts()
# cnt_srs = [cnt_srs.index, cnt_srs.values]
# plt.figure(figsize=(12,4))
# sns.barplot(cnt_srs.index, cnt_srs.values, alpha=0.8)
# sns.barplot(train['type'])
# plt.ylabel('number of occurences', fontsize=13)
# plt.xlabel('types',fontsize=13)
# plt.show()
# # print(cnt_srs)


In [6]:
ps['Message'] = ps['Message'].fillna('')
ps_join = ps.groupby('PostUserId')['Message'].agg(lambda col: ' '.join(col)).reset_index()
print(ps_join)


        PostUserId                                            Message
0               62  tes David, just a quick note to say thanks for...
1              368  Hi Tanya, <br><br>Kaggle will maintain a ratin...
2              381  <p>Hi Sergei,</p>\r\n<p>Compiled Matlab p-code...
3              387  <p>From an economic perspective let's look at ...
4              389  <p>There's still one more confusion.. what doe...
...            ...                                                ...
337795    17254990                            <p>Hi everyone!! :)</p>
337796    17255109                     <p>Its very good so useful</p>
337797    17256383  <p>Excellent EDA work with wide range of chart...
337798    17257536                        <p>be patient good luck</p>
337799    17258439                               <p>Good content </p>

[337800 rows x 2 columns]


In [8]:
etc = ExtraTreesClassifier(n_estimators = 20, max_depth = 4, n_jobs = -1)
tfidfi = TfidfVectorizer(ngram_range = (1,1), stop_words='english')
tsvd = TruncatedSVD(n_components = 10)
model = Pipeline([('tifidfi', tfidfi),('tsvd1', tsvd),('etc',etc)])

In [9]:
kfolds = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

np.random.seed(1)
scoring = {'acc': 'accuracy',
           'neg_log_loss':'neg_log_loss',
           'f1_micro':'f1_micro'}
results = cross_validate(model, train['posts'], train['type'], cv=kfolds, scoring = scoring, n_jobs=-1)

In [10]:
print("CV Accuracy: {:0.4f}(+/- {:0.4f})".format(np.mean(results['test_acc']),np.std(results['test_acc'])))
print("CV F1: {:0.4f}(+/- {:0.4f})".format(np.mean(results['test_f1_micro']),np.std(results['test_f1_micro'])))
print("CV Logloss: {:0.4f}(+/- {:0.4f})".format(np.mean(results['test_neg_log_loss']),np.std(results['test_neg_log_loss'])))

CV Accuracy: 0.2901(+/- 0.0126)
CV F1: 0.2901(+/- 0.0126)
CV Logloss: -2.1503(+/- 0.0074)


In [11]:
def cleanText(text):
  text = BeautifulSoup(text,'lxml').text
  text = re.sub(r'\|\|\|',r'',text)
  text = re.sub(r'http\S+',r'<URL>',text)
  return text

In [12]:
train['clean_posts'] = train['posts'].apply(cleanText)
print(train)


The input looks more like a filename than markup. You may want to open this file and pass the filehandle into Beautiful Soup.



      type                                              posts   
0     INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...  \
1     ENTP  'I'm finding the lack of me in these posts ver...   
2     INTP  'Good one  _____   https://www.youtube.com/wat...   
3     INTJ  'Dear INTP,   I enjoyed our conversation the o...   
4     ENTJ  'You're fired.|||That's another silly misconce...   
...    ...                                                ...   
8670  ISFP  'https://www.youtube.com/watch?v=t8edHB_h908||...   
8671  ENFP  'So...if this thread already exists someplace ...   
8672  INTP  'So many questions when i do these things.  I ...   
8673  INFP  'I am very conflicted right now when it comes ...   
8674  INFP  'It has been too long since I have been on per...   

                                            clean_posts  
0     '<URL> and intj moments  <URL>  sportscenter n...  
1     'I'm finding the lack of me in these posts ver...  
2     'Good one  _____   <URL> course, to whi

In [13]:
np.random.seed(1)

tfidf2 = CountVectorizer(ngram_range=(1,1), stop_words='english',lowercase=True,max_features=5000)

model_np = Pipeline([('tfidf1',tfidf2), ("nb", MultinomialNB())])

results_nb = cross_validate(model_np, train['clean_posts'], train['type'], cv=kfolds, scoring=scoring, n_jobs=-1)



In [14]:
print("CV Accuracy: {:0.4f}(+/- {:0.4f})".format(np.mean(results_nb['test_acc']),np.std(results_nb['test_acc'])))
print("CV F1: {:0.4f}(+/- {:0.4f})".format(np.mean(results_nb['test_f1_micro']),np.std(results_nb['test_f1_micro'])))
print("CV Logloss: {:0.4f}(+/- {:0.4f})".format(np.mean(-1*results_nb['test_neg_log_loss']),np.std(results_nb['test_neg_log_loss'])))

CV Accuracy: 0.5627(+/- 0.0110)
CV F1: 0.5627(+/- 0.0110)
CV Logloss: 6.2055(+/- 0.3766)


In [15]:
np.random.seed(1)

tfidf2 = CountVectorizer(ngram_range=(1,1), stop_words='english',lowercase=True,max_features=5000)

model_lr = Pipeline([('tfidf1',tfidf2), ("lr", LogisticRegression(class_weight="balanced",C=0.005))])

results_lr = cross_validate(model_lr, train['clean_posts'], train['type'], cv=kfolds, scoring=scoring, n_jobs=-1)



In [16]:
print("CV Accuracy: {:0.4f}(+/- {:0.4f})".format(np.mean(results_lr['test_acc']),np.std(results_lr['test_acc'])))
print("CV F1: {:0.4f}(+/- {:0.4f})".format(np.mean(results_lr['test_f1_micro']),np.std(results_lr['test_f1_micro'])))
print("CV Logloss: {:0.4f}(+/- {:0.4f})".format(np.mean(-1*results_lr['test_neg_log_loss']),np.std(results_lr['test_neg_log_loss'])))

CV Accuracy: 0.6561(+/- 0.0138)
CV F1: 0.6561(+/- 0.0138)
CV Logloss: 1.3072(+/- 0.0131)


In [20]:
ps_join["clean_comments"] = ps_join["Message"].apply(cleanText)
model_lr.fit(train['clean_posts'], train['type'])
pred_all = model_lr.predict(ps_join['clean_comments'])
# pred_all = model_lr.predict(["good video and most of all nice"])
# print(ps_join['clean_comments'])
# print(pred_all)

['ISFP']


In [18]:
cnt_all = np.unique(pred_all, return_counts=True)
pred_df = pd.DataFrame({'personality': cnt_all[0],'count': cnt_all[1]}, columns=['personality', 'count'], index=None)
pred_df.sort_values('count', ascending=False, inplace=True)
print(pred_df)
# plt.figure(figsize=(12,6))
# sns.barplot(pred_df['personality'], pred_df['count'], alpha = 0.8)
# plt.ylabel('Numbeer of Occurences', fontsize=12)
# plt.xlabel('Personality', fontsize=12)
# plt.show()

   personality   count
5         ESFP  182404
10        INTJ   80880
3         ENTP   18528
14        ISTJ   14493
2         ENTJ   14123
11        INTP    6002
12        ISFJ    5744
13        ISFP    5652
15        ISTP    4627
4         ESFJ    1487
0         ENFJ    1363
1         ENFP     911
6         ESTJ     623
9         INFP     379
8         INFJ     311
7         ESTP     273


In [19]:
pred_df['percent'] = pred_df['count']/pred_df['count'].sum()
pred_df['description'] = pred_df['personality'].apply(lambda x: ' '.join([mbti[l] for l in list(x)]))
print(pred_df)

# labels = pred_df['description']
# sizes = pred_df['percent'] * 100

# trace = go.Pie(labels = labels, values= sizes)
# layout = go.Layout( title = "kaggle user personality distribution")

# data = [trace]
# fig = go.Figure(data = data, layout = layout)

# py.iplot(fig)

   personality   count   percent                                 description
5         ESFP  182404  0.539976     Extroversion Sensing Feeling Perceiving
10        INTJ   80880  0.239432     Introversion Intutions Thinking Judging
3         ENTP   18528  0.054849  Extroversion Intutions Thinking Perceiving
14        ISTJ   14493  0.042904       Introversion Sensing Thinking Judging
2         ENTJ   14123  0.041809     Extroversion Intutions Thinking Judging
11        INTP    6002  0.017768  Introversion Intutions Thinking Perceiving
12        ISFJ    5744  0.017004        Introversion Sensing Feeling Judging
13        ISFP    5652  0.016732     Introversion Sensing Feeling Perceiving
15        ISTP    4627  0.013697    Introversion Sensing Thinking Perceiving
4         ESFJ    1487  0.004402        Extroversion Sensing Feeling Judging
0         ENFJ    1363  0.004035      Extroversion Intutions Feeling Judging
1         ENFP     911  0.002697   Extroversion Intutions Feeling Perceiving

In [21]:
from joblib import dump

In [22]:
dump(model_lr,'./personality_predictor.joblib')

['./personality_predictor.joblib']