In [29]:
import pandas as pd
import numpy as np
import pickle
import re

In [30]:
df_essays = pd.read_csv('data/training/essays.csv', encoding='cp1252', delimiter=',', quotechar='"')

# for every essay, we replace the personalitiy categories 
# of the essay wich are "y" and "n" with "1" and "0" 
for e in df_essays.columns[2:7]:
    df_essays[e] = df_essays[e].replace('n', '0')
    df_essays[e] = df_essays[e].replace('y', '1')
    # not sure if we need this line: furter investigation possible:
    df_essays[e] = pd.to_numeric(df_essays[e])

df_essays = df_essays[["TEXT", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]]
df_essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [31]:
df_kaggle = pd.read_csv('data/training/mbti_1.csv',  skiprows=0 )
df_kaggle

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [32]:
def mbti_to_big5(mbti):
    mbti = mbti.lower()
    cEXT, cNEU, cAGR, cCON, cOPN = 0,np.NaN,0,0,0
    
    ## IN MBTI, extrovert or introvert
    ## correlates with Extroversion
    if mbti[0] == "i":
        cEXT = 0
    elif mbti[0] == "e":
        cEXT = 1
        
    ## IN MBTI, Feeler or Thinker
    ## correlates with Agrreableness
    if mbti[2] == "t":
        cAGR = 0
    elif mbti[2] == "f":
        cAGR = 1

    ## IN MBTI, Judger or Perceiver
    ## correlates with Conscientiousness
    if mbti[3] == "p":
        cCON = 0
    elif mbti[3] == "j":
        cCON = 1
        
    ## IN MBTI, Intuition or Sensing 
    ## correlates with Openness
    if mbti[1] == "n":
        cOPN = 1
    elif mbti[1] == "s":
        cOPN = 0   
        
    return cEXT, cNEU, cAGR, cCON, cOPN

In [33]:
# simply put every row of our read dataframe into a list of 
# the object "Essay"
# remove data from list substract
def create_essays(df, subtract=None):
    essays = []
    for index, row in df.iterrows():
        essays.append(essay.Essay(row.TEXT, row.cEXT, row.cNEU, row.cAGR, row.cCON, row.cOPN))  

    # remove scentences which do not contain emotionally charged words 
    # from the emotional lexicon
    if subtract != None:
        for x in essays:
            x.filtered_text = remove_unemotional_scentences(emotional_words, x.clean_text)

    return essays

def remove_unemotional_scentences(emotional_words, text_as_one_string):
    reduced_s = ""
    scentences = re.split('(?<=[.!?]) +', text_as_one_string)
    for s in scentences:
        if any(e in s for e in emotional_words):
            reduced_s = reduced_s + s + " "
        else:
            pass
    return reduced_s

In [34]:
mbti_to_big5(df_kaggle['type'][5])

(0, nan, 0, 1, 1)

In [35]:
df_kaggle["cEXT"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[0], 1)
df_kaggle["cNEU"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[1], 1)
df_kaggle["cAGR"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[2], 1)
df_kaggle["cCON"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[3], 1)
df_kaggle["cOPN"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[4], 1)

In [36]:
df_kaggle

Unnamed: 0,type,posts,cEXT,cNEU,cAGR,cCON,cOPN
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0,,1,1,1
1,ENTP,'I'm finding the lack of me in these posts ver...,1,,0,0,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,0,,0,0,1
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",0,,0,1,1
4,ENTJ,'You're fired.|||That's another silly misconce...,1,,0,1,1
...,...,...,...,...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,0,,1,0,0
8671,ENFP,'So...if this thread already exists someplace ...,1,,1,0,1
8672,INTP,'So many questions when i do these things. I ...,0,,0,0,1
8673,INFP,'I am very conflicted right now when it comes ...,0,,1,0,1


In [37]:
df_kaggle = df_kaggle[["posts", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]]
df_kaggle.columns = ["TEXT", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]
# remove som fancy ||| things
df_kaggle["TEXT"] = df_kaggle.apply(lambda x: x.TEXT.replace("|||", " ")[:], 1)

df_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kaggle["TEXT"] = df_kaggle.apply(lambda x: x.TEXT.replace("|||", " ")[:], 1)


Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,'http://www.youtube.com/watch?v=qsXHcwe3krw ht...,0,,1,1,1
1,'I'm finding the lack of me in these posts ver...,1,,0,0,1
2,'Good one _____ https://www.youtube.com/wat...,0,,0,0,1
3,"'Dear INTP, I enjoyed our conversation the o...",0,,0,1,1
4,'You're fired. That's another silly misconcept...,1,,0,1,1


In [38]:
## Load Emotional Lexicon to substract from data

# also from "Emotional_Lexicon.csv" we read in the data, which is a list of words and 
# has several categories of emotions. 
# anger - anticipation - disgust - fear - joy - negative - positive 
# - sadness - surprise - trust - Charged
df_lexicon = pd.read_csv('data/training/Emotion_Lexicon.csv', index_col=0)


# some of the words have no emotional category, 
# so let's remove them as they have no use to us.
# can be improved by not even loading them when all columns are 0. maybe later.
df_lexicon = df_lexicon[(df_lexicon.T != 0).any()]
emotional_words = df_lexicon.index.tolist()

### Create data frame MBTI AND BIG5

In [39]:
# concatinate the dataframes:
frames  = [df_essays, df_kaggle]
essays_kaggle = pd.concat(frames, sort=False)
essays_kaggle.reset_index(drop=True)

# preprocess data by converting into OBJECT essay and save with pickle and removing non emotional scentences
#essays_kaggle = create_essays(essays_kaggle, emotional_words)
#pickle.dump(essays_kaggle, open("essays/essays11142.p", "wb"))
print("saved entries: ", len(essays_kaggle))

saved entries:  11142


In [40]:
essays_kaggle

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1.0,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0.0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1.0,0,1,1
3,I can't believe it! It's really happening! M...,1,0.0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0.0,1,0,1
...,...,...,...,...,...,...
8670,'https://www.youtube.com/watch?v=t8edHB_h908 I...,0,,1,0,0
8671,'So...if this thread already exists someplace ...,1,,1,0,1
8672,'So many questions when i do these things. I ...,0,,0,0,1
8673,'I am very conflicted right now when it comes ...,0,,1,0,1


In [41]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from wordcloud import WordCloud

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

stop_words = stopwords.words("english")

def preprocess(text):
    corpus = []
    
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>2 and token not in stop_words:
            corpus.append(token)
    return corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Piyush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Piyush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [42]:
essays_kaggle['clean'] = essays_kaggle['TEXT'].apply(preprocess)
essays_kaggle['clean_text'] = essays_kaggle['clean'].apply(lambda x:" ".join(x))
essays_kaggle.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,clean,clean_text
0,"Well, right now I just woke up from a mid-day ...",0,1.0,1,0,1,"[right, woke, mid, day, nap, sort, weird, move...",right woke mid day nap sort weird moved texas ...
1,"Well, here we go with the stream of consciousn...",0,0.0,1,0,0,"[stream, consciousness, essay, things, like, h...",stream consciousness essay things like high sc...
2,An open keyboard and buttons to push. The thin...,0,1.0,0,1,1,"[open, keyboard, buttons, push, thing, finally...",open keyboard buttons push thing finally worke...
3,I can't believe it! It's really happening! M...,1,0.0,1,1,0,"[believe, happening, pulse, racing, like, mad,...",believe happening pulse racing like mad like f...
4,"Well, here I go with the good old stream of co...",1,0.0,1,0,1,"[good, old, stream, consciousness, assignment,...",good old stream consciousness assignment feel ...


In [43]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(essays_kaggle, test_size=0.20, random_state=42)

In [44]:
train_x = training.clean_text

train_y_cEXT = training['cEXT']
train_y_cNEU = training['cNEU']
train_y_cAGR = training['cAGR']
train_y_cCON = training['cCON']
train_y_cOPN = training['cOPN']


test_x = test.clean_text

test_y_cEXT = test['cEXT']
test_y_cNEU = test['cNEU']
test_y_cAGR = test['cAGR']
test_y_cCON = test['cCON']
test_y_cOPN = test['cOPN']

In [45]:
## BAG OF WORDS

from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer()

# create vectors from our words
train_x_vectors = bow_vectorizer.fit_transform(train_x)
test_x_vectors = bow_vectorizer.transform(test_x)
# # now that's a big thing :-O

In [46]:
## TFIDF VECTORIZER

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
train_x_vectors_tf = cv.fit_transform(train_x)
test_x_vectors_tf = cv.transform(test_x)

In [47]:
# for evaluation save some data for later:
evaluation = []
evaluation_tf = []
data = len(essays_kaggle)
vec_name = "MBTI"

## Hyperparameter Tuning

In [50]:
from sklearn.svm import SVC

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[100,1000], 'gamma':[0.001,0.0001]}

In [None]:
grid = GridSearchCV(SVC(), param_grid, verbose=2)

In [None]:
grid.fit(train_x_vectors, train_y_cEXT)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=100, gamma=0.001 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............................... C=100, gamma=0.001, total= 3.2min
[CV] C=100, gamma=0.001 ..............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.2min remaining:    0.0s


[CV] ............................... C=100, gamma=0.001, total= 3.3min
[CV] C=100, gamma=0.001 ..............................................
[CV] ............................... C=100, gamma=0.001, total= 3.3min
[CV] C=100, gamma=0.001 ..............................................
[CV] ............................... C=100, gamma=0.001, total= 3.2min
[CV] C=100, gamma=0.001 ..............................................
[CV] ............................... C=100, gamma=0.001, total= 3.3min
[CV] C=100, gamma=0.0001 .............................................
[CV] .............................. C=100, gamma=0.0001, total= 2.7min
[CV] C=100, gamma=0.0001 .............................................
[CV] .............................. C=100, gamma=0.0001, total= 2.7min
[CV] C=100, gamma=0.0001 .............................................
[CV] .............................. C=100, gamma=0.0001, total= 3.6min
[CV] C=100, gamma=0.0001 .............................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 62.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [100, 1000], 'gamma': [0.001, 0.0001]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [30]:
grid.best_params_

{'C': 1000, 'gamma': 0.001}

In [31]:
grid.best_estimator_

SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
grid.score(test_x_vectors, test_y_cEXT)

0.7886944818304172

In [25]:
## SVM

from sklearn import svm
name = "svm"

print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(C=1000, gamma=0.001)
clf_svm_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT))

try:
    print("training Neuroticism cNEU using SVM...")
    clf_svm_cNEU = svm.SVC(kernel='linear', C=1000, gamma=0.001)
    clf_svm_cNEU.fit(train_x_vectors, train_y_cNEU)
    evaluation.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU)])
    print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU))
except:
    print("with this data not available (MBTI only 4 dimensions)")

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(C=1000, gamma=0.001)
clf_svm_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(C=1000, gamma=0.001)
clf_svm_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(C=1000, gamma=0.001)
clf_svm_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.7900403768506057
training Neuroticism cNEU using SVM...
with this data not available (MBTI only 4 dimensions)
training Agreeableness cAGR using using SVM...
cAGR score:  0.7581875280394796
training Conscientiousness cCON using SVM...
cCON score:  0.7222969941677883
training Openness to Experience cOPN using SVM...
cOPN score:  0.8115746971736204


In [None]:
evaluation

In [56]:
### SVM - TFIDF

print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(C=1000, gamma=0.001)
clf_svm_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT))

try:
    print("training Neuroticism cNEU using SVM...")
    clf_svm_cNEU = svm.SVC(kernel='linear')
    clf_svm_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
    evaluation_tf.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
    print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU))
except:
    print("with this data not available (MBTI only 4 dimensions)")

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(C=1000, gamma=0.001)
clf_svm_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(C=1000, gamma=0.001)
clf_svm_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(C=1000, gamma=0.001)
clf_svm_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.7747868999551368
training Neuroticism cNEU using SVM...
with this data not available (MBTI only 4 dimensions)
training Agreeableness cAGR using using SVM...
cAGR score:  0.7662628981606101
training Conscientiousness cCON using SVM...
cCON score:  0.7146702557200538
training Openness to Experience cOPN using SVM...
cOPN score:  0.8129205921938089


In [48]:
from sklearn.model_selection import RandomizedSearchCV
random_grid = {'C':[100,1000], 'gamma':[0.001,0.0001]}

In [52]:
random_cv = RandomizedSearchCV(SVC(), random_grid, verbose=2)

In [53]:
random_cv.fit(train_x_vectors_tf, train_y_cEXT)



Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END .................................C=100, gamma=0.001; total time= 2.6min
[CV] END .................................C=100, gamma=0.001; total time= 2.7min
[CV] END .................................C=100, gamma=0.001; total time= 2.3min
[CV] END .................................C=100, gamma=0.001; total time= 2.6min
[CV] END .................................C=100, gamma=0.001; total time= 2.6min
[CV] END ................................C=100, gamma=0.0001; total time= 2.7min
[CV] END ................................C=100, gamma=0.0001; total time= 2.6min
[CV] END ................................C=100, gamma=0.0001; total time= 2.7min
[CV] END ................................C=100, gamma=0.0001; total time= 2.7min
[CV] END ................................C=100, gamma=0.0001; total time= 2.7min
[CV] END ................................C=1000, gamma=0.001; total time= 3.3min
[CV] END ................................C=1000, 

RandomizedSearchCV(estimator=SVC(),
                   param_distributions={'C': [100, 1000],
                                        'gamma': [0.001, 0.0001]},
                   verbose=2)

In [54]:
random_cv.best_params_

{'gamma': 0.001, 'C': 1000}

In [55]:
random_cv.best_estimator_

SVC(C=1000, gamma=0.001)