In [1]:
import pandas as pd
import essay
import numpy as np
import pickle
import re

In [2]:
df_essays = pd.read_csv('data/training/essays.csv', encoding='cp1252', delimiter=',', quotechar='"')

# for every essay, we replace the personalitiy categories 
# of the essay wich are "y" and "n" with "1" and "0" 
for e in df_essays.columns[2:7]:
    df_essays[e] = df_essays[e].replace('n', '0')
    df_essays[e] = df_essays[e].replace('y', '1')
    # not sure if we need this line: furter investigation possible:
    df_essays[e] = pd.to_numeric(df_essays[e])

df_essays = df_essays[["TEXT", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]]
df_essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,I can't believe it! It's really happening! M...,1,0,1,1,0
4,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [3]:
df_kaggle = pd.read_csv('data/training/mbti_1.csv',  skiprows=0 )
df_kaggle

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [4]:
def mbti_to_big5(mbti):
    # check https://en.wikipedia.org/wiki/Myers%E2%80%93Briggs_Type_Indicator
    # in mbti (myers briggs) ther is invrovert vs. extrovert
    # which corellates with Extroversion in BIG FIVE
    mbti = mbti.lower()
    cEXT, cNEU, cAGR, cCON, cOPN = 0,np.NaN,0,0,0
    if mbti[0] == "i":
        cEXT = 0
    elif mbti[0] == "e":
        cEXT = 1

    # in mbti (myers briggs) ther is I*N*TUITION vs SENSING
    # which corellates with OPENNESS in BIG FIVE
    if mbti[1] == "n":
        cOPN = 1
    elif mbti[1] == "s":
        cOPN = 0   

    # in mbti (myers briggs) ther is THINKER vs FEELER
    # which corellates with AGREEABLENESS in BIG FIVE
    if mbti[2] == "t":
        cAGR = 0
    elif mbti[2] == "f":
        cAGR = 1

    # in mbti (myers briggs) ther is JUDGER vs PERCEIVER
    # which corellates with CONSCIENTIOUSNESS in BIG FIVE (worst corellation)
    # especially bec. orderlyness corellates with conscientiousness
    if mbti[3] == "p":
        cCON = 0
    elif mbti[3] == "j":
        cCON = 1

    return cEXT, cNEU, cAGR, cCON, cOPN

In [5]:
mbti_to_big5(df_kaggle['type'][5])

(0, nan, 0, 1, 1)

In [6]:
df_kaggle["cEXT"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[0], 1)
df_kaggle["cNEU"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[1], 1)
df_kaggle["cAGR"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[2], 1)
df_kaggle["cCON"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[3], 1)
df_kaggle["cOPN"] =   df_kaggle.apply(lambda x: mbti_to_big5(x.type)[4], 1)

In [7]:
df_kaggle = df_kaggle[["posts", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]]
df_kaggle.columns = ["TEXT", "cEXT", "cNEU", "cAGR", "cCON", "cOPN"]
# remove som fancy ||| things
df_kaggle["TEXT"] = df_kaggle.apply(lambda x: x.TEXT.replace("|||", " ")[:], 1)

df_kaggle.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_kaggle["TEXT"] = df_kaggle.apply(lambda x: x.TEXT.replace("|||", " ")[:], 1)


Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,'http://www.youtube.com/watch?v=qsXHcwe3krw ht...,0,,1,1,1
1,'I'm finding the lack of me in these posts ver...,1,,0,0,1
2,'Good one _____ https://www.youtube.com/wat...,0,,0,0,1
3,"'Dear INTP, I enjoyed our conversation the o...",0,,0,1,1
4,'You're fired. That's another silly misconcept...,1,,0,1,1


In [8]:
## Load Emotional Lexicon to substract from data

# also from "Emotional_Lexicon.csv" we read in the data, which is a list of words and 
# has several categories of emotions. 
# anger - anticipation - disgust - fear - joy - negative - positive 
# - sadness - surprise - trust - Charged
df_lexicon = pd.read_csv('data/training/Emotion_Lexicon.csv', index_col=0)


# some of the words have no emotional category, 
# so let's remove them as they have no use to us.
# can be improved by not even loading them when all columns are 0. maybe later.
df_lexicon = df_lexicon[(df_lexicon.T != 0).any()]
emotional_words = df_lexicon.index.tolist()

In [9]:
# simply put every row of our read dataframe into a list of 
# the object "Essay"
# remove data from list substract
def create_essays(df, subtract=None):
    essays = []
    for index, row in df.iterrows():
        essays.append(essay.Essay(row.TEXT, row.cEXT, row.cNEU, row.cAGR, row.cCON, row.cOPN))  

    # remove scentences which do not contain emotionally charged words 
    # from the emotional lexicon
    if subtract != None:
        for x in essays:
            x.filtered_text = remove_unemotional_scentences(emotional_words, x.clean_text)

    return essays

def remove_unemotional_scentences(emotional_words, text_as_one_string):
    reduced_s = ""
    scentences = re.split('(?<=[.!?]) +', text_as_one_string)
    for s in scentences:
        if any(e in s for e in emotional_words):
            reduced_s = reduced_s + s + " "
        else:
            pass
    return reduced_s

In [10]:
# save preprocessed data by converting into OBJECT essay and save with pickle and removing non emotional scentences
#essays = create_essays(df_essays, emotional_words)
#pickle.dump(essays, open( "essays/essays2467.p", "wb"))
#print("saved entries: ", len(essays))

In [11]:
## Bag of words

import essay
import pickle

In [12]:
essays = df_essays
print("loaded count of essays:", len(essays))

loaded count of essays: 2467


In [13]:
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
#from wordcloud import WordCloud

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

stop_words = stopwords.words("english")

def preprocess(text):
    corpus = []
    
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token)>2 and token not in stop_words:
            corpus.append(token)
    return corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Piyush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Piyush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
essays['clean'] = essays['TEXT'].apply(preprocess)
essays['clean_text'] = essays['clean'].apply(lambda x:" ".join(x))
essays.head()

Unnamed: 0,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,clean,clean_text
0,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1,"[right, woke, mid, day, nap, sort, weird, move...",right woke mid day nap sort weird moved texas ...
1,"Well, here we go with the stream of consciousn...",0,0,1,0,0,"[stream, consciousness, essay, things, like, h...",stream consciousness essay things like high sc...
2,An open keyboard and buttons to push. The thin...,0,1,0,1,1,"[open, keyboard, buttons, push, thing, finally...",open keyboard buttons push thing finally worke...
3,I can't believe it! It's really happening! M...,1,0,1,1,0,"[believe, happening, pulse, racing, like, mad,...",believe happening pulse racing like mad like f...
4,"Well, here I go with the good old stream of co...",1,0,1,0,1,"[good, old, stream, consciousness, assignment,...",good old stream consciousness assignment feel ...


In [15]:
from sklearn.model_selection import train_test_split
training, test = train_test_split(essays, test_size=0.20, random_state=42)

In [16]:
training['cEXT']

1124    1
856     1
1807    0
219     1
445     0
       ..
1638    1
1095    1
1130    0
1294    0
860     1
Name: cEXT, Length: 1973, dtype: int64

In [17]:
train_x = training.clean_text

train_y_cEXT = training['cEXT']
train_y_cNEU = training['cNEU']
train_y_cAGR = training['cAGR']
train_y_cCON = training['cCON']
train_y_cOPN = training['cOPN']


test_x = test.clean_text

test_y_cEXT = test['cEXT']
test_y_cNEU = test['cNEU']
test_y_cAGR = test['cAGR']
test_y_cCON = test['cCON']
test_y_cOPN = test['cOPN']

In [18]:
## BAG OF WORDS

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer()

# create vectors from our words
train_x_vectors = bow_vectorizer.fit_transform(train_x)
test_x_vectors = bow_vectorizer.transform(test_x)
# # now that's a big thing :-O

In [20]:
## TFIDF VECTORIZER

from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
train_x_vectors_tf = cv.fit_transform(train_x)
test_x_vectors_tf = cv.transform(test_x)

In [21]:
pickle.dump(bow_vectorizer, open('bow_vectorizer_project.p', 'wb'))

In [22]:
# for evaluation save some data for later:
evaluation = []
evaluation_tf = []
data = len(essays)
vec_name = "BoW"

In [23]:
## SVM

In [24]:
from sklearn import svm
name = "svm"


In [25]:
print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(kernel='linear')
clf_svm_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT))

print("training Neuroticism cNEU using SVM...")
clf_svm_cNEU = svm.SVC(kernel='linear')
clf_svm_cNEU.fit(train_x_vectors, train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU)])
print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU))

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(kernel='linear')
clf_svm_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(kernel='linear')
clf_svm_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(kernel='linear')
clf_svm_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.5546558704453441
training Neuroticism cNEU using SVM...
cNEU score:  0.5161943319838057
training Agreeableness cAGR using using SVM...
cAGR score:  0.52834008097166
training Conscientiousness cCON using SVM...
cCON score:  0.5080971659919028
training Openness to Experience cOPN using SVM...
cOPN score:  0.5708502024291497


In [78]:
evaluation

[[2467, 'BoW', 'svm', 'cEXT', 0.5546558704453441],
 [2467, 'BoW', 'svm', 'cNEU', 0.5161943319838057],
 [2467, 'BoW', 'svm', 'cAGR', 0.52834008097166],
 [2467, 'BoW', 'svm', 'cCON', 0.5080971659919028],
 [2467, 'BoW', 'svm', 'cOPN', 0.5708502024291497]]

In [26]:
### SVM - TFIDF

print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(kernel='linear')
clf_svm_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT))

print("training Neuroticism cNEU using SVM...")
clf_svm_cNEU = svm.SVC(kernel='linear')
clf_svm_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU))

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(kernel='linear')
clf_svm_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(kernel='linear')
clf_svm_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(kernel='linear')
clf_svm_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.5384615384615384
training Neuroticism cNEU using SVM...
cNEU score:  0.5809716599190283
training Agreeableness cAGR using using SVM...
cAGR score:  0.5607287449392713
training Conscientiousness cCON using SVM...
cCON score:  0.5465587044534413
training Openness to Experience cOPN using SVM...
cOPN score:  0.6072874493927125


In [80]:
evaluation_tf

[[2467, 'BoW', 'svm', 'cEXT', 0.5384615384615384],
 [2467, 'BoW', 'svm', 'cNEU', 0.5809716599190283],
 [2467, 'BoW', 'svm', 'cAGR', 0.5607287449392713],
 [2467, 'BoW', 'svm', 'cCON', 0.5465587044534413],
 [2467, 'BoW', 'svm', 'cOPN', 0.6072874493927125]]

## HYPERPARAMETER TUNING FOR cNEU

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[10,100,1000], 'gamma':[0.1,0.001,0.0001]}

In [25]:
grid = GridSearchCV(SVC(), param_grid, verbose=2)

In [26]:
grid.fit(train_x_vectors, train_y_cEXT)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ....................................C=10, gamma=0.1; total time=   6.1s
[CV] END ....................................C=10, gamma=0.1; total time=   6.0s
[CV] END ....................................C=10, gamma=0.1; total time=   6.2s
[CV] END ....................................C=10, gamma=0.1; total time=   6.0s
[CV] END ....................................C=10, gamma=0.1; total time=   6.2s
[CV] END ..................................C=10, gamma=0.001; total time=   6.1s
[CV] END ..................................C=10, gamma=0.001; total time=   6.1s
[CV] END ..................................C=10, gamma=0.001; total time=   6.0s
[CV] END ..................................C=10, gamma=0.001; total time=   6.2s
[CV] END ..................................C=10, gamma=0.001; total time=   6.1s
[CV] END .................................C=10, gamma=0.0001; total time=   5.7s
[CV] END .................................C=10, g

GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 100, 1000], 'gamma': [0.1, 0.001, 0.0001]},
             verbose=2)

In [27]:
grid.best_params_

{'C': 10, 'gamma': 0.0001}

In [31]:
## SVM HYPERPARAMTER - BOW

print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(C=10, gamma=0.0001)
clf_svm_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors, test_y_cEXT))

print("training Neuroticism cNEU using SVM...")
clf_svm_cNEU = svm.SVC(C=10, gamma=0.0001)
clf_svm_cNEU.fit(train_x_vectors, train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU)])
print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors, test_y_cNEU))

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(C=10, gamma=0.0001)
clf_svm_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(C=10, gamma=0.0001)
clf_svm_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(C=10, gamma=0.0001)
clf_svm_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.5526315789473685
training Neuroticism cNEU using SVM...
cNEU score:  0.5890688259109311
training Agreeableness cAGR using using SVM...
cAGR score:  0.5404858299595142
training Conscientiousness cCON using SVM...
cCON score:  0.5344129554655871
training Openness to Experience cOPN using SVM...
cOPN score:  0.5890688259109311


In [32]:
pickle.dump(clf_svm_cNEU, open('cNEU_project.p', 'wb'))

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[10,100,1000], 'gamma':[0.1,0.001,0.0001]}

In [32]:
grid_tf = GridSearchCV(SVC(), param_grid, verbose=2)

In [33]:
grid_tf.fit(train_x_vectors_tf, train_y_cNEU)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] END ....................................C=10, gamma=0.1; total time=   3.9s
[CV] END ....................................C=10, gamma=0.1; total time=   3.8s
[CV] END ....................................C=10, gamma=0.1; total time=   4.0s
[CV] END ....................................C=10, gamma=0.1; total time=   4.1s
[CV] END ....................................C=10, gamma=0.1; total time=   3.8s
[CV] END ..................................C=10, gamma=0.001; total time=   3.9s
[CV] END ..................................C=10, gamma=0.001; total time=   3.8s
[CV] END ..................................C=10, gamma=0.001; total time=   3.9s
[CV] END ..................................C=10, gamma=0.001; total time=   4.1s
[CV] END ..................................C=10, gamma=0.001; total time=   4.0s
[CV] END .................................C=10, gamma=0.0001; total time=   4.0s
[CV] END .................................C=10, g

GridSearchCV(estimator=SVC(),
             param_grid={'C': [10, 100, 1000], 'gamma': [0.1, 0.001, 0.0001]},
             verbose=2)

In [34]:
grid_tf.best_params_

{'C': 10, 'gamma': 0.1}

In [37]:
### SVM HYPERPARAMETER TUNING - TFIDF

print("training Extraversion cEXT using SVM...")
clf_svm_cEXT = svm.SVC(C=10, gamma=0.1)
clf_svm_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_svm_cEXT.score(test_x_vectors_tf, test_y_cEXT))

print("training Neuroticism cNEU using SVM...")
clf_svm_cNEU = svm.SVC(C=10, gamma=0.1)
clf_svm_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
print("cNEU score: ", clf_svm_cNEU.score(test_x_vectors_tf, test_y_cNEU))

print("training Agreeableness cAGR using using SVM...")
clf_svm_cAGR = svm.SVC(C=10, gamma=0.1)
clf_svm_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_svm_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using SVM...")
clf_svm_cCON = svm.SVC(C=10, gamma=0.1)
clf_svm_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_svm_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using SVM...")
clf_svm_cOPN = svm.SVC(C=10, gamma=0.1)
clf_svm_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_svm_cOPN.score(test_x_vectors_tf, test_y_cOPN))

training Extraversion cEXT using SVM...
cEXT score:  0.5566801619433198
training Neuroticism cNEU using SVM...
cNEU score:  0.5506072874493927
training Agreeableness cAGR using using SVM...
cAGR score:  0.5607287449392713
training Conscientiousness cCON using SVM...
cCON score:  0.5344129554655871
training Openness to Experience cOPN using SVM...
cOPN score:  0.6093117408906883


In [81]:
## Decision Tree

from sklearn import tree
name = "tree"

print("training Extraversion cEXT using dec...")
clf_dec_cEXT = tree.DecisionTreeClassifier()
clf_dec_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_dec_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_dec_cEXT.score(test_x_vectors, test_y_cEXT))

print("training Neuroticism cNEU using dec...")
clf_dec_cNEU = tree.DecisionTreeClassifier()
clf_dec_cNEU.fit(train_x_vectors, train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_dec_cNEU.score(test_x_vectors, test_y_cNEU)])
print("cNEU score: ", clf_dec_cNEU.score(test_x_vectors, test_y_cNEU))

print("training Agreeableness cAGR using using dec...")
clf_dec_cAGR = tree.DecisionTreeClassifier()
clf_dec_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_dec_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_dec_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using dec...")
clf_dec_cCON = tree.DecisionTreeClassifier()
clf_dec_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_dec_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_dec_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using dec...")
clf_dec_cOPN = tree.DecisionTreeClassifier()
clf_dec_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_dec_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_dec_cOPN.score(test_x_vectors, test_y_cOPN))


training Extraversion cEXT using dec...
cEXT score:  0.5182186234817814
training Neuroticism cNEU using dec...
cNEU score:  0.5445344129554656
training Agreeableness cAGR using using dec...
cAGR score:  0.5040485829959515
training Conscientiousness cCON using dec...
cCON score:  0.5141700404858299
training Openness to Experience cOPN using dec...
cOPN score:  0.5242914979757085


In [82]:
evaluation

[[2467, 'BoW', 'svm', 'cEXT', 0.5546558704453441],
 [2467, 'BoW', 'svm', 'cNEU', 0.5161943319838057],
 [2467, 'BoW', 'svm', 'cAGR', 0.52834008097166],
 [2467, 'BoW', 'svm', 'cCON', 0.5080971659919028],
 [2467, 'BoW', 'svm', 'cOPN', 0.5708502024291497],
 [2467, 'BoW', 'tree', 'cEXT', 0.5182186234817814],
 [2467, 'BoW', 'tree', 'cNEU', 0.5445344129554656],
 [2467, 'BoW', 'tree', 'cAGR', 0.5040485829959515],
 [2467, 'BoW', 'tree', 'cCON', 0.5141700404858299],
 [2467, 'BoW', 'tree', 'cOPN', 0.5242914979757085]]

In [83]:
## Decision Tree - TFIDF

from sklearn import tree
name = "tree"

print("training Extraversion cEXT using dec...")
clf_dec_cEXT = tree.DecisionTreeClassifier()
clf_dec_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_dec_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_dec_cEXT.score(test_x_vectors_tf, test_y_cEXT))

print("training Neuroticism cNEU using dec...")
clf_dec_cNEU = tree.DecisionTreeClassifier()
clf_dec_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_dec_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
print("cNEU score: ", clf_dec_cNEU.score(test_x_vectors_tf, test_y_cNEU))

print("training Agreeableness cAGR using using dec...")
clf_dec_cAGR = tree.DecisionTreeClassifier()
clf_dec_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_dec_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_dec_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using dec...")
clf_dec_cCON = tree.DecisionTreeClassifier()
clf_dec_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_dec_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_dec_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using dec...")
clf_dec_cOPN = tree.DecisionTreeClassifier()
clf_dec_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_dec_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_dec_cOPN.score(test_x_vectors_tf, test_y_cOPN))


training Extraversion cEXT using dec...
cEXT score:  0.5566801619433198
training Neuroticism cNEU using dec...
cNEU score:  0.4959514170040486
training Agreeableness cAGR using using dec...
cAGR score:  0.5161943319838057
training Conscientiousness cCON using dec...
cCON score:  0.5101214574898786
training Openness to Experience cOPN using dec...
cOPN score:  0.5323886639676113


In [84]:
evaluation_tf

[[2467, 'BoW', 'svm', 'cEXT', 0.5384615384615384],
 [2467, 'BoW', 'svm', 'cNEU', 0.5809716599190283],
 [2467, 'BoW', 'svm', 'cAGR', 0.5607287449392713],
 [2467, 'BoW', 'svm', 'cCON', 0.5465587044534413],
 [2467, 'BoW', 'svm', 'cOPN', 0.6072874493927125],
 [2467, 'BoW', 'tree', 'cEXT', 0.5566801619433198],
 [2467, 'BoW', 'tree', 'cNEU', 0.4959514170040486],
 [2467, 'BoW', 'tree', 'cAGR', 0.5161943319838057],
 [2467, 'BoW', 'tree', 'cCON', 0.5101214574898786],
 [2467, 'BoW', 'tree', 'cOPN', 0.5323886639676113]]

In [85]:
## Naive Bayes

from sklearn.naive_bayes import GaussianNB
name = "gNB"

print("training Extraversion cEXT using GaussianNaiveBayes...")
clf_gnb_cEXT = GaussianNB()
clf_gnb_cEXT.fit(train_x_vectors.toarray(), train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_gnb_cEXT.score(test_x_vectors.toarray(), test_y_cEXT)])
print("cEXT score: ", clf_gnb_cEXT.score(test_x_vectors.toarray(), test_y_cEXT))

print("training Neuroticism cNEU using GaussianNaiveBayes...")
clf_gnb_cNEU = GaussianNB()
clf_gnb_cNEU.fit(train_x_vectors.toarray(), train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_gnb_cNEU.score(test_x_vectors.toarray(), test_y_cNEU)])
print("cNEU score: ", clf_gnb_cNEU.score(test_x_vectors.toarray(), test_y_cNEU))
    
print("training Agreeableness cAGR using using GaussianNaiveBayes...")
clf_gnb_cAGR = GaussianNB()
clf_gnb_cAGR.fit(train_x_vectors.toarray(), train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_gnb_cAGR.score(test_x_vectors.toarray(), test_y_cAGR)])
print("cAGR score: ", clf_gnb_cAGR.score(test_x_vectors.toarray(), test_y_cAGR))

print("training Conscientiousness cCON using GaussianNaiveBayes...")
clf_gnb_cCON = GaussianNB()
clf_gnb_cCON.fit(train_x_vectors.toarray(), train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_gnb_cCON.score(test_x_vectors.toarray(), test_y_cCON)])
print("cCON score: ", clf_gnb_cCON.score(test_x_vectors.toarray(), test_y_cCON))

print("training Openness to Experience cOPN using GaussianNaiveBayes...")
clf_gnb_cOPN = GaussianNB()
clf_gnb_cOPN.fit(train_x_vectors.toarray(), train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_gnb_cOPN.score(test_x_vectors.toarray(), test_y_cOPN)])
print("cOPN score: ", clf_gnb_cOPN.score(test_x_vectors.toarray(), test_y_cOPN))

training Extraversion cEXT using GaussianNaiveBayes...
cEXT score:  0.5040485829959515
training Neuroticism cNEU using GaussianNaiveBayes...
cNEU score:  0.5182186234817814
training Agreeableness cAGR using using GaussianNaiveBayes...
cAGR score:  0.5060728744939271
training Conscientiousness cCON using GaussianNaiveBayes...
cCON score:  0.5303643724696356
training Openness to Experience cOPN using GaussianNaiveBayes...
cOPN score:  0.5303643724696356


In [86]:
## Naive Bayes - TFIDF

from sklearn.naive_bayes import GaussianNB
name = "gNB"

print("training Extraversion cEXT using GaussianNaiveBayes...")
clf_gnb_cEXT = GaussianNB()
clf_gnb_cEXT.fit(train_x_vectors_tf.toarray(), train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_gnb_cEXT.score(test_x_vectors_tf.toarray(), test_y_cEXT)])
print("cEXT score: ", clf_gnb_cEXT.score(test_x_vectors_tf.toarray(), test_y_cEXT))

print("training Neuroticism cNEU using GaussianNaiveBayes...")
clf_gnb_cNEU = GaussianNB()
clf_gnb_cNEU.fit(train_x_vectors_tf.toarray(), train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_gnb_cNEU.score(test_x_vectors_tf.toarray(), test_y_cNEU)])
print("cNEU score: ", clf_gnb_cNEU.score(test_x_vectors_tf.toarray(), test_y_cNEU))
    
print("training Agreeableness cAGR using using GaussianNaiveBayes...")
clf_gnb_cAGR = GaussianNB()
clf_gnb_cAGR.fit(train_x_vectors_tf.toarray(), train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_gnb_cAGR.score(test_x_vectors_tf.toarray(), test_y_cAGR)])
print("cAGR score: ", clf_gnb_cAGR.score(test_x_vectors_tf.toarray(), test_y_cAGR))

print("training Conscientiousness cCON using GaussianNaiveBayes...")
clf_gnb_cCON = GaussianNB()
clf_gnb_cCON.fit(train_x_vectors_tf.toarray(), train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_gnb_cCON.score(test_x_vectors_tf.toarray(), test_y_cCON)])
print("cCON score: ", clf_gnb_cCON.score(test_x_vectors_tf.toarray(), test_y_cCON))

print("training Openness to Experience cOPN using GaussianNaiveBayes...")
clf_gnb_cOPN = GaussianNB()
clf_gnb_cOPN.fit(train_x_vectors_tf.toarray(), train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_gnb_cOPN.score(test_x_vectors_tf.toarray(), test_y_cOPN)])
print("cOPN score: ", clf_gnb_cOPN.score(test_x_vectors_tf.toarray(), test_y_cOPN))

training Extraversion cEXT using GaussianNaiveBayes...
cEXT score:  0.4979757085020243
training Neuroticism cNEU using GaussianNaiveBayes...
cNEU score:  0.52834008097166
training Agreeableness cAGR using using GaussianNaiveBayes...
cAGR score:  0.5060728744939271
training Conscientiousness cCON using GaussianNaiveBayes...
cCON score:  0.5222672064777328
training Openness to Experience cOPN using GaussianNaiveBayes...
cOPN score:  0.5303643724696356


In [87]:
evaluation_tf

[[2467, 'BoW', 'svm', 'cEXT', 0.5384615384615384],
 [2467, 'BoW', 'svm', 'cNEU', 0.5809716599190283],
 [2467, 'BoW', 'svm', 'cAGR', 0.5607287449392713],
 [2467, 'BoW', 'svm', 'cCON', 0.5465587044534413],
 [2467, 'BoW', 'svm', 'cOPN', 0.6072874493927125],
 [2467, 'BoW', 'tree', 'cEXT', 0.5566801619433198],
 [2467, 'BoW', 'tree', 'cNEU', 0.4959514170040486],
 [2467, 'BoW', 'tree', 'cAGR', 0.5161943319838057],
 [2467, 'BoW', 'tree', 'cCON', 0.5101214574898786],
 [2467, 'BoW', 'tree', 'cOPN', 0.5323886639676113],
 [2467, 'BoW', 'gNB', 'cEXT', 0.4979757085020243],
 [2467, 'BoW', 'gNB', 'cNEU', 0.52834008097166],
 [2467, 'BoW', 'gNB', 'cAGR', 0.5060728744939271],
 [2467, 'BoW', 'gNB', 'cCON', 0.5222672064777328],
 [2467, 'BoW', 'gNB', 'cOPN', 0.5303643724696356]]

In [88]:
## Logistic Regression

from sklearn.linear_model import LogisticRegression
name="logR"

print("training Extraversion cEXT using Logistic Regression...")
clf_log_cEXT = LogisticRegression(solver="newton-cg")
clf_log_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_log_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_log_cEXT.score(test_x_vectors, test_y_cEXT))

print("training Neuroticism cNEU using Logistic Regression...")
clf_log_cNEU = LogisticRegression(solver="newton-cg")
clf_log_cNEU.fit(train_x_vectors, train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_log_cNEU.score(test_x_vectors, test_y_cNEU)])
print("cNEU score: ", clf_log_cNEU.score(test_x_vectors, test_y_cNEU))

print("training Agreeableness cAGR using using Logistic Regression...")
clf_log_cAGR = LogisticRegression(solver="newton-cg")
clf_log_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_log_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_log_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using Logistic Regression...")
clf_log_cCON = LogisticRegression(solver="newton-cg")
clf_log_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_log_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_log_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using Logistic Regression...")
clf_log_cOPN = LogisticRegression(solver="newton-cg")
clf_log_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_log_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_log_cOPN.score(test_x_vectors, test_y_cOPN))

training Extraversion cEXT using Logistic Regression...
cEXT score:  0.5566801619433198
training Neuroticism cNEU using Logistic Regression...
cNEU score:  0.5323886639676113
training Agreeableness cAGR using using Logistic Regression...
cAGR score:  0.5344129554655871
training Conscientiousness cCON using Logistic Regression...
cCON score:  0.5303643724696356
training Openness to Experience cOPN using Logistic Regression...
cOPN score:  0.5607287449392713


In [89]:
## Logistic Regression - TFIDF

from sklearn.linear_model import LogisticRegression
name="logR"

print("training Extraversion cEXT using Logistic Regression...")
clf_log_cEXT = LogisticRegression(solver="newton-cg")
clf_log_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_log_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_log_cEXT.score(test_x_vectors_tf, test_y_cEXT))

print("training Neuroticism cNEU using Logistic Regression...")
clf_log_cNEU = LogisticRegression(solver="newton-cg")
clf_log_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_log_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
print("cNEU score: ", clf_log_cNEU.score(test_x_vectors_tf, test_y_cNEU))

print("training Agreeableness cAGR using using Logistic Regression...")
clf_log_cAGR = LogisticRegression(solver="newton-cg")
clf_log_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_log_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_log_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using Logistic Regression...")
clf_log_cCON = LogisticRegression(solver="newton-cg")
clf_log_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_log_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_log_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using Logistic Regression...")
clf_log_cOPN = LogisticRegression(solver="newton-cg")
clf_log_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_log_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_log_cOPN.score(test_x_vectors_tf, test_y_cOPN))

training Extraversion cEXT using Logistic Regression...
cEXT score:  0.5506072874493927
training Neuroticism cNEU using Logistic Regression...
cNEU score:  0.5789473684210527
training Agreeableness cAGR using using Logistic Regression...
cAGR score:  0.5587044534412956
training Conscientiousness cCON using Logistic Regression...
cCON score:  0.5566801619433198
training Openness to Experience cOPN using Logistic Regression...
cOPN score:  0.6052631578947368


In [90]:
## Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
name="RF"


print("training Extraversion cEXT using Random Forest...")
clf_rf_cEXT = RandomForestClassifier(n_estimators=100)
clf_rf_cEXT.fit(train_x_vectors, train_y_cEXT)
evaluation.append([data, vec_name, name, "cEXT", clf_rf_cEXT.score(test_x_vectors, test_y_cEXT)])
print("cEXT score: ", clf_rf_cEXT.score(test_x_vectors, test_y_cEXT))

print("training Neuroticism cNEU using Random Forest...")
clf_rf_cNEU = RandomForestClassifier(n_estimators=100)
clf_rf_cNEU.fit(train_x_vectors, train_y_cNEU)
evaluation.append([data, vec_name, name, "cNEU", clf_rf_cNEU.score(test_x_vectors, test_y_cNEU)])
print("cNEU score: ", clf_rf_cNEU.score(test_x_vectors, test_y_cNEU))

print("training Agreeableness cAGR using using Random Forest...")
clf_rf_cAGR = RandomForestClassifier(n_estimators=100)
clf_rf_cAGR.fit(train_x_vectors, train_y_cAGR)
evaluation.append([data, vec_name, name, "cAGR", clf_rf_cAGR.score(test_x_vectors, test_y_cAGR)])
print("cAGR score: ", clf_rf_cAGR.score(test_x_vectors, test_y_cAGR))

print("training Conscientiousness cCON using Random Forest...")
clf_rf_cCON = RandomForestClassifier(n_estimators=100)
clf_rf_cCON.fit(train_x_vectors, train_y_cCON)
evaluation.append([data, vec_name, name, "cCON", clf_rf_cCON.score(test_x_vectors, test_y_cCON)])
print("cCON score: ", clf_rf_cCON.score(test_x_vectors, test_y_cCON))

print("training Openness to Experience cOPN using Random Forest...")
clf_rf_cOPN = RandomForestClassifier(n_estimators=100)
clf_rf_cOPN.fit(train_x_vectors, train_y_cOPN)
evaluation.append([data, vec_name, name, "cOPN", clf_rf_cOPN.score(test_x_vectors, test_y_cOPN)])
print("cOPN score: ", clf_rf_cOPN.score(test_x_vectors, test_y_cOPN))


training Extraversion cEXT using Random Forest...
cEXT score:  0.5384615384615384
training Neuroticism cNEU using Random Forest...
cNEU score:  0.562753036437247
training Agreeableness cAGR using using Random Forest...
cAGR score:  0.5404858299595142
training Conscientiousness cCON using Random Forest...
cCON score:  0.562753036437247
training Openness to Experience cOPN using Random Forest...
cOPN score:  0.5910931174089069


In [91]:
## Random Forest Classifier - TFIDF

from sklearn.ensemble import RandomForestClassifier
name="RF"


print("training Extraversion cEXT using Random Forest...")
clf_rf_cEXT = RandomForestClassifier(n_estimators=100)
clf_rf_cEXT.fit(train_x_vectors_tf, train_y_cEXT)
evaluation_tf.append([data, vec_name, name, "cEXT", clf_rf_cEXT.score(test_x_vectors_tf, test_y_cEXT)])
print("cEXT score: ", clf_rf_cEXT.score(test_x_vectors_tf, test_y_cEXT))

print("training Neuroticism cNEU using Random Forest...")
clf_rf_cNEU = RandomForestClassifier(n_estimators=100)
clf_rf_cNEU.fit(train_x_vectors_tf, train_y_cNEU)
evaluation_tf.append([data, vec_name, name, "cNEU", clf_rf_cNEU.score(test_x_vectors_tf, test_y_cNEU)])
print("cNEU score: ", clf_rf_cNEU.score(test_x_vectors_tf, test_y_cNEU))

print("training Agreeableness cAGR using using Random Forest...")
clf_rf_cAGR = RandomForestClassifier(n_estimators=100)
clf_rf_cAGR.fit(train_x_vectors_tf, train_y_cAGR)
evaluation_tf.append([data, vec_name, name, "cAGR", clf_rf_cAGR.score(test_x_vectors_tf, test_y_cAGR)])
print("cAGR score: ", clf_rf_cAGR.score(test_x_vectors_tf, test_y_cAGR))

print("training Conscientiousness cCON using Random Forest...")
clf_rf_cCON = RandomForestClassifier(n_estimators=100)
clf_rf_cCON.fit(train_x_vectors_tf, train_y_cCON)
evaluation_tf.append([data, vec_name, name, "cCON", clf_rf_cCON.score(test_x_vectors_tf, test_y_cCON)])
print("cCON score: ", clf_rf_cCON.score(test_x_vectors_tf, test_y_cCON))

print("training Openness to Experience cOPN using Random Forest...")
clf_rf_cOPN = RandomForestClassifier(n_estimators=100)
clf_rf_cOPN.fit(train_x_vectors_tf, train_y_cOPN)
evaluation_tf.append([data, vec_name, name, "cOPN", clf_rf_cOPN.score(test_x_vectors_tf, test_y_cOPN)])
print("cOPN score: ", clf_rf_cOPN.score(test_x_vectors_tf, test_y_cOPN))


training Extraversion cEXT using Random Forest...
cEXT score:  0.5364372469635628
training Neuroticism cNEU using Random Forest...
cNEU score:  0.5647773279352226
training Agreeableness cAGR using using Random Forest...
cAGR score:  0.4979757085020243
training Conscientiousness cCON using Random Forest...
cCON score:  0.52834008097166
training Openness to Experience cOPN using Random Forest...
cOPN score:  0.6214574898785425


In [92]:
filename = "data/evaluation/evaluation" + "new_count" + ".p"
pickle.dump(evaluation, open(filename, "wb"))
print("evaluation saved as", filename)

evaluation saved as data/evaluation/evaluationnew_count.p


In [93]:
filename = "data/evaluation/evaluation" + "new_tfidf" + ".p"
pickle.dump(evaluation_tf, open(filename, "wb"))
print("evaluation saved as", filename)

evaluation saved as data/evaluation/evaluationnew_tfidf.p


In [29]:
pickle.dump(clf_svm_cNEU, open('cNEU_piyush.p', 'wb'))

In [95]:
pickle.dump(clf_rf_cNEU, open('cNEU_piyush.p', 'wb'))
pickle.dump(clf_rf_cAGR, open('cAGR_piyush.p', 'wb'))
pickle.dump(clf_rf_cCON, open('cCON_piyush.p', 'wb'))
pickle.dump(clf_rf_cOPN, open('cOPN_piyush.p', 'wb'))

In [96]:
filename = "cEXT_piyush_new.p"

with open(filename, 'wb') as fout:
    pickle.dump((bow_vectorizer ,clf_rf_cEXT), fout)

In [97]:
evaluation

[[2467, 'BoW', 'svm', 'cEXT', 0.5546558704453441],
 [2467, 'BoW', 'svm', 'cNEU', 0.5161943319838057],
 [2467, 'BoW', 'svm', 'cAGR', 0.52834008097166],
 [2467, 'BoW', 'svm', 'cCON', 0.5080971659919028],
 [2467, 'BoW', 'svm', 'cOPN', 0.5708502024291497],
 [2467, 'BoW', 'tree', 'cEXT', 0.5182186234817814],
 [2467, 'BoW', 'tree', 'cNEU', 0.5445344129554656],
 [2467, 'BoW', 'tree', 'cAGR', 0.5040485829959515],
 [2467, 'BoW', 'tree', 'cCON', 0.5141700404858299],
 [2467, 'BoW', 'tree', 'cOPN', 0.5242914979757085],
 [2467, 'BoW', 'gNB', 'cEXT', 0.5040485829959515],
 [2467, 'BoW', 'gNB', 'cNEU', 0.5182186234817814],
 [2467, 'BoW', 'gNB', 'cAGR', 0.5060728744939271],
 [2467, 'BoW', 'gNB', 'cCON', 0.5303643724696356],
 [2467, 'BoW', 'gNB', 'cOPN', 0.5303643724696356],
 [2467, 'BoW', 'logR', 'cEXT', 0.5566801619433198],
 [2467, 'BoW', 'logR', 'cNEU', 0.5323886639676113],
 [2467, 'BoW', 'logR', 'cAGR', 0.5344129554655871],
 [2467, 'BoW', 'logR', 'cCON', 0.5303643724696356],
 [2467, 'BoW', 'logR', '

In [98]:
evaluation_tf

[[2467, 'BoW', 'svm', 'cEXT', 0.5384615384615384],
 [2467, 'BoW', 'svm', 'cNEU', 0.5809716599190283],
 [2467, 'BoW', 'svm', 'cAGR', 0.5607287449392713],
 [2467, 'BoW', 'svm', 'cCON', 0.5465587044534413],
 [2467, 'BoW', 'svm', 'cOPN', 0.6072874493927125],
 [2467, 'BoW', 'tree', 'cEXT', 0.5566801619433198],
 [2467, 'BoW', 'tree', 'cNEU', 0.4959514170040486],
 [2467, 'BoW', 'tree', 'cAGR', 0.5161943319838057],
 [2467, 'BoW', 'tree', 'cCON', 0.5101214574898786],
 [2467, 'BoW', 'tree', 'cOPN', 0.5323886639676113],
 [2467, 'BoW', 'gNB', 'cEXT', 0.4979757085020243],
 [2467, 'BoW', 'gNB', 'cNEU', 0.52834008097166],
 [2467, 'BoW', 'gNB', 'cAGR', 0.5060728744939271],
 [2467, 'BoW', 'gNB', 'cCON', 0.5222672064777328],
 [2467, 'BoW', 'gNB', 'cOPN', 0.5303643724696356],
 [2467, 'BoW', 'logR', 'cEXT', 0.5506072874493927],
 [2467, 'BoW', 'logR', 'cNEU', 0.5789473684210527],
 [2467, 'BoW', 'logR', 'cAGR', 0.5587044534412956],
 [2467, 'BoW', 'logR', 'cCON', 0.5566801619433198],
 [2467, 'BoW', 'logR', '