In [22]:
import pandas as pd
import numpy as np
import scipy as sp

from Features import Features
import pickle
import datetime

from sklearn.ensemble import RandomForestClassifier

## Training the feature extraction 

In [23]:
train_csv_path = '../data/mbti_full_pull_half_train.csv'
df = pd.read_csv(train_csv_path, index_col=0)
df = df.sample(200)

In [24]:
# this is reducing the size of the training dataset
new_indices = []
for k,group in df.groupby(["mbti_type"]).groups.items():
    if len(group) > 10000:
        new_indices.extend(group[:10000])
    else:
        new_indices.extend(group)
df = df.loc[new_indices]
df.groupby(['mbti_type']).count()

Unnamed: 0_level_0,body
mbti_type,Unnamed: 1_level_1
ENFJ,1
ENFP,11
ENTJ,6
ENTP,26
ESFP,1
ESTJ,1
ESTP,2
INFJ,27
INFP,22
INTJ,42


In [25]:
modelName = '../models/features2021-12-13.model'

try:
    # the model can be loaded
    with open(modelName,'rb') as f:
        feature_extractor = pickle.load(f)
except:
    # training the model
    feature_extractor = Features(df.body, '../data/stopwords.txt')
    feature_extractor.build_model()

cleaning the copora
building tfidf model
building lda topic model
model built, saving it
model saved at:


# Training the first layers

## enumerating all the cognitive functions (With repeats)


In [26]:
from itertools import combinations

types = ['IE','NS','TF','JP']

deg1 = []

for i in types:
    for ii in i:
        deg1.append(ii)

deg2 = []
for i,j, in combinations(types,2):
    for ii in i:
        for jj in j:
            deg2.append(ii+jj)  

deg3 = []
for i,j,k in combinations(types,3):
    for ii in i:
        for jj in j:
            for kk in k:
                deg3.append(ii+jj+kk)

deg4 = []
for i,j,k,l in combinations(types,4):
    for ii in i:
        for jj in j:
            for kk in k:
                for ll in l:
                    deg4.append(ii+jj+kk+ll)

cog_funs = deg1 + deg2 + deg3 + deg4

def normalize(s):
    ret = ''
    for type in types:
        if type[0] in s:
            ret += type[0]
        elif type[1] in s:
            ret += type[1]
        else:
            ret += '_'
    return ret

cog_funs = list(map(normalize,cog_funs))
print('\t'.join(cog_funs))
print(len(cog_funs))
cog_funs = {i:None for i in cog_funs}

I___	E___	_N__	_S__	__T_	__F_	___J	___P	IN__	IS__	EN__	ES__	I_T_	I_F_	E_T_	E_F_	I__J	I__P	E__J	E__P	_NT_	_NF_	_ST_	_SF_	_N_J	_N_P	_S_J	_S_P	__TJ	__TP	__FJ	__FP	INT_	INF_	IST_	ISF_	ENT_	ENF_	EST_	ESF_	IN_J	IN_P	IS_J	IS_P	EN_J	EN_P	ES_J	ES_P	I_TJ	I_TP	I_FJ	I_FP	E_TJ	E_TP	E_FJ	E_FP	_NTJ	_NTP	_NFJ	_NFP	_STJ	_STP	_SFJ	_SFP	INTJ	INTP	INFJ	INFP	ISTJ	ISTP	ISFJ	ISFP	ENTJ	ENTP	ENFJ	ENFP	ESTJ	ESTP	ESFJ	ESFP
80


There are repeating elements in the above listed cog_funs, such as 'I___' and 'E___' are really the same thing, and I choose not to handle this repeatition.  

Now, what's left to do is to have feature extractions and then train binary classifier for each cognitive functions. 

Suppose the feature extraction is trained and stored in a ../models/features______.model

In [27]:
def check_match(y,y_):
    for i,j in enumerate(y_):
        if j == '_':
            pass
        elif j == y[i]:
            pass
        else:
            return 0
    return 1

In [28]:
check_match('INTJ','_NT_'), check_match('INTJ','E___')

(1, 0)

## Training the first layer model

In [29]:
modelNameSuffix = '2021-12-13'

train_X = feature_extractor.get_features(df.body)
train_y = df.mbti_type

In [30]:
for model in cog_funs.keys():
    
    train_yy = [check_match(i,model) for i in train_y]
    
    classifier = RandomForestClassifier(n_estimators=10)
    
    classifier.fit(train_X, train_yy)
    
    with open('../models/first_layer/'  + model + modelNameSuffix + '.model','wb') as f:
        pickle.dump(classifier,f)
        cog_funs[model] = classifier
    print(model +' training completed', end='\t')



I___ training completed	E___ training completed	_N__ training completed	_S__ training completed	__T_ training completed	__F_ training completed	___J training completed	___P training completed	IN__ training completed	IS__ training completed	EN__ training completed	ES__ training completed	I_T_ training completed	I_F_ training completed	E_T_ training completed	E_F_ training completed	I__J training completed	I__P training completed	E__J training completed	E__P training completed	_NT_ training completed	



_NF_ training completed	_ST_ training completed	_SF_ training completed	_N_J training completed	_N_P training completed	_S_J training completed	_S_P training completed	__TJ training completed	__TP training completed	__FJ training completed	__FP training completed	INT_ training completed	INF_ training completed	IST_ training completed	ISF_ training completed	ENT_ training completed	ENF_ training completed	EST_ training completed	ESF_ training completed	IN_J training completed	IN_P training completed	IS_J training completed	IS_P training completed	EN_J training completed	



EN_P training completed	ES_J training completed	ES_P training completed	I_TJ training completed	I_TP training completed	I_FJ training completed	I_FP training completed	E_TJ training completed	E_TP training completed	E_FJ training completed	E_FP training completed	_NTJ training completed	_NTP training completed	_NFJ training completed	_NFP training completed	_STJ training completed	_STP training completed	_SFJ training completed	_SFP training completed	INTJ training completed	INTP training completed	INFJ training completed	INFP training completed	ISTJ training completed	ISTP training completed	ISFJ training completed	ISFP training completed	ENTJ training completed	ENTP training completed	ENFJ training completed	ENFP training completed	ESTJ training completed	ESTP training completed	ESFJ training completed	ESFP training completed	



## import test data

In [31]:
test_df = pd.read_csv('../data/mbti_full_pull_half_test.csv')

In [32]:
test_X = feature_extractor.get_features(test_df.body)
test_y = test_df.mbti_type

In [None]:
def test(cog):
    with open("../models/first_layer/"+cog+modelNameSuffix + '.model','rb') as f:
        model = pickle.load(f)
    predict_y = model.predict(test_X)
    counter = 0
    test_yy = [check_match(cog, c) for c in  test_y]
    for x,y in zip(test_yy, predict_y):
        if x==y:
            counter+=1;
    print(counter/len(predict_y))        

In [None]:
for cog in cog_funs:
    print(cog,end=':\t')
    test(cog)

I___
0.36212808317663975
E___
0.8268259410832135
_N__
0.07757253991077682
_S__
0.9710577738450762
__T_
0.43914758691885114
__F_
0.7595398739077536
___J
0.8019393134977694
___P
0.3964163256276961
IN__
0.6611363049810124
IS__
0.9894923127972569
EN__
0.8945175681156214
ES__
0.9965343066769901
I_T_
0.8668657596873502
I_F_
0.9484201600117981
E_T_
0.9588172399808281
E_F_
0.9865059174870037
I__J
0.9337831360837665
I__P
0.8903144932345242
E__J
0.9873170372008996
E__P
0.9404195701065516
_NT_
0.747151863731888
_NF_
0.8229915569811599
_ST_
0.9820078899826715
_SF_
0.9994469638314346
_N_J
0.8686354754267596
_N_P
0.6985215499760351
_S_J
0.9984883678059212
_S_P
0.9835563912546548
__TJ
0.9593702761493935
__TP
0.8596762894959997
__FJ
0.9784315894259484
__FP
0.9525126276591822
INT_
0.9431478818714744
INF_
0.9627991003944991
IST_
0.9927736607307451
ISF_
0.9995207019872433
ENT_
0.9744865980901818
ENF_
0.9886443240054567
EST_
0.9966817829886074
ESF_
0.9999631309220957
IN_J
0.9530656638277477
IN_P
0.9571212

In [None]:
def cf_predict(train_X):
    return np.array([cog_funs[model].predict_proba(train_X)[:,0] \
        for model in sorted(list(cog_funs))]).T

# The second layer model: random forest

The inputs of the second layer model should be
- ✔ cognitive functions, there are roughly 80 of them. With a bigger weight
- ❌ the features. 

We have imagined to use a second layer as a NN, which takes both the cognitive functions and the features. However, we realized that NN is too costly and really not necessary, as we have a first layer with incredible accuracy. So we will use a simple _random forest_ for the second layer. 


In [1]:
mbti_types = ['ENFJ','ENFP','ENTJ','ENTP','ESFJ','ESFP','ESTJ','ESTP',
    'INFJ','INFP','INTJ','INTP','ISFJ','ISFP','ISTJ','ISTP']
type2int = {t:i for i,t in enumerate(mbti_types)}
int2type = {i:t for i,t in enumerate(mbti_types)}

train_y2 = train_y.apply(lambda x:type2int[x]).values
test_y2  = test_y.apply(lambda x:type2int[x]).values

train_X2 = cf_predict(train_X)
test_X2  = cf_predict(test_X)

NameError: name 'train_y' is not defined

In [None]:
second_layer_classifier = RandomForestClassifier(n_estimators=100)
second_layer_classifier.fit(train_X2,train_y)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y2, second_layer_classifier.predict(test_X2))