In [13]:
import pandas as pd
import numpy as np
import scipy as sp

from Features import Features
import pickle
import datetime

from sklearn.ensemble import RandomForestClassifier

## Training the feature extraction 

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Labelled posts used throughout this notebook (path is relative to the notebook).
train_csv_path = '../data/mbti_full_pull_half_train.csv'

# index_col=0: the first CSV column becomes the row index instead of a data column.
df = pd.read_csv(train_csv_path, index_col=0)

# Both the train and the test split are sampled from the same training CSV
# (mbti_full_pull_half_train.csv), so this split does not affect the later evaluation.
# 33% of the rows are held out; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df["body"],
    df["mbti_type"],
    test_size=0.33,
    random_state=42,
)

In [16]:
# Shrink the dataset: keep at most the first 10000 rows of every MBTI type.
new_indices = []
for row_labels in df.groupby(["mbti_type"]).groups.values():
    # Slicing past the end is a no-op, so groups with fewer rows are kept whole.
    new_indices.extend(row_labels[:10000])
df = df.loc[new_indices]
# Last expression of the cell: show how many rows remain per type.
df.groupby(['mbti_type']).count()

Unnamed: 0_level_0,body
mbti_type,Unnamed: 1_level_1
ENFJ,3279
ENFP,10000
ENTJ,6614
ENTP,10000
ESFJ,346
ESFP,695
ESTJ,1043
ESTP,3757
INFJ,10000
INFP,10000


In [17]:
# Pickled feature extractor; when it cannot be loaded the extractor is rebuilt from the posts.
modelName = '../models/features2021-12-14.model'

feature_extractor = None
try:
    # SECURITY: pickle.load can execute arbitrary code, so modelName must be a trusted file.
    with open(modelName,'rb') as f:
        feature_extractor = pickle.load(f)
except Exception as load_error:
    # `except Exception` keeps the best-effort fallback for any load failure but,
    # unlike a bare `except:`, lets KeyboardInterrupt / SystemExit propagate.
    print('Could not load ' + modelName + ' (' + repr(load_error) + '); building the feature extractor instead')

if feature_extractor is None:
    # Build outside the handler so a failure here is not reported as a chained load error.
    feature_extractor = Features(df.body, '../data/stopwords.txt')
    feature_extractor.build_model()
    # NOTE(review): the rebuilt extractor is not written back to modelName, so it is
    # rebuilt on every run until a model file exists at that path.

# Training the first layers

## enumerating all the cognitive functions (With repeats)


In [18]:
from itertools import combinations

# The four MBTI axes; each two-letter string holds the two poles of one axis.
types = ['IE','NS','TF','JP']

# degN lists every way to pick N distinct axes (in axis order) and one pole from each.
deg1 = [pole for axis in types for pole in axis]

deg2 = [
    a + b
    for first, second in combinations(types, 2)
    for a in first
    for b in second
]

deg3 = [
    a + b + c
    for first, second, third in combinations(types, 3)
    for a in first
    for b in second
    for c in third
]

deg4 = [
    a + b + c + d
    for first, second, third, fourth in combinations(types, 4)
    for a in first
    for b in second
    for c in third
    for d in fourth
]

cog_funs = deg1 + deg2 + deg3 + deg4

def normalize(s):
    """Expand a partial type such as 'NT' into a four-slot pattern such as '_NT_'.

    For each axis in ``types`` (in order) the slot is the axis' first letter if
    it occurs in ``s``, otherwise the axis' second letter if that occurs in
    ``s``, otherwise ``'_'``.
    """
    slots = []
    for first_pole, second_pole in types:
        if first_pole in s:
            slots.append(first_pole)
        elif second_pole in s:
            slots.append(second_pole)
        else:
            slots.append('_')
    return ''.join(slots)

cog_funs = [normalize(fun) for fun in cog_funs]
print('\t'.join(cog_funs))
print(len(cog_funs))
# From here on cog_funs maps each pattern to its classifier (None until loaded/trained).
cog_funs = dict.fromkeys(cog_funs)

I___	E___	_N__	_S__	__T_	__F_	___J	___P	IN__	IS__	EN__	ES__	I_T_	I_F_	E_T_	E_F_	I__J	I__P	E__J	E__P	_NT_	_NF_	_ST_	_SF_	_N_J	_N_P	_S_J	_S_P	__TJ	__TP	__FJ	__FP	INT_	INF_	IST_	ISF_	ENT_	ENF_	EST_	ESF_	IN_J	IN_P	IS_J	IS_P	EN_J	EN_P	ES_J	ES_P	I_TJ	I_TP	I_FJ	I_FP	E_TJ	E_TP	E_FJ	E_FP	_NTJ	_NTP	_NFJ	_NFP	_STJ	_STP	_SFJ	_SFP	INTJ	INTP	INFJ	INFP	ISTJ	ISTP	ISFJ	ISFP	ENTJ	ENTP	ENFJ	ENFP	ESTJ	ESTP	ESFJ	ESFP
80


There are redundant elements in the cog_funs listed above: for example, 'I___' and 'E___' are complements of each other and so carry the same information. I chose not to handle this repetition.  

Now, what's left to do is to extract the features and then train a binary classifier for each cognitive function. 

Suppose the feature extractor has already been trained and stored in `../models/features______.model`.

## Training the first layer model

In [19]:
def check_match(y,y_):
    """Return 1 when type ``y`` fits pattern ``y_``, otherwise 0.

    Every character of ``y_`` is compared with the character of ``y`` at the
    same position; a ``'_'`` in the pattern matches anything.
    """
    fits = all(
        wanted == '_' or wanted == y[pos]
        for pos, wanted in enumerate(y_)
    )
    return 1 if fits else 0

In [20]:
# Suffix appended to every first-layer model file name.
modelNameSuffix = '2021-12-13'

# Run the feature extractor over the train split first, then the test split.
X_train1, X_test1 = map(feature_extractor.get_features, (X_train, X_test))

In [21]:
# One binary random forest per cognitive-function pattern: reuse the pickled model
# when it can be loaded, otherwise train it on the training features and cache it.
for model in cog_funs.keys():
    model_path = '../models/first_layer/'  + model + modelNameSuffix + '.model'

    classifier = None
    try:
        # SECURITY: pickle.load can execute arbitrary code; only load trusted model files.
        with open(model_path,'rb') as f:
            classifier = pickle.load(f)
    except Exception:
        # Any load failure (missing or unreadable file) falls back to training below.
        # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit propagate.
        classifier = None

    if classifier is None:
        # 1 where the true type fits this pattern, 0 otherwise.
        # Uses y_train / X_train1 (the names defined by the earlier cells); the
        # previous train_y / train_X were never defined and raised NameError.
        train_yy = [check_match(i,model) for i in y_train]

        classifier = RandomForestClassifier(n_estimators=10)

        classifier.fit(X_train1, train_yy)

        with open(model_path,'wb') as f:
            pickle.dump(classifier,f)
        print(model +' training completed', end='\t')

    cog_funs[model] = classifier

In [22]:
def cf_predict(train_X):
    """Build the second-layer input from the first-layer classifiers.

    Calls ``predict_proba`` on every classifier stored in ``cog_funs``, visiting
    the patterns in sorted order, and keeps column 0 of each result.

    Returns an array with one row per sample of ``train_X`` and one column per
    pattern.
    """
    columns = []
    for pattern in sorted(cog_funs):
        probabilities = cog_funs[pattern].predict_proba(train_X)
        # NOTE(review): column 0 is presumably the "does not match" class (label 0) - confirm.
        columns.append(probabilities[:, 0])
    return np.array(columns).T

# The second layer model: random forest

The inputs of the second layer model should be
- ✔ cognitive functions, there are roughly 80 of them. With a bigger weight
- ❌ the features. 

We originally imagined the second layer as a neural network (NN) that takes both the cognitive functions and the features as input. However, we realized that a NN is too costly and not really necessary, since the first layer already has incredible accuracy. So we will use a simple _random forest_ for the second layer. 


In [24]:
# First-layer outputs become the second-layer inputs (train split first, then test split).
X_train2, X_test2 = map(cf_predict, (X_train1, X_test1))

In [25]:
from sklearn.metrics import accuracy_score

# deg1 becomes a mapping: single letter -> fitted second-layer classifier.
deg1 = dict.fromkeys(deg1)

# Fit one forest per letter; the target is whether the letter occurs in the true type.
for letter in deg1:
    has_letter_train = [letter in mbti for mbti in y_train]
    clf = RandomForestClassifier()
    clf.fit(X_train2, has_letter_train)
    deg1[letter] = clf
    print(letter, end=' ')

# Report the held-out accuracy of every letter's classifier.
for letter, clf in deg1.items():
    has_letter_test = [letter in mbti for mbti in y_test]
    predicted = clf.predict(X_test2)
    print(letter, accuracy_score(has_letter_test, predicted))

I E N S T F J P I 0.919433927130532
E 0.9193966854943827
N 0.9991434423685681
S 0.9992551672770157
T 0.8409037303705543
F 0.8412885606107628
J 0.7704301408975235
P 0.7715473899819999


## The above result will be my second layer model. 

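The results shown imply that we can be nearly certain about the `N versus S` dimension (about 0.999 accuracy) and fairly confident about `Introverts versus Extroverts` (about 0.92). The `T versus F` (about 0.84) and especially the `J versus P` (about 0.77) dimensions remain the most uncertain. 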