In [1]:
import pandas as pd
import numpy as np
import scipy as sp

import pickle
import datetime

from sklearn.ensemble import RandomForestClassifier

## Training the feature extraction 

In [19]:
# Read the training CSV; its first column is used as the DataFrame index.
train_csv_path = '../data/mbti_full_pull_half_train.csv'
df = pd.read_csv(train_csv_path, index_col=0)


In [40]:
# Keep at most the first 10000 rows of every mbti_type group. Slicing past the
# end of a shorter group simply returns the whole group, so no length check is
# needed.
labels_by_type = df.groupby("mbti_type").groups
new_indices = [
    label
    for labels in labels_by_type.values()
    for label in labels[:10000]
]
df = df.loc[new_indices]

body         87922
mbti_type    87922
dtype: int64

In [3]:
from Features import Features

In [4]:
modelName = '../models/features2021-12-11.model'

# Reuse a previously pickled feature extractor when it can be loaded; otherwise
# build a new Features object from the training posts and the stop-word list.
# NOTE(review): pickle.load can execute arbitrary code contained in the file,
# so only point modelName at files produced by this project.
try:
    with open(modelName,'rb') as f:
        feature_extractor = pickle.load(f)
except (OSError, pickle.UnpicklingError, AttributeError,
        EOFError, ImportError, IndexError) as load_error:
    # Only "file missing/unreadable" and the unpickling failures listed in the
    # pickle documentation trigger the fallback. A bare `except:` would also
    # swallow KeyboardInterrupt and unrelated bugs without any trace.
    print('Could not load ' + modelName + ' (' + repr(load_error) + '); building a new feature extractor')
    feature_extractor = Features(df.body, '../data/stopwords.txt')

# Training the first layers

## enumerating all the cognitive functions (With repeats)


In [5]:
from itertools import combinations

# Each two-character string holds the two opposing letters of one dichotomy.
types = ['IE','NS','TF','JP']

# degN lists every way of choosing N of the dichotomies (kept in `types` order)
# and then one letter from each chosen dichotomy.
deg1 = [letter for pair in types for letter in pair]

deg2 = [
    a + b
    for first, second in combinations(types, 2)
    for a in first
    for b in second
]

deg3 = [
    a + b + c
    for first, second, third in combinations(types, 3)
    for a in first
    for b in second
    for c in third
]

deg4 = [
    a + b + c + d
    for first, second, third, fourth in combinations(types, 4)
    for a in first
    for b in second
    for c in third
    for d in fourth
]

# All letter combinations, shortest first.
cog_funs = deg1 + deg2 + deg3 + deg4

def normalize(s):
    """Expand a letter combination into a fixed-width pattern string.

    For every entry of the module-level ``types`` list, in order, the result
    gets the entry's first letter if that letter occurs in ``s``, otherwise
    the entry's second letter if that one occurs in ``s``, otherwise the
    placeholder ``'_'``.
    """
    def pick(pair):
        # The first letter of the pair wins when both letters are present.
        if pair[0] in s:
            return pair[0]
        if pair[1] in s:
            return pair[1]
        return '_'

    return ''.join(pick(pair) for pair in types)

# Rewrite every entry as a pattern via normalize(), then print the patterns
# (tab-separated) followed by how many there are.
cog_funs = [normalize(fun) for fun in cog_funs]
print('\t'.join(cog_funs))
print(len(cog_funs))

I___	E___	_N__	_S__	__T_	__F_	___J	___P	IN__	IS__	EN__	ES__	I_T_	I_F_	E_T_	E_F_	I__J	I__P	E__J	E__P	_NT_	_NF_	_ST_	_SF_	_N_J	_N_P	_S_J	_S_P	__TJ	__TP	__FJ	__FP	INT_	INF_	IST_	ISF_	ENT_	ENF_	EST_	ESF_	IN_J	IN_P	IS_J	IS_P	EN_J	EN_P	ES_J	ES_P	I_TJ	I_TP	I_FJ	I_FP	E_TJ	E_TP	E_FJ	E_FP	_NTJ	_NTP	_NFJ	_NFP	_STJ	_STP	_SFJ	_SFP	INTJ	INTP	INFJ	INFP	ISTJ	ISTP	ISFJ	ISFP	ENTJ	ENTP	ENFJ	ENFP	ESTJ	ESTP	ESFJ	ESFP
80


There are redundant elements in the cog_funs listed above: for example, 'I___' and 'E___' carry the same information, since each is the complement of the other. I choose not to handle this repetition.

Now, what is left to do is to extract the features and then train a binary classifier for each cognitive function.

Suppose the feature extractor has already been trained and stored in a `../models/features______.model` file.

In [6]:
def check_match(y,y_):
    """Return 1 if ``y`` fits the pattern ``y_``, otherwise 0.

    Every character of ``y_`` is compared with the character of ``y`` at the
    same position; a ``'_'`` in the pattern accepts any character. Checking
    stops at the first position that disagrees.
    """
    fits = all(
        wanted == '_' or wanted == y[pos]
        for pos, wanted in enumerate(y_)
    )
    return 1 if fits else 0

In [7]:
# Sanity check: 'INTJ' fits the pattern '_NT_' but does not fit 'E___'.
check_match('INTJ','_NT_'), check_match('INTJ','E___')

(1, 0)

In [12]:
# Display the training DataFrame (columns: body, mbti_type) for inspection.
df

Unnamed: 0,body,mbti_type
572509,"- For any number of reasons, sex always compli...",ENTP
1754043,"I have several favorite books, but here are a ...",INTJ
704479,I have a great relationship with my ESFJ mom. ...,ENTJ
796469,"""Bitch, you best back the fuck off"" is the fee...",INTJ
1563634,Hm. It sounds like Si might be a better fit fo...,INFJ
...,...,...
1751260,"Patience lady, may I ask what is the context o...",INTJ
478158,I think the main sign is being in touch with o...,INFP
1774634,Complaining about my tone is exactly what you'...,INTJ
1309992,As far as i know about the thinking and feelin...,ENTP


## Training the first layer model

In [41]:
# Today's date (YYYY-MM-DD), used as a suffix in the saved model file names.
modelNameSuffix = datetime.date.today().isoformat()

# Labels are the raw mbti_type strings; the features come from the extractor.
train_y = df.mbti_type
train_X = feature_extractor.get_features(df.body)

def flatten_one_row(feature):
    """Turn one ``(tfidf, emoticon, topic)`` feature triple into a 1-D array.

    ``tfidf`` must provide ``.todense()``; its dense form is flattened and
    then joined end-to-end with the flattened ``emoticon`` and ``topic``
    parts.
    """
    sparse_tfidf, emoticon, topic = feature
    dense_tfidf = np.asarray(sparse_tfidf.todense()).ravel()
    # axis=None flattens every part before concatenating.
    return np.concatenate((dense_tfidf, emoticon, topic), axis=None)

# Flatten every per-row feature triple and collect the rows in a single array.
train_X = np.array(list(map(flatten_one_row, train_X)))

In [42]:
# Fixed seed so that re-running this cell rebuilds the same forests; without
# it every run would pickle different models.
FIRST_LAYER_SEED = 0

# Train one binary random forest per pattern in cog_funs and pickle each one to
# ../models/first_layer/<pattern><date suffix>.model
for cog_fun in cog_funs:

    # 1 for rows whose MBTI type fits the pattern, 0 for all other rows
    train_yy = [check_match(mbti, cog_fun) for mbti in train_y]

    classifier = RandomForestClassifier(n_estimators=8, random_state=FIRST_LAYER_SEED)
    # todo: perhaps find a better classifier? 

    classifier.fit(train_X, train_yy)

    with open('../models/first_layer/'  + cog_fun + modelNameSuffix + '.model','wb') as f:
        pickle.dump(classifier,f)
    print(cog_fun +' training completed')
    

I___ training completed
E___ training completed
_N__ training completed
_S__ training completed
__T_ training completed
__F_ training completed
___J training completed
___P training completed
IN__ training completed
IS__ training completed
EN__ training completed
ES__ training completed
I_T_ training completed
I_F_ training completed
E_T_ training completed
E_F_ training completed
I__J training completed
I__P training completed
E__J training completed
E__P training completed
_NT_ training completed
_NF_ training completed
_ST_ training completed
_SF_ training completed
_N_J training completed
_N_P training completed
_S_J training completed
_S_P training completed
__TJ training completed
__TP training completed
__FJ training completed
__FP training completed
INT_ training completed
INF_ training completed
IST_ training completed
ISF_ training completed
ENT_ training completed
ENF_ training completed
EST_ training completed
ESF_ training completed
IN_J training completed
IN_P training co

In [43]:
# Re-read the complete training CSV (first column as index) and run the
# feature extractor over its whole `body` column.
full_df = pd.read_csv(train_csv_path, index_col=0)
full_train_X = feature_extractor.get_features(full_df.body)

In [44]:
# Flatten every per-row feature triple and collect the rows in a single array.
full_train_X = np.array([flatten_one_row(row) for row in full_train_X ])
# NOTE(review): np.save appends ".npy" when the given name does not already end
# with it, so this writes "../data/train_x.np.npy" -- confirm that this is the
# intended file name.
np.save("../data/train_x.np",full_train_X)

  arr = np.asanyarray(arr)


# The second layer model

The inputs of the second layer model should be
- the cognitive functions (there are roughly 80 of them), with a bigger weight
- the dimension-reduced output of the features. The features supposedly have a really large dimension of about 10k; perhaps we should reduce it down to 1k?

And the layers of the NN should be
- a few linear layers, plus activations

I have already written some code in `development.ipynb`. That code should be moved here.