In [1]:
import datetime
import os
import pickle
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose

warnings.filterwarnings("ignore")

## Read in all the pretrained models

In [2]:
# Load the pickled feature extractor.

# Location of the serialized extractor, relative to the notebook's working directory.
feature_extractor_path = '../models/features2021-12-11.model'

# NOTE(security): pickle.load can execute arbitrary code — only open trusted model files.
with open(feature_extractor_path,'rb') as f:
    feature_extractor = pickle.load(f)


In [3]:
# Directory holding the pickled first-layer models, and the version tag that selects
# which files in it are loaded (matched as a substring of the file name).
first_layer_model_path = '../models/first_layer/'
first_layer_model_version = '2021-12-13'
# ❗ presumably a reminder to keep this version tag in sync with the files on disk — confirm

# Maps the first four characters of each matching file name (e.g. 'I___') to the
# unpickled model.
first_layer_model = dict()
# os.listdir() returns entries in arbitrary order; sorting fixes the dict's insertion
# order, and with it the column order produced by anything that iterates this dict.
for modelname in sorted(os.listdir(first_layer_model_path)):
    if first_layer_model_version in modelname:
        # NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
        with open(os.path.join(first_layer_model_path, modelname), 'rb') as f:
            first_layer_model[modelname[:4]] = pickle.load(f)

In [17]:
# Load the MBTI table; the first CSV column is used as the DataFrame index.
df = pd.read_csv('../data/mbti_1.csv', index_col=0)

In [18]:
# Run the pickled feature extractor over the `body` column.
# Each element of the result is later unpacked as (tfidf, emoticon, topic) by flatten_one_row.
train_X = feature_extractor.get_features(df.body)

In [49]:
def flatten_one_row(feature):
    """Collapse one (tfidf, emoticon, topic) feature triple into a flat 1-D array.

    Parameters
    ----------
    feature : sequence of length 3
        ``(tfidf, emoticon, topic)``. ``tfidf`` must expose ``.todense()``;
        the other two parts are anything ``np.concatenate`` accepts.

    Returns
    -------
    numpy.ndarray
        The densified tf-idf values followed by the emoticon and topic values,
        all flattened into a single 1-D vector.
    """
    sparse_tfidf, emoticon_part, topic_part = feature
    # Densify first so the tf-idf part can be joined with the other arrays.
    dense_tfidf = np.asarray(sparse_tfidf.todense()).ravel()
    pieces = (dense_tfidf, emoticon_part, topic_part)
    # axis=None flattens every piece before joining them end to end.
    return np.concatenate(pieces, axis=None)

# Flatten every feature triple and stack the results into one array.
# NOTE: this overwrites train_X with its flattened form, so the cell cannot be re-run
# without first re-running the feature-extraction cell.
train_X = np.array([flatten_one_row(row) for row in train_X])


In [67]:
# Exploratory check: print the documentation of the first-layer model stored under key 'I___'.
help(first_layer_model['I___'])

Help on RandomForestClassifier in module sklearn.ensemble._forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------

In [100]:
def cog_funs(first_layer_model,train_X):
    """Stack one log-probability column per first-layer model.

    Parameters
    ----------
    first_layer_model : dict
        Maps a model name to a fitted classifier exposing ``predict_log_proba``.
    train_X : array-like
        Feature matrix handed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        One column per model, taken in sorted-key order. Column j holds the first
        column (class index 0) of the j-th model's ``predict_log_proba`` output.
        An empty dict yields an empty array.

    Notes
    -----
    NOTE(review): log-probabilities are -inf wherever a model assigns probability 0;
    confirm that downstream consumers tolerate non-finite values.
    """
    # Iterate in sorted-key order so the column layout does not depend on the order
    # in which the models happened to be inserted into the dict.
    columns = [
        first_layer_model[name].predict_log_proba(train_X)[:, 0]
        for name in sorted(first_layer_model)
    ]
    # Each entry is one model's per-sample vector; transpose to put samples on rows.
    return np.array(columns).T
    


In [104]:
# Score the flattened features with every first-layer model (one output column per model).
train_X_cog_funs = cog_funs(first_layer_model,train_X)

In [106]:
# Append the first-layer log-probabilities as extra columns to the right of the base features.
# A single column-wise concatenation replaces the per-row Python loop; unlike zip(), it
# raises when the two arrays disagree on the number of rows instead of silently truncating.
# NOTE: this overwrites train_X, so re-running the cell appends the columns a second time.
train_X = np.concatenate([train_X, train_X_cog_funs], axis=1)

In [None]:
# All 16 MBTI codes, built letter by letter: E/I, then N/S, then F/T, then J/P.
# The nested loops yield them in alphabetical order (ENFJ first, ISTP last).
mbti_types = [
    ei + ns + ft + jp
    for ei in 'EI'
    for ns in 'NS'
    for ft in 'FT'
    for jp in 'JP'
]
# Lookup tables between a type code and its position in mbti_types.
int2type = dict(enumerate(mbti_types))
type2int = {name: idx for idx, name in int2type.items()}

In [129]:
# Encode each label in the `mbti_type` column as its integer index via type2int
# (a label missing from the table raises KeyError).
train_y = df.mbti_type.apply(lambda x:type2int[x]).values

## Read in the data

In [10]:
def first_layer_output(Xtrain, ytrain):
    """Transform every sample with the first-layer models and attach the labels.

    Parameters
    ----------
    Xtrain : sequence
        Samples; each one is passed to ``first_layer_output_predict``.
    ytrain : array-like
        Labels, stored in the returned frame's ``label`` column.

    Returns
    -------
    pandas.DataFrame
        One row per sample: the per-sample outputs, with columns renamed
        positionally to the sorted keys of ``models``, plus a ``label`` column.

    Notes
    -----
    NOTE(review): ``first_layer_output_predict`` and ``models`` are not defined in
    any visible cell; both must exist in the notebook namespace before calling this.
    """
    # Reference point for the elapsed-time progress print below.
    start = time.time()
    Xprob = []

    for i,sample in enumerate(Xtrain):
        Xprob.append(first_layer_output_predict(sample))

        # Progress report every 1000 samples (including the very first one).
        if i % 1000 == 0:
            print(i,'out of', len(Xtrain), 'has been transformed')
            print(time.time() - start)

    df = pd.DataFrame(Xprob)

    # Rename the positional columns to the sorted model names.
    df = df.rename(columns=dict(zip(df.columns, sorted(models.keys()))))

    df['label'] = ytrain

    return df

## Convert the labels to integers

In [11]:
# --- One-off preprocessing script, kept commented out for reference ---
# It label-encodes the targets, pushes the train/test texts through first_layer_output(),
# and saves the resulting tensors with torch.save().
# NOTE(review): as written the lines are out of order — `le` is pickled before it is
# created and fitted. The encoder and the y-tensors are also written to the current
# directory ('mbti_to_number_encoder', 'ytrain.pt', 'ytest.pt'), while the loading cells
# read them from '../data/'. Fix both before re-enabling.
# Xtrain = train['body'].values
# ytrain = train['mbti_type'].values
# Xtest = test['body'].values
# ytest = test['mbti_type'].values
# del train
# del test
# with open('mbti_to_number_encoder','wb') as f:
#     pickle.dump(le,f)
# le = preprocessing.LabelEncoder()
# ytrain = le.fit_transform(ytrain)
# ytest = le.transform(ytest)
# train_df = first_layer_output(Xtrain, ytrain)
# test_df  = first_layer_output(Xtest, ytest)
# ytrain = train_df['label'].values
# train_df.drop(['label'], axis=1, inplace=True)
# Xtrain = train_df.values

# ytest = test_df['label'].values
# test_df.drop(['label'], axis=1, inplace=True)
# Xtest = test_df.values
# torch.save(torch.from_numpy(Xtrain.astype('float32')),'../data/Xtrain.pt')
# torch.save(torch.from_numpy(ytrain), 'ytrain.pt')
# torch.save(torch.from_numpy(Xtest.astype('float32')),'../data/Xtest.pt')
# torch.save(torch.from_numpy(ytest), 'ytest.pt')

In [12]:
# read the label encoder
# Restore the pickled label encoder (MBTI label <-> integer, judging by the file name).
# NOTE(security): pickle.load can execute arbitrary code — only open trusted files.
with open('../data/mbti_to_number_encoder','rb') as f:
    le = pickle.load(f)




In [13]:
# Load the cached train/test tensors from ../data/.
# NOTE: these rebind Xtrain/ytrain/Xtest/ytest at notebook level; later cells call
# .numpy() on them, so they are expected to be torch tensors.
Xtrain = torch.load('../data/Xtrain.pt')
ytrain = torch.load('../data/ytrain.pt')
Xtest = torch.load('../data/Xtest.pt')
ytest = torch.load('../data/ytest.pt')


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Load the development set and hold out 10% of the rows as a test split (fixed seed 42).
# NOTE: this rebinds `df`, replacing the mbti_1.csv frame loaded earlier; the label
# column here is `type`, not `mbti_type`.
df = pd.read_csv('../data/development.csv', index_col=0)
X = df['body']
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42)

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Baseline classifier: a Gini decision tree allowed to grow up to 500 levels deep.
# No random_state is set here.
clf = DecisionTreeClassifier(criterion='gini',max_depth=500)

In [45]:
# The tensors never change inside the loop, so convert them to NumPy once up front.
Xtrain_np, ytrain_np = Xtrain.numpy(), ytrain.numpy()
Xtest_np, ytest_np = Xtest.numpy(), ytest.numpy()

# Refit the tree 100 times to see how much test accuracy varies between fits.
# Each fit gets its own fixed seed, so the spread of accuracies is reproducible
# across runs instead of changing on every execution.
for i in range(100):
    clf.set_params(random_state=i)
    clf.fit(Xtrain_np, ytrain_np)
    print(i, accuracy_score(ytest_np, clf.predict(Xtest_np)))

0 0.269375
1 0.278375
2 0.271125
3 0.276125
4 0.270375
5 0.27175
6 0.27375
7 0.2745
8 0.26925
9 0.276375
10 0.276125
11 0.274875
12 0.2775
13 0.273375
14 0.271125
15 0.269875
16 0.27075
17 0.27225
18 0.27575
19 0.273
20 0.27525
21 0.27425
22 0.270375
23 0.27375
24 0.270875
25 0.27325
26 0.279125
27 0.273625
28 0.2745
29 0.279
30 0.271
31 0.269
32 0.273875
33 0.2735
34 0.2685
35 0.2785
36 0.274125
37 0.269125
38 0.276
39 0.27475
40 0.272125
41 0.275125
42 0.2715


KeyboardInterrupt: 

In [18]:
# `train_data` / `test_data` are not created by any visible cell, so a fresh kernel
# fails here with NameError. Rebuild them from the tensors loaded from ../data/.
# NOTE(review): presumably this is what the missing cell did — confirm.
train_data = TensorDataset(Xtrain, ytrain)
test_data = TensorDataset(Xtest, ytest)

# Batches of 500 samples. Training batches are reshuffled every epoch; evaluation
# keeps a fixed order because shuffling there adds nothing and makes runs differ.
train_dl = DataLoader(train_data, shuffle=True, batch_size=500)
test_dl = DataLoader(test_data, shuffle=False, batch_size=500)

In [53]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


class NeuralN(nn.Module):
    """Small feed-forward classifier mapping 32 input features to 16 output logits.

    The layers live in ``self.linear_relu_stack``: two 32-unit linear layers, a
    stride-1 max-pool of width 5 over the last axis, then linear layers of 20
    and 16 units, with ReLU / SiLU activations in between.
    """

    def __init__(self) -> None:
        super().__init__()

        pool_kernel = 5
        # Length left after a stride-1 max-pool of width `pool_kernel` over 32 values.
        pooled_width = 32 - pool_kernel + 1

        layers = [
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.MaxPool1d(pool_kernel, stride=1),
            nn.SiLU(),
            nn.Linear(pooled_width, 20),
            nn.ReLU(),
            nn.Linear(20, 16),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for ``x``."""
        return self.linear_relu_stack(x)


# Single shared model instance, placed on the selected device.
model = NeuralN().to(device)

In [54]:
# Cross-entropy loss applied to the model's raw outputs, optimised with plain SGD
# at a learning rate of 0.1 over all of `model`'s parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

In [55]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch yielded by ``dataloader``.

    Puts ``model`` in training mode, moves each (inputs, targets) batch to the
    module-level ``device``, and takes one optimizer step per batch. Returns None.
    """
    # Only referenced by the optional progress log at the bottom of the loop.
    n_samples = len(dataloader.dataset)
    model.train()
    for step, batch in enumerate(dataloader):
        inputs, targets = (part.to(device) for part in batch)

        batch_loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, back-propagate this batch, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Optional progress log, disabled:
        # if step % 100 == 0:
        #     loss, current = batch_loss.item(), step * len(inputs)
        #     print(f"loss: {loss:>7f}  [{current:>5d}/{n_samples:>5d}]")

In [56]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean batch loss.

    Runs in eval mode with gradients disabled. Accuracy is the fraction of dataset
    samples whose arg-max prediction equals the target; the reported loss is the
    per-batch loss averaged over the number of batches. Returns None.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()
    total_loss = 0
    n_correct = 0
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            scores = model(X)
            total_loss += loss_fn(scores, y).item()
            hits = (scores.argmax(1) == y).type(torch.float)
            n_correct += hits.sum().item()
    test_loss = total_loss / n_batches
    correct = n_correct / n_samples
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [57]:
# Number of full passes over the training data. `epochs` used to be set to 20 but was
# ignored by a hard-coded range(200); it now drives the loop and keeps the 200 passes
# the loop actually ran.
epochs = 200
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    # One optimisation pass over the training batches, then an evaluation on the test set.
    train(train_dl, model, loss_fn, optimizer)
    test(test_dl, model, loss_fn)
print("Done!")
    

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 16.3%, Avg loss: 2.619456 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 22.1%, Avg loss: 2.228087 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 21.5%, Avg loss: 2.121699 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 20.8%, Avg loss: 2.109974 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.103401 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 22.0%, Avg loss: 2.099401 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.096186 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.093070 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.090246 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 23.2%, Avg loss: 2.088287 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 22.9%, Avg los

In [58]:
# Train the existing `model` / `optimizer` for 100 more passes. Nothing is re-created
# here, so this continues from whatever state the model is already in, and the printed
# epoch counter restarts at 1.
for t in range(100):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, model, loss_fn, optimizer)
    test(test_dl, model, loss_fn)
print("Done!")
    

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.814872 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.814110 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.812954 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.813648 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.812658 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.814167 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 39.0%, Avg loss: 1.812439 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.812962 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.812491 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.811833 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg los