In [1]:
import numpy as np
import pandas as pd
import re
import string
import pickle
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
import warnings

from torch.utils.data import TensorDataset, DataLoader
import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt
import datetime
import os

warnings.filterwarnings("ignore")

## Load the pretrained models

In [2]:
# Load the pickled feature extractor; its get_features() method is applied
# to the text column further down.
# NOTE: pickle.load can execute arbitrary code — only load trusted artifacts.

feature_extractor_path = '../models/features2021-12-11.model'

with open(feature_extractor_path, mode='rb') as f:
    feature_extractor = pickle.load(f)


In [3]:
# Directory holding the pickled first-layer classifiers, and the version tag
# (a substring of the file name) that selects which of them to load.
first_layer_model_path = '../models/first_layer/'
first_layer_model_version = '2021-12-13'

# Maps the first four characters of each matching file name to its unpickled model.
# os.listdir returns entries in arbitrary order, so iterate in sorted order: the
# dict's insertion order determines the column order produced by cog_funs and
# must be reproducible from run to run.
# NOTE: pickle.load can execute arbitrary code — only load trusted model files.
first_layer_model = dict()
for modelname in sorted(os.listdir(first_layer_model_path)):
    if first_layer_model_version in modelname:
        with open(os.path.join(first_layer_model_path, modelname), 'rb') as f:
            first_layer_model[modelname[:4]] = pickle.load(f)

In [17]:
# Training data; the first CSV column becomes the row index. The `body` and
# `mbti_type` columns of this frame are used below.
train_csv = '../data/mbti_1.csv'
df = pd.read_csv(filepath_or_buffer=train_csv, index_col=0)

In [18]:
# Run the pretrained extractor over the text column to get per-row features.
train_X = feature_extractor.get_features(df['body'])

In [None]:
def flatten_one_row(feature):
    """Flatten one extracted feature triple into a single 1-D array.

    Args:
        feature: a 3-item sequence ``(tfidf, emoticon, topic)``. ``tfidf`` must
            expose ``todense()``; the other two are array-likes.

    Returns:
        1-D ``numpy.ndarray`` holding the densified tf-idf values followed by
        the emoticon values and then the topic values.
    """
    sparse_tfidf, emoticon, topic = feature
    # Densify first, then collapse the resulting 2-D matrix to one dimension.
    dense_tfidf = np.asarray(sparse_tfidf.todense()).ravel()
    return np.concatenate((dense_tfidf, emoticon, topic), axis=None)

# Flatten every feature triple and stack the rows into one 2-D matrix.
train_X = np.array(list(map(flatten_one_row, train_X)))

In [21]:
# Sanity check: dimensions of the flattened feature matrix.
train_X.shape

(8675, 9503)

In [22]:
def cog_funs(first_layer_model,train_X):
    """Build one feature column per first-layer model.

    Args:
        first_layer_model: mapping whose values expose
            ``predict_log_proba(X)``; the keys are not used.
        train_X: feature matrix passed unchanged to every model.

    Returns:
        Array with one row per sample and one column per model (in the
        mapping's iteration order), each column being column 0 of that
        model's ``predict_log_proba`` output.
    """
    per_model_columns = [
        clf.predict_log_proba(train_X)[:, 0]
        for clf in first_layer_model.values()
    ]
    # Stacked as (n_models, n_samples); transpose to get samples on the rows.
    return np.array(per_model_columns).T

In [23]:
# One log-probability column per first-layer model, computed on the flat features.
train_X_cog_funs = cog_funs(first_layer_model=first_layer_model, train_X=train_X)

In [24]:
# Append the first-layer model outputs to each sample's feature row.
# NOTE: this overwrites train_X, so re-running the cell appends the columns again.
train_X = np.array([
    np.concatenate([features, log_probs], axis=None)
    for features, log_probs in zip(train_X, train_X_cog_funs)
])

In [25]:
# The 16 MBTI codes, enumerated letter by letter (E/I, N/S, F/T, J/P), which
# yields the order ENFJ, ENFP, ENTJ, ..., ISTJ, ISTP.
mbti_types = [
    ei + ns + ft + jp
    for ei in 'EI'
    for ns in 'NS'
    for ft in 'FT'
    for jp in 'JP'
]
# Lookup tables between a type code and its integer class label.
type2int = dict(zip(mbti_types, range(len(mbti_types))))
int2type = dict(enumerate(mbti_types))

In [26]:
# Integer class label for every row (raises KeyError on an unknown type code).
train_y = df['mbti_type'].apply(lambda mbti: type2int[mbti]).values

In [81]:
# Wrap features and labels as tensors. dtypes are explicit: float32 inputs for
# the linear layers, and int64 ("long") class indices as nn.CrossEntropyLoss
# requires (the platform-default NumPy integer is not int64 everywhere).
train_dataset = TensorDataset(
    torch.as_tensor(train_X, dtype=torch.float32),
    torch.as_tensor(train_y, dtype=torch.long),
)
# Reshuffle every epoch so SGD does not see the samples in the same fixed
# file order on each pass.
train_dl = DataLoader(train_dataset, batch_size=50, shuffle=True)

In [82]:
# Show the tensor shapes of the first batch only (nothing is printed if the
# loader yields no batches).
first_batch = next(iter(train_dl), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape)

torch.Size([50, 9583]) torch.Size([50])


In [87]:
# Device on which the model and every batch are placed.
# PyTorch's accelerator device type is 'cuda'; 'gpu' is not a valid device string.
# Training is pinned to the CPU by default; set FORCE_CPU = False to use CUDA
# whenever it is available.
FORCE_CPU = True
device = 'cuda' if (torch.cuda.is_available() and not FORCE_CPU) else 'cpu'

class NeuralN(nn.Module):
    """Fully connected classifier: Linear+ReLU hidden layers, then a linear logit layer.

    Args:
        in_features: width of each input row. Defaults to 9583, the feature
            width of the training batches in this notebook.
        hidden_sizes: output width of each hidden layer, in order.
        num_classes: number of output logits. Defaults to 16.

    Calling ``NeuralN()`` with no arguments builds the same sequence of Linear
    layers as before, except that every hidden Linear is now followed by a
    ReLU (previously the 1000->200 layer fed the 200->200 layer directly).
    """

    def __init__(
        self,
        in_features: int = 9583,
        hidden_sizes=(10000, 5000, 5000, 2000, 2000, 1000, 200, 200, 100),
        num_classes: int = 16,
    ) -> None:
        super().__init__()

        layers = []
        width_in = in_features
        for width_out in hidden_sizes:
            # Two Linear layers with no activation in between collapse into a
            # single affine map, so each hidden Linear gets its own ReLU.
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.ReLU())
            width_in = width_out
        # The last layer emits raw logits; no softmax is applied here.
        layers.append(nn.Linear(width_in, num_classes))

        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self,x):
        """Return the class logits for the input batch ``x``."""
        logits = self.linear_relu_stack(x)
        return logits

# Build the network, then move its parameters onto the selected device.
model = NeuralN()
model = model.to(device)

In [88]:
# Cross-entropy on the model's output logits, optimised with plain SGD (lr = 0.1).
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.1)

In [89]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one optimisation pass (epoch) over ``dataloader``.

    Args:
        dataloader: iterable yielding ``(X, y)`` batches of tensors.
        model: the ``nn.Module`` to optimise; it is put into training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: optimizer bound to ``model``'s parameters.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has none), so the
            function does not depend on a notebook-global ``device``.

    Returns:
        float: mean of the per-batch losses, or ``0.0`` if there were no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    total_loss = 0.0
    num_batches = 0
    for X, y in dataloader:
        X, y = X.to(device), y.to(device)

        pred = model(X)
        loss = loss_fn(pred, y)

        # Clear stale gradients before backpropagating this batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0

In [90]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [91]:
# Train for a fixed number of epochs, printing metrics after each one.
# NOTE: test() is called on train_dl, so the printed "Test Error" is measured
# on the training data, not on held-out data.
epochs = 100
for t in range(epochs):
    header = f"Epoch {t+1}\n-------------------------------"
    print(header)
    train(dataloader=train_dl, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=train_dl, model=model, loss_fn=loss_fn)
print("Done!")
    

Epoch 1
-------------------------------


KeyboardInterrupt: 