In [1]:
import numpy as np
import pandas as pd
import re
import string
import pickle
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
import warnings

from torch.utils.data import TensorDataset, DataLoader
import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
import matplotlib.pyplot as plt


warnings.filterwarnings("ignore")

## read in all the pretrained models

In [2]:
# names of all pretrained models
models = {
    "FiFe":None,  
    "NeTe":None,  
    "NiTe":None,  
    "SeTi":None,  
    "TeFi":None,
    "FvT":None,  
    "NeTi":None,  
    "NiTi":None,
    "SiFe":None,
    "TiFe":None,
    "IvE":None,  
    "NiFe":None,  
    "NvS":None,  
    "SiFi":None,  
    "TiFi":None,
    "NeFe":None,  
    "NiFi":None,  
    "PvJ":None,  
    "SiSe":None,  
    "TiTe":None,
    "NeFi":None,  
    "NiNe":None,  
    "SeFe":None,  
    "SiTe":None,  
    "NeSe":None,  
    "NiSe":None,  
    "SeFi":None,  
    "SiTi":None,
    "NeSi":None,  
    "NiSi":None,  
    "SeTe":None,  
    "TeFe":None }
    
for key in models:
    with open('../first_layer_models/' + key + '.pickle', 'rb') as f:
        models[key] = pickle.load(f)

## first layer prediction

### transfer a paragraph into features

In [3]:
# Preprocessing tools

ps = PorterStemmer()
wnl = WordNetLemmatizer()
str_punc = string.punctuation

engstopwords = stopwords.words("english")
engstopwordsV2 = re.sub('[' + re.escape(string.punctuation) + ']', '',
                        ' '.join(engstopwords)).split()

engstopwords = set(engstopwords).union(set(engstopwordsV2))


In [4]:
# Function to lemmatize a word using the three types: adjective, verb, noun
# Do i really need to use lemmatization? 
def lemmatize_all_types(word):
    word = wnl.lemmatize(word, 'a')
    word = wnl.lemmatize(word, 'v')
    word = wnl.lemmatize(word, 'n')
    return word

# Function to clean text
def clean(text):
    # Remove URLs from text
    text = re.sub("http.*?([ ]|\|\|\||$)", "", text).lower()
    url_regex = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    text = re.sub(url_regex, "", text)

    # Remove specific punctuation (usually associated with a word)
    text = re.sub(r'(:|;).', " ", text)
    
    # Remove punctuations
    text = re.sub('['+re.escape(str_punc)+']'," ",  text)
    
    # Remove parantheses, brackets
    text = re.sub('(\[|\()*\d+(\]|\))*', ' ', text)
    
    # Remove string marks
    text = re.sub('[’‘“\.”…–]', '', text)
    text = re.sub('[^(\w|\s)]', '', text)
    text = re.sub('(gt|lt)', '', text)
    
    #Check that each word is not stopword, and lemmatize it
    text = list(map(lemmatize_all_types, text.split()))
    text = [word for word in text if (word not in engstopwords)]
    text = " ".join(text)
    return text
    

def process_classify_sample(modelObject, sample):
    vectorizer = modelObject['cv']
    label_encoder = modelObject['labelEncoder']
    model = modelObject['model']
    
    # Preprocessing
    clean_sample = clean(sample)
    x = vectorizer.transform([clean_sample]).toarray()
    
    # Classification
    y = model.predict(x)
    y_probability = max(model.predict_proba(x)[0])
    classified_cf = label_encoder.inverse_transform(y)[0]
    return letters_to_functions(classified_cf), y_probability




In [6]:
sentence = 'what the heck, this is so sad'

In [7]:
models['FiFe']['cv'].transform([clean(sentence)]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [9]:
df = pd.read_csv('../data/mbti_1.csv')

In [15]:
df['cleaned'] = df['posts'].apply(clean)

In [16]:
for model in models:
    
    def new_func(sentence):
        x = models[model]['cv'].transform([sentence]).toarray()
        yp = models[model]['model'].predict_proba(x)[0][0]
        return yp
    
    df[model] = df['cleaned'].apply(new_func)
    

In [20]:
with open('../data/mbti_to_number_encoder','rb') as f:
    label_encoder = pickle.load(f)

In [22]:
X = df[models.keys()]
y = label_encoder.transform(df['type'])

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
clf = DecisionTreeClassifier(criterion='gini',max_depth=500)

In [31]:
for i in range(100):
    clf.fit(X_train, y_train)
    print(i, accuracy_score(y_test, clf.predict(X_test)))

0 0.5072046109510087
1 0.4974063400576369
2 0.5002881844380404
3 0.5037463976945245
4 0.4956772334293948
5 0.4962536023054755
6 0.5118155619596542
7 0.5129682997118156
8 0.5043227665706052
9 0.5025936599423632
10 0.5031700288184437
11 0.5002881844380404
12 0.4974063400576369
13 0.4968299711815562
14 0.5043227665706052
15 0.5025936599423632
16 0.5002881844380404
17 0.49452449567723344
18 0.4974063400576369
19 0.5037463976945245
20 0.49855907780979825
21 0.5002881844380404
22 0.515850144092219
23 0.5054755043227666
24 0.4979827089337176
25 0.5048991354466859
26 0.5112391930835735
27 0.5014409221902018
28 0.4968299711815562
29 0.4974063400576369
30 0.5043227665706052
31 0.5025936599423632
32 0.5083573487031701
33 0.5129682997118156
34 0.5037463976945245
35 0.5031700288184437
36 0.5060518731988473
37 0.5054755043227666
38 0.49279538904899134
39 0.5020172910662825
40 0.5083573487031701
41 0.506628242074928
42 0.5002881844380404
43 0.49855907780979825
44 0.4974063400576369
45 0.5158501440922

In [45]:
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=500)

clf.fit(X_train,y_train)

clf.score(X_test,y_test)

0.631700288184438

In [51]:
train_data_set = TensorDataset(
    torch.from_numpy(X_train.values.astype('float32')), 
    torch.from_numpy(y_train))
test_data_set = TensorDataset(
    torch.from_numpy(X_test.values.astype('float32')), 
    torch.from_numpy(y_test))

In [65]:
len(train_data_set)

6940

In [58]:
train_dl = DataLoader(train_data_set,shuffle=True, batch_size=1000)
test_dl = DataLoader(test_data_set,shuffle=True,batch_size=1000)

In [59]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NeuralN(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32,32),
            nn.MaxPool1d(5, stride=1),
            nn.SiLU(),
            nn.Linear(32-5+1,20),
            nn.ReLU(),
            nn.Linear(20,16)
        )
    
    def forward(self,x):
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralN().to(device)

In [60]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

In [61]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X,y) in enumerate(dataloader):
        X,y = X.to(device), y.to(device)
        
        pred = model(X)
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # if batch % 100 == 0:
        #     loss, current = loss.item(), batch * len(X)
        #     print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [62]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [63]:
epochs = 20
for t in range(200):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, model, loss_fn, optimizer)
    test(test_dl, model, loss_fn)
print("Done!")
    

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 16.9%, Avg loss: 2.718391 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 16.6%, Avg loss: 2.653565 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 16.6%, Avg loss: 2.592096 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.528291 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.463693 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.400086 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.347029 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.302702 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.271373 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg loss: 2.253566 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 21.3%, Avg los