In [1]:
import pickle
import re
import string
import time
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose


# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides warnings raised by the libraries
# used below; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## read in all the pretrained models

In [2]:
# Names of all pretrained first-layer models. Every name maps to None until
# the loop below replaces it with the object unpickled from
# ../first_layer_models/<name>.pickle.
models = dict.fromkeys([
    "FiFe", "NeTe", "NiTe", "SeTi", "TeFi",
    "FvT", "NeTi", "NiTi", "SiFe", "TiFe",
    "IvE", "NiFe", "NvS", "SiFi", "TiFi",
    "NeFe", "NiFi", "PvJ", "SiSe", "TiTe",
    "NeFi", "NiNe", "SeFe", "SiTe",
    "NeSe", "NiSe", "SeFi", "SiTi",
    "NeSi", "NiSi", "SeTe", "TeFe",
])

# NOTE(review): pickle.load can execute arbitrary code while loading; only
# point this at model files you produced yourself.
for key in models:
    with open('../first_layer_models/' + key + '.pickle', 'rb') as f:
        models[key] = pickle.load(f)

## first layer prediction

### Transform a paragraph into features

In [3]:
# Preprocessing tools

ps = PorterStemmer()
wnl = WordNetLemmatizer()
str_punc = string.punctuation

# Stop-word lookup: NLTK's English list, plus a second copy of the same words
# with every ASCII punctuation character removed, merged into one set.
engstopwords = stopwords.words("english")
engstopwordsV2 = (
    ' '.join(engstopwords)
    .translate(str.maketrans('', '', string.punctuation))
    .split()
)

engstopwords = set(engstopwords) | set(engstopwordsV2)


In [4]:
# Lemmatize a word under three part-of-speech readings: adjective, verb, noun.
# Open question left by the author: is lemmatization really needed here?
def lemmatize_all_types(word):
    """Return `word` lemmatized as an adjective, then a verb, then a noun.

    Each pass receives the result of the previous one. Uses the module-level
    WordNetLemmatizer instance `wnl`.
    """
    for pos_tag in ('a', 'v', 'n'):
        word = wnl.lemmatize(word, pos_tag)
    return word

# Function to clean text
def clean(text):
    """Normalize a raw text sample into a space-separated string of lemmas.

    Steps, in order:
      1. Drop everything from "http" up to the next space, "|||" or end of
         text, then lowercase the result.
      2. Drop remaining URLs and bare domain names matched by `url_regex`.
      3. Replace ':' or ';' plus the character that follows with a space.
      4. Replace every ASCII punctuation character with a space.
      5. Replace digit runs (with any adjacent brackets) with a space.
      6. Delete typographic quotes, dots, ellipses and en dashes, then any
         character that is not a word character, whitespace, '(', '|' or ')'.
      7. Delete the substrings "gt" and "lt".
      8. Lemmatize every token and drop tokens found in `engstopwords`.

    Parameters
    ----------
    text : str
        Raw text sample.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Remove URLs from text
    text = re.sub(r"http.*?([ ]|\|\|\||$)", "", text).lower()

    # The pattern previously ended with a backslash at a physical line break
    # inside this raw string, which turned the intended trailing `\b` into
    # "escaped newline + literal b" and kept the bare-domain branch from
    # matching. The `\b` is restored here on a single line.
    url_regex = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    text = re.sub(url_regex, "", text)

    # Remove specific punctuation (usually associated with a word):
    # ':' or ';' together with the single character that follows it.
    text = re.sub(r'(:|;).', " ", text)

    # Remove punctuations
    text = re.sub('['+re.escape(str_punc)+']'," ",  text)

    # Remove numbers, together with any brackets/parentheses around them
    text = re.sub(r'(\[|\()*\d+(\]|\))*', ' ', text)

    # Remove string marks
    text = re.sub(r'[’‘“\.”…–]', '', text)
    text = re.sub(r'[^(\w|\s)]', '', text)
    # NOTE(review): this deletes "gt"/"lt" anywhere, including inside longer
    # words. Left unchanged because altering it would change the features fed
    # to the already-trained vectorizers.
    text = re.sub('(gt|lt)', '', text)

    # Lemmatize each word, then keep only the ones that are not stop words
    lemmas = (lemmatize_all_types(token) for token in text.split())
    return " ".join(lemma for lemma in lemmas if lemma not in engstopwords)
    

def process_classify_sample(modelObject, sample):
    """Classify one raw text sample with a single pretrained model bundle.

    Parameters
    ----------
    modelObject : mapping
        Must provide the keys 'cv' (vectorizer), 'labelEncoder' and 'model'.
    sample : str
        Raw text; it is passed through clean() before vectorizing.

    Returns
    -------
    tuple
        (letters_to_functions(decoded label), highest predicted probability).

    NOTE(review): letters_to_functions is not defined in the visible part of
    this notebook; confirm it exists before calling this function.
    """
    vec, encoder, estimator = (
        modelObject[part] for part in ('cv', 'labelEncoder', 'model')
    )

    # Preprocessing: clean the text and turn it into a dense feature row
    features = vec.transform([clean(sample)]).toarray()

    # Classification: predicted class index plus the top class probability
    predicted = estimator.predict(features)
    confidence = max(estimator.predict_proba(features)[0])
    decoded_label = encoder.inverse_transform(predicted)[0]
    return letters_to_functions(decoded_label), confidence




In [5]:
# Example input; the next cell runs it through clean() and a vectorizer.
sentence = 'I am a stupid guy'

In [6]:
# Vectorize the cleaned example with the 'FiFe' bundle's vectorizer.
# NOTE(review): this displays the whole dense array; showing `.shape` or the
# non-zero entries would keep the saved output small.
models['FiFe']['cv'].transform([clean(sentence)]).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [7]:
def first_layer_output_predict(sample):
    """Return one first-layer probability per pretrained model for `sample`.

    The sample is cleaned once with clean(). Every bundle in `models` then
    vectorizes it with its own 'cv' and contributes the first column of its
    predict_proba output for that single row.

    Returns a list of probabilities ordered alphabetically by model name.
    """
    cleaned = clean(sample)

    prob_by_model = {}
    for name, bundle in models.items():
        row = bundle['cv'].transform([cleaned]).toarray()
        prob_by_model[name] = bundle['model'].predict_proba(row)[0][0]

    # Alphabetical order keeps the feature layout independent of dict order.
    return [prob_by_model[name] for name in sorted(prob_by_model)]


## read in data

In [8]:
# List the contents of the raw-data directory.
# NOTE(review): absolute, machine-specific path; this cell only works on the
# original author's machine.
!ls /mnt/c/Users/haiya/Downloads/finalp/

 Emoticon_Dict.p		    mbti-type.zip
 Untitled.ipynb			    mbti_full_pull.csv
'data clean and inspection.ipynb'   mbti_full_pull_half.csv
 development.csv		    mbti_full_pull_half_test.csv
 emoticons.csv			    mbti_full_pull_half_train.csv
 full_pull_v2000000000000.csv	   'with emojis.csv'


In [9]:
# train = pd.read_csv('/mnt/c/Users/haiya/Downloads/finalp/mbti_full_pull_half_train.csv',index_col=0)
# test  = pd.read_csv('/mnt/c/Users/haiya/Downloads/finalp/mbti_full_pull_half_test.csv',index_col=0)


# train = train.sample(80000, replace=True)
# import time

In [10]:
def first_layer_output(Xtrain, ytrain):
    """Build the second-layer feature table from raw text samples.

    Parameters
    ----------
    Xtrain : sized iterable of str
        Raw text samples; each is passed to first_layer_output_predict().
    ytrain : array-like
        Labels aligned with `Xtrain`; stored unchanged in the 'label' column.

    Returns
    -------
    pandas.DataFrame
        One row per sample, one column per first-layer model (named in
        alphabetical order, the order first_layer_output_predict() returns),
        plus a final 'label' column.
    """
    # Alphabetical model names line up with first_layer_output_predict().
    column_names = sorted(models.keys())

    start = time.time()

    Xprob = []
    for i, sample in enumerate(Xtrain):
        Xprob.append(first_layer_output_predict(sample))

        # Progress report and elapsed seconds every 1000 samples
        if i % 1000 == 0:
            print(i,'out of', len(Xtrain), 'has been transformed')
            print(time.time() - start)

    # Name the columns at construction instead of renaming in place
    # afterwards; an empty input then still yields the expected columns.
    df = pd.DataFrame(Xprob, columns=column_names)

    df['label'] = ytrain

    return df

## Another thing to do is to convert the string labels to integers

In [11]:
# Xtrain = train['body'].values
# ytrain = train['mbti_type'].values
# Xtest = test['body'].values
# ytest = test['mbti_type'].values
# del train
# del test
# with open('mbti_to_number_encoder','wb') as f:
#     pickle.dump(le,f)
# le = preprocessing.LabelEncoder()
# ytrain = le.fit_transform(ytrain)
# ytest = le.transform(ytest)
# train_df = first_layer_output(Xtrain, ytrain)
# test_df  = first_layer_output(Xtest, ytest)
# ytrain = train_df['label'].values
# train_df.drop(['label'], axis=1, inplace=True)
# Xtrain = train_df.values

# ytest = test_df['label'].values
# test_df.drop(['label'], axis=1, inplace=True)
# Xtest = test_df.values
# torch.save(torch.from_numpy(Xtrain.astype('float32')),'../data/Xtrain.pt')
# torch.save(torch.from_numpy(ytrain), 'ytrain.pt')
# torch.save(torch.from_numpy(Xtest.astype('float32')),'../data/Xtest.pt')
# torch.save(torch.from_numpy(ytest), 'ytest.pt')

In [12]:
# Read the label encoder back from disk into `le`.
# Presumably the LabelEncoder fitted on the MBTI types in the commented-out
# preprocessing cell above; TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# created yourself.
with open('../data/mbti_to_number_encoder','rb') as f:
    le = pickle.load(f)




In [13]:
# Load the cached train/test tensors from ../data.
# Presumably the first-layer features (X*) and integer labels (y*) saved by
# the commented-out preprocessing cell above; TODO confirm. Note that cell
# writes the y tensors without the '../data/' prefix.
Xtrain = torch.load('../data/Xtrain.pt')
ytrain = torch.load('../data/ytrain.pt')
Xtest = torch.load('../data/Xtest.pt')
ytest = torch.load('../data/ytest.pt')


In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Load the development set and hold out 10% of it with a fixed seed.
# NOTE(review): X_train / X_test / y_train / y_test are not used by any later
# cell visible here; the models below use Xtrain / ytrain / Xtest / ytest.
df = pd.read_csv('../data/development.csv', index_col=0)
X = df['body']
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42)

In [35]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
# Second-layer baseline: a decision tree on the first-layer features.
# NOTE(review): no random_state is set, which likely explains why the
# repeated fits in the next cell report different accuracies.
clf = DecisionTreeClassifier(criterion='gini',max_depth=500)

In [45]:
# Refit the same tree on the same data 100 times and print the test accuracy
# of each fit; nothing changes between iterations except the refit itself.
for i in range(100):
    clf.fit(Xtrain.numpy(), ytrain.numpy())
    print(i, accuracy_score(ytest.numpy(), clf.predict(Xtest.numpy())))

0 0.269375
1 0.278375
2 0.271125
3 0.276125
4 0.270375
5 0.27175
6 0.27375
7 0.2745
8 0.26925
9 0.276375
10 0.276125
11 0.274875
12 0.2775
13 0.273375
14 0.271125
15 0.269875
16 0.27075
17 0.27225
18 0.27575
19 0.273
20 0.27525
21 0.27425
22 0.270375
23 0.27375
24 0.270875
25 0.27325
26 0.279125
27 0.273625
28 0.2745
29 0.279
30 0.271
31 0.269
32 0.273875
33 0.2735
34 0.2685
35 0.2785
36 0.274125
37 0.269125
38 0.276
39 0.27475
40 0.272125
41 0.275125
42 0.2715


KeyboardInterrupt: 

In [18]:
# Pair features with labels so the loaders yield (X, y) batches.
# NOTE(review): the cell that originally defined train_data / test_data is no
# longer in the notebook, so a fresh kernel raised NameError here. Wrapping
# the loaded tensors in TensorDataset is the presumed original construction;
# confirm.
train_data = TensorDataset(Xtrain, ytrain)
test_data = TensorDataset(Xtest, ytest)

# Batches of 500 samples, reshuffled on every pass.
train_dl = DataLoader(train_data,shuffle=True, batch_size=500)
test_dl = DataLoader(test_data,shuffle=True,batch_size=500)

In [53]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class NeuralN(nn.Module):
    """Feed-forward network mapping 32 input features to 16 output logits.

    Layer order: Linear(32, 32) -> ReLU -> Linear(32, 32) ->
    MaxPool1d(kernel 5, stride 1) -> SiLU -> Linear(28, 20) -> ReLU ->
    Linear(20, 16).
    """

    def __init__(self) -> None:
        super().__init__()

        pool_kernel = 5
        # A stride-1 max pool over 32 values leaves 32 - 5 + 1 = 28 values.
        pooled_width = 32 - pool_kernel + 1

        layers = [
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.MaxPool1d(pool_kernel, stride=1),
            nn.SiLU(),
            nn.Linear(pooled_width, 20),
            nn.ReLU(),
            nn.Linear(20, 16),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits produced by the layer stack for `x`."""
        return self.linear_relu_stack(x)

model = NeuralN().to(device)

In [54]:
# Cross-entropy loss over the model's logits, optimized with plain SGD at a
# learning rate of 0.1.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

In [55]:
def train(dataloader, model, loss_fn, optimizer, log_every=None):
    """Run one optimization pass over `dataloader`.

    Parameters
    ----------
    dataloader : iterable of (X, y) batches
    model : torch.nn.Module
        Put into training mode and updated in place.
    loss_fn : callable
        Called as loss_fn(pred, y); must return a scalar tensor.
    optimizer : torch.optim.Optimizer
        Optimizer over the model's parameters.
    log_every : int, optional
        When set, print the current loss every `log_every` batches. The
        default (None) prints nothing.

    Returns
    -------
    None
    """
    # Move batches to wherever the model lives instead of relying on a global
    # `device` defined in another cell.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(target_device), y.to(target_device)

        pred = model(X)
        loss = loss_fn(pred, y)

        # Standard step: clear old gradients, backpropagate, update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and batch % log_every == 0:
            size = len(dataloader.dataset)
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

In [56]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [57]:
# Number of training passes. The loop previously hard-coded range(200) while
# `epochs` was set to 20 and never read; the constant now drives the loop and
# keeps the 200 passes the loop actually ran.
epochs = 200
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, model, loss_fn, optimizer)
    test(test_dl, model, loss_fn)
print("Done!")
    

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 16.3%, Avg loss: 2.619456 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 22.1%, Avg loss: 2.228087 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 21.5%, Avg loss: 2.121699 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 20.8%, Avg loss: 2.109974 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.103401 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 22.0%, Avg loss: 2.099401 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.096186 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.093070 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 22.4%, Avg loss: 2.090246 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 23.2%, Avg loss: 2.088287 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 22.9%, Avg los

In [58]:
# Train for 100 more passes. `model` and `optimizer` are not re-created here,
# so this continues from whatever state the previous runs left behind; the
# epoch numbers printed below restart at 1.
for t in range(100):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dl, model, loss_fn, optimizer)
    test(test_dl, model, loss_fn)
print("Done!")
    

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.814872 

Epoch 2
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.814110 

Epoch 3
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.812954 

Epoch 4
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.813648 

Epoch 5
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.812658 

Epoch 6
-------------------------------
Test Error: 
 Accuracy: 38.6%, Avg loss: 1.814167 

Epoch 7
-------------------------------
Test Error: 
 Accuracy: 39.0%, Avg loss: 1.812439 

Epoch 8
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.812962 

Epoch 9
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg loss: 1.812491 

Epoch 10
-------------------------------
Test Error: 
 Accuracy: 38.9%, Avg loss: 1.811833 

Epoch 11
-------------------------------
Test Error: 
 Accuracy: 38.8%, Avg los