In [None]:
%pip install pandas
%pip install numpy
%pip install torch

In [None]:
import pandas as pd
import numpy as np
import torch
import math
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch import nn
import time
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

In [None]:
# reads a binary classification raw data file stored as a CSV of format
#   type    post1   post2   post3   ...   post50
#   <mbti>  <post>  <post>  <post>  ...   <post>
#   ...     ...     ...     ...     ...   ...
rawdf = pd.read_csv("mbti_cleaned_TF.csv", dtype=str)
print(rawdf)

     type                                              post1  \
0       F  'Yes peace is the absence of conflict - your I...   
1       F  'Yes, I have gone completely cold hearted towa...   
2       F  'Why not both? It's more fun to be open to dif...   
3       F  Yeah, you're not gonna win her over with logic...   
4       F                                          'Explain?   
...   ...                                                ...   
7673    T  'Whats up guys.   The other night I was thinki...   
7674    T  'I've been told I have a death glare.  My wife...   
7675    T  'No, I wouldn't say I?m stubborn at all. I can...   
7676    T  'I suppose I have one thing to add. Eights are...   
7677    T  'This happens to me! It usually happens when I...   

                                                  post2  \
0     I'm just going to reallybriefly respond:  1. Y...   
1     I'm a heavy introvert, yet everyone at work lo...   
2     I imagine INFPs as dreamy space prince, dyed h..

In [None]:
print(rawdf.shape)
rawdf[rawdf.columns[0]]

(7678, 51)


0       F
1       F
2       F
3       F
4       F
       ..
7673    T
7674    T
7675    T
7676    T
7677    T
Name: type, Length: 7678, dtype: object

In [None]:
# Convert this dataframe to a single, long column containing every single type and post in pairs of <type>|<post>
indexed = pd.DataFrame()
for i in range(1, rawdf.shape[1]):
  #col = pd.concat((rawdf[rawdf.columns[0]], rawdf.rename(mapper={i:'post1'}, axis = 1)[rawdf.columns[i]]), axis = 1)
  col = rawdf[rawdf.columns[0]] + "|" + rawdf[rawdf.columns[i]]
  #print(col)
  indexed = pd.concat((indexed, col), axis=0)
print(indexed)

                                                      0
0     F|'Yes peace is the absence of conflict - your...
1     F|'Yes, I have gone completely cold hearted to...
2     F|'Why not both? It's more fun to be open to d...
3     F|Yeah, you're not gonna win her over with log...
4                                           F|'Explain?
...                                                 ...
7673  T|You do realize the entire basis of MBTI is d...
7674  T|You had me at too much work involved.  ISTPs...
7675  T|You know this actually worked?  The thing th...
7676  T|You randomly disassemble an old mobile phone...
7677  T|Yyyeeah. I know.  At this stage of my life I...

[383900 rows x 1 columns]


In [None]:
# New Dataframe splits the old one to reach our final format, N pairs of posts and corresponding types
df = pd.DataFrame(np.row_stack([indexed.columns, indexed.values]), columns = ["type"])
df[["type", "post"]] = df["type"].str.split('|', expand=True)
print(df)

       type                                               post
0       NaN                                                NaN
1         F  'Yes peace is the absence of conflict - your I...
2         F  'Yes, I have gone completely cold hearted towa...
3         F  'Why not both? It's more fun to be open to dif...
4         F  Yeah, you're not gonna win her over with logic...
...     ...                                                ...
383896    T  You do realize the entire basis of MBTI is der...
383897    T  You had me at too much work involved.  ISTPs u...
383898    T  You know this actually worked?  The thing that...
383899    T  You randomly disassemble an old mobile phone a...
383900    T  Yyyeeah. I know.  At this stage of my life I a...

[383901 rows x 2 columns]


In [None]:
# Drop null values left from the splitting process
df = df.drop(axis=0, index=0)
df = df.dropna()
print(df)

       type                                               post
1         F  'Yes peace is the absence of conflict - your I...
2         F  'Yes, I have gone completely cold hearted towa...
3         F  'Why not both? It's more fun to be open to dif...
4         F  Yeah, you're not gonna win her over with logic...
5         F                                          'Explain?
...     ...                                                ...
383896    T  You do realize the entire basis of MBTI is der...
383897    T  You had me at too much work involved.  ISTPs u...
383898    T  You know this actually worked?  The thing that...
383899    T  You randomly disassemble an old mobile phone a...
383900    T  Yyyeeah. I know.  At this stage of my life I a...

[383898 rows x 2 columns]


In [None]:
# Divide data into training, val, and testing
# Datasets are turned into iters for later use in dataloaders
trainset = df[:math.floor(df.shape[0]*0.80)]
valset = df[math.floor(df.shape[0]*0.80):math.floor(df.shape[0]*0.85)]
testset = df[math.floor(df.shape[0]*0.85):]
trainset = trainset.astype(str)
valset = valset.astype(str)
testset = testset.astype(str)
trtup = list(trainset.itertuples(index=False, name=None))
vatup = list(valset.itertuples(index=False, name=None))
tetup = list(testset.itertuples(index=False, name=None))
train_iter = iter(trtup)
val_iter = iter(vatup)
test_iter = iter(tetup)

In [1]:
# build vocabulary off of training data iterator
# (this means any word in testing that wasn't in training will be tokenized as <unk>)
tok = get_tokenizer("basic_english")
def yield_tokens(dataIter):
    for _, text in dataIter:
        yield tok(text)
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

NameError: ignored

In [None]:
# Data pipeline lambda functions for converting labels and post-words into their token forms
label_map = {'T': 0, 'F': 1}
text_pipe = lambda x: vocab(tok(x))
label_pipe = lambda x: label_map[x]

In [None]:
# set device to GPU if possible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# batching function for dataloaders
def collate_batch(batch):
  labelList, textList, offsets = [], [], [0]
  for _label, _text in batch:
      labelList.append(label_pipe(_label))
      processed_text = torch.tensor(text_pipe(_text), dtype=torch.int64)
      textList.append(processed_text)
      offsets.append(processed_text.size(0))
  labelList = torch.tensor(labelList, dtype=torch.int64)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  textList = torch.cat(textList)
  return labelList.to(device), textList.to(device), offsets.to(device)

In [None]:
# instantiate dataloaders
dlTrain = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
dlVal = DataLoader(val_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)
dlTest = DataLoader(test_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

In [None]:
# Model Architecture Class
# 1 embedding bag followed by 2 dense layers
class TextClassifier(nn.Module):
  def __init__(self, voc_size, embedding, num_classes):
    super(TextClassifier, self).__init__()
    self.embedding = nn.EmbeddingBag(voc_size, 2*embedding, sparse=False)
    self.fc1 = nn.Linear(2*embedding, embedding)
    self.fc2 = nn.Linear(embedding, num_classes)
    self.init_weights()

  def init_weights(self):
      initrange = 0.5
      self.embedding.weight.data.uniform_(-initrange, initrange)
      self.fc1.weight.data.uniform_(-initrange, initrange)
      self.fc1.bias.data.zero_()
      self.fc2.weight.data.uniform_(-initrange, initrange)
      self.fc2.bias.data.zero_()
  def forward(self, text, offsets):
      embedded = self.embedding(text, offsets)
      filled = self.fc1(embedded)
      return self.fc2(filled)

In [None]:
# Model parameters and instantiation
numClasses = 2
vocab_size = len(vocab)
emsize = 32
model = TextClassifier(vocab_size, emsize, numClasses).to(device)

In [None]:
# training and evaluation functions
# (NOTE: these use global models and accumulators, just because schedulers are weird otherwise)
def train(dl):
  model.train()
  total_acc, total_count = 0,0
  log_interval = 500
  start_time = time.time()

  for idx, (label, text, offsets) in enumerate(dl):
      optimizer.zero_grad()
      predLabel = model(text, offsets)
      loss = criterion(predLabel, label)
      loss.backward()
      torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)
      optimizer.step()
      total_acc += (predLabel.argmax(1) == label).sum().item()
      total_count  += label.size(0)
      if idx % log_interval == 0 and idx > 0:
        elapsed = time.time() - start_time
        print("Epoch {:3d}: {:5d}/{:5d} batches: accuracy {:8.3f}".format(epoch, idx, len(dl), total_acc/total_count))
        total_acc, total_count = 0, 0
        start_time = time.time()
def evaluate(dl):
  model.eval()
  total_acc, total_count = 0, 0

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dl):
      predLabel = model(text, offsets)
      loss = criterion(predLabel, label)
      total_acc += (predLabel.argmax(1) == label).sum().item()
      total_count += label.size(0)
  return total_acc / total_count

In [None]:
# Complete model hyperparams and instantiate new batch loaders
EPOCHS = 10
LR = 5
BATCH = 128

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma = 0.01)
total_accu = None

train_iter = iter(trtup)
val_iter = iter(vatup)
test_iter = iter(tetup)

train_ds = to_map_style_dataset(train_iter)
val_ds = to_map_style_dataset(val_iter)
test_ds = to_map_style_dataset(test_iter)

dlTrain = DataLoader(train_ds, batch_size = BATCH, shuffle=True, collate_fn = collate_batch)
dlVal = DataLoader(val_ds, batch_size = BATCH, shuffle=True, collate_fn = collate_batch)
dlTest = DataLoader(test_ds, batch_size = BATCH, shuffle=True, collate_fn = collate_batch)

In [None]:
print(df.type.unique())
with pd.option_context('display.max_seq_items', None):
    print (df.type)

['F' 'T']
1         F
2         F
3         F
4         F
5         F
         ..
383896    T
383897    T
383898    T
383899    T
383900    T
Name: type, Length: 383898, dtype: object


In [None]:
# Training Loop
for epoch in range(1, EPOCHS + 1):
  epoch_start = time.time()
  train(dlTrain)
  accu_val = evaluate(dlVal)
  if total_accu is not None and total_accu > accu_val:
    scheduler.step()
  else:
    total_accu = accu_val
  print('-'*59)
  print("End of Epoch {:3d}: Time: {:5.2f}, Valid Accuracy: {:8.3f}".format(epoch, time.time() - epoch_start, accu_val))
  print('-'*59)
torch.save(model, "SAVETHISjudgeperceive")

  torch.nn.utils.clip_grad_norm(model.parameters(), 0.1)


Epoch   1:   500/ 2700 batches: accuracy    0.575
Epoch   1:  1000/ 2700 batches: accuracy    0.579
Epoch   1:  1500/ 2700 batches: accuracy    0.580
Epoch   1:  2000/ 2700 batches: accuracy    0.581
Epoch   1:  2500/ 2700 batches: accuracy    0.583
-----------------------------------------------------------
End of Epoch   1: Time: 23.57, Valid Accuracy:    0.591
-----------------------------------------------------------
Epoch   2:   500/ 2700 batches: accuracy    0.585
Epoch   2:  1000/ 2700 batches: accuracy    0.578
Epoch   2:  1500/ 2700 batches: accuracy    0.586
Epoch   2:  2000/ 2700 batches: accuracy    0.585
Epoch   2:  2500/ 2700 batches: accuracy    0.589
-----------------------------------------------------------
End of Epoch   2: Time: 22.58, Valid Accuracy:    0.593
-----------------------------------------------------------
Epoch   3:   500/ 2700 batches: accuracy    0.585
Epoch   3:  1000/ 2700 batches: accuracy    0.589
Epoch   3:  1500/ 2700 batches: accuracy    0.58

In [None]:
print(evaluate(dlTest))

0.5005470174524616


In [None]:
# function for predicting new input
def predict(text, pipeline):
    with torch.no_grad():
        text = torch.tensor(pipeline(text))
        output = model(text, torch.tensor([0]))
        print(output)
        return torch.abs(output).argmax(1).item()

In [None]:
# Setup to test MBTI output for strings
s = 'I am an INTJ'
label_map = {'I': 0, 'E': 1}
mbti_map = {0: 'I', 1: 'E'}
model = model.to('cpu')
print("post: %s; \nType: %s" % (s, mbti_map[predict(s, text_pipe)]))

tensor([[ 0.4082, -0.8929]])
post: im post malone i love cleveland; 
Type: E


In [None]:
model = torch.load("SAVETHISthinkfeel")

In [None]:
# For Confusion Matrix creation
# Iterates through entire dataloader, keeping track of true positives,
#   false positives, false negatives, and true positives
def evalList(dl):
  model.eval()
  tp, fp, fn, tn = 0, 0, 0, 0
  #
  #   TP    FP
  #
  #   FN    TN

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dl):
      predLabel = model(text, offsets)

      for idx, target in enumerate(predLabel.argmax(1)):
        trueLabel = label[idx].item()
        guess = target.item()

        if trueLabel == 1:
          if guess == 1:
            tp += 1
          else:
            fn += 1
        else: #true label is 0
          if guess == 1:
            fp += 1
          else:
            tn += 1




      #print(predLabel)
      #print(predLabel.argmax(1))
      #print(label)

      #for idx, target in enumerate(predLabel.argmax(1)):
      #  print(target.item())
      #  print(label[idx].item())
      #  break
      #break
  return (tp, fp, fn, tn)

In [None]:
model.cuda()

TextClassifier(
  (embedding): EmbeddingBag(160930, 64, mode='mean')
  (fc1): Linear(in_features=64, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=2, bias=True)
)

In [None]:
(tp, fp, fn, tn) = evalList(dlTest)
#(clTe, evTe) = evalList(testset)
#(clV, evV) = evalList(valset)

In [None]:
print("Extrovert / Extrovert: %d \tExtrovert / Introvert: %d\n Introvert / Extrovert: %d \tIntrovert / Introvert: %d" % (tp, fp, fn, tn))

Extrovert / Extrovert: 7128 	Extrovert / Introvert: 5369
 Introvert / Extrovert: 2458 	Introvert / Introvert: 4240
