In [1]:
import torch
import json

In [2]:
import os
import numpy as np

def get_training_data(base_dir="/home/ubuntu/train_split", text_length=4000):
    """Load labelled text files from a directory tree.

    Walks ``base_dir`` recursively. Every file whose name ends in ``.txt`` is
    read as UTF-8, truncated to its first ``text_length`` characters, and
    labelled with the name of the directory that directly contains it.

    Args:
        base_dir: Root directory to walk.
        text_length: Maximum number of characters kept from each file.

    Returns:
        A tuple ``(texts, labels, label_counts)``. ``texts`` and ``labels``
        are parallel lists; ``label_counts`` maps each label to the number of
        texts loaded for it, ordered by descending count.
    """
    train_texts = []
    train_labels = []
    label_counts = {}
    for subdir, _dirs, files in os.walk(base_dir):
        # normpath drops a trailing separator so the label of the root
        # directory is its name rather than an empty string.
        label = os.path.basename(os.path.normpath(subdir))
        for f in files:
            # Only .txt files are loaded, so only they are counted; this keeps
            # label_counts consistent with the returned labels list.
            if not f.endswith('.txt'):
                continue
            label_counts[label] = label_counts.get(label, 0) + 1
            path = os.path.join(subdir, f)
            with open(path, 'r', encoding="utf-8") as r:
                text = r.read()[:text_length]
            train_texts.append(text)
            train_labels.append(label)
    # sorted() is stable, so labels with equal counts keep discovery order.
    label_counts = dict(sorted(label_counts.items(), key=lambda item: -item[1]))

    return train_texts, train_labels, label_counts

# Load both splits with the default 4000-character truncation.
# NOTE(review): both directories are absolute, machine-specific paths.
train_texts, train_labels, label_counts = get_training_data()
test_texts, test_labels, test_label_counts = get_training_data('/home/ubuntu/test_split')

In [3]:
# Disabled one-time construction of the label -> integer id mapping, kept for
# reference. It wrote label_mapping.json, which the live code below reads.
#label_mapping = {}
#for l in train_labels:
#    if l not in label_mapping:
#        label_mapping[l] = len(label_mapping)
#for l in test_labels:
#    if l not in label_mapping:
#        label_mapping[l] = len(label_mapping)

#with open('label_mapping.json', 'w') as f:
#    f.write(json.dumps(label_mapping))

# Load the saved mapping and derive the inverse (id -> label) lookup.
with open('label_mapping.json', 'r') as mapping_file:
    label_mapping = json.load(mapping_file)
reverse_mapping = dict((class_id, name) for name, class_id in label_mapping.items())


In [4]:
# Shuffle out of paranoia: texts and labels are shuffled together so each text
# keeps its label.
# A dedicated seeded generator makes the shuffled order repeatable for a given
# input order on a fresh kernel, without touching the global `random` state.
import random
shuffle_rng = random.Random(42)
zipped = list(zip(train_texts, train_labels))
shuffle_rng.shuffle(zipped)
train_texts = [x[0] for x in zipped]
train_labels = [x[1] for x in zipped]

In [5]:
from transformers import AutoTokenizer
from transformers import MobileBertTokenizerFast, MobileBertForSequenceClassification
# Fast DistilBERT tokenizer; the MobileBERT alternative is left disabled below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)
#tokenizer = MobileBertTokenizerFast.from_pretrained('google/mobilebert-uncased')
# Tokenize each split in one call, truncating to at most 512 tokens and
# padding the sequences within the call to a common length.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

In [6]:
# Translate each string label into its integer class id via label_mapping.
encoded_train_labels = [label_mapping[name] for name in train_labels]
encoded_test_labels = [label_mapping[name] for name in test_labels]

In [7]:
import torch

class RecordsDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence with one entry per example. Indexing returns a
    dict of tensors holding every encoding field plus a ``'labels'`` entry.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label list defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and integer labels so the Trainer can index them.
train_dataset = RecordsDataset(train_encodings, encoded_train_labels)
test_dataset = RecordsDataset(test_encodings, encoded_test_labels)

In [9]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, logging as hf_logging

# Disabled alternative: start from the stock distilbert-base-uncased weights
# with one output per entry in label_mapping.
#model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_mapping))
#hf_logging.set_verbosity_info()
# Active path: load the 'distilbert_10_epochs' checkpoint from local files only.
model = AutoModelForSequenceClassification.from_pretrained('distilbert_10_epochs', local_files_only=True)



In [10]:
def top_k_accuracy(true_labels, predictions, k=3):
    """Fraction of rows whose true class is among the k highest-scoring classes.

    Args:
        true_labels: 1-D sequence of integer class ids, one per row.
        predictions: 2-D array-like of scores, one row per example and one
            column per class.
        k: Number of top-scoring classes to accept. Values larger than the
            number of classes are clamped to it.

    Returns:
        The mean hit rate as a numpy float.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    scores = np.asarray(predictions)
    targets = np.asarray(true_labels).reshape(-1, 1)
    # argpartition rejects a kth beyond the axis length, so clamp k; asking
    # for more classes than exist simply accepts every class.
    k = min(k, scores.shape[1])
    # The last k columns hold the indices of the k largest scores per row
    # (in no particular order, which is fine for a membership test).
    top_k = np.argpartition(scores, -k, axis=1)[:, -k:]
    hits = (top_k == targets).any(axis=1)
    return hits.mean()

def compute_metrics(eval_pred):
    """Build the metrics dict passed to the Trainer as ``compute_metrics``.

    Reads ``predictions`` and ``label_ids`` from ``eval_pred`` and returns
    top-1, top-3 and top-5 accuracy under the keys ``'top_1_accuracy'``,
    ``'top_3_accuracy'`` and ``'top_5_accuracy'``.
    """
    scores, gold = eval_pred.predictions, eval_pred.label_ids
    metrics = {}
    metrics['top_1_accuracy'] = top_k_accuracy(gold, scores, k=1)
    metrics['top_3_accuracy'] = top_k_accuracy(gold, scores, k=3)
    metrics['top_5_accuracy'] = top_k_accuracy(gold, scores, k=5)
    return metrics

# Training configuration.
# NOTE(review): evaluation runs per epoch while checkpoints are saved every
# 3000 steps; with load_best_model_at_end=True some transformers releases
# require the save and evaluation schedules to line up — confirm against the
# installed version.
training_args = TrainingArguments(
    'auto_distilbert',          # output directory
    num_train_epochs=8,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    evaluation_strategy="epoch",
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    save_steps=3000,
    load_best_model_at_end=True,
    # Must match a key returned by compute_metrics.
    metric_for_best_model='top_1_accuracy'
)

# NOTE(review): eval_dataset is the test split, and the best checkpoint is
# selected by top_1_accuracy on it, so metrics later reported on test_dataset
# are presumably optimistic — consider a separate validation split.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics = compute_metrics
)

In [10]:
# Freeze upstream layers
#for param in model.distilbert.parameters():
#    param.requires_grad = False

In [11]:
# Run training with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Top 1 Accuracy,Top 3 Accuracy,Top 5 Accuracy
0,3.396956,3.317658,0.194643,0.390335,0.469259


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/ubuntu/Document_Processing_Scripts/model_training/env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3418, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-3435b262f1ae>", line 1, in <module>
    trainer.train()
  File "/home/ubuntu/Document_Processing_Scripts/model_training/env/lib/python3.8/site-packages/transformers/trainer.py", line 747, in train
    tr_loss += self.training_step(model, inputs)
  File "/home/ubuntu/Document_Processing_Scripts/model_training/env/lib/python3.8/site-packages/transformers/trainer.py", line 1089, in training_step
    loss.backward()
  File "/home/ubuntu/Document_Processing_Scripts/model_training/env/lib/python3.8/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/home/ubuntu/Document_Processing_Scripts/model_training/env/lib/python3.8/site-packages/torch/autog

TypeError: object of type 'NoneType' has no len()

In [11]:
# Evaluate on the Trainer's eval_dataset and display the resulting metrics.
trainer.evaluate()

{'eval_loss': 1.653051733970642,
 'eval_top_1_accuracy': 0.7144218854256029,
 'eval_top_3_accuracy': 0.8945763034982366,
 'eval_top_5_accuracy': 0.9358497759984749}

In [17]:
# Predict over test_dataset; later cells read p.predictions and p.label_ids.
p = trainer.predict(test_dataset)

In [12]:
# Put the model in evaluation mode; the print is only a completion marker.
model.eval()
print('eval')

eval


In [13]:
# Move the model to the GPU; the print is only a completion marker.
# NOTE(review): the device is hard-coded to 'cuda'.
model.to('cuda')
print('cuda')

cuda


In [14]:
# Tokenize each test text on its own, returning PyTorch tensors, so that
# embed() can be called once per example. The train-side equivalent is
# left disabled.
#train_tokens = [tokenizer(t, truncation=True, padding=True, return_tensors="pt") for t in train_texts]
test_tokens = [tokenizer(t, truncation=True, padding=True, return_tensors="pt") for t in test_texts]

# Disabled smoke test: a single forward pass on the first test example.
#outputs = model(**(test_tokens[0].to('cuda')))

In [15]:
def embed(tokenized_text):
    """Run the global ``model`` on one tokenized example and return its first output row.

    Args:
        tokenized_text: Tokenizer output for a single text, as built in
            ``test_tokens``; it must support ``.to(device)`` and ``**``
            unpacking.

    Returns:
        ``outputs[0][0]`` as a numpy array on the CPU.
        NOTE(review): for a sequence-classification model this is presumably
        the per-class logits rather than an embedding, despite the function
        name — confirm.
    """
    # Follow whatever device the model currently lives on instead of
    # hard-coding 'cuda'.
    device = next(model.parameters()).device
    # Inference only: no_grad avoids building an autograd graph per example.
    with torch.no_grad():
        outputs = model(**(tokenized_text.to(device)))
    embedding = outputs[0][0].detach().cpu().numpy()
    return embedding


In [16]:
# Run embed() over every tokenized test example, one at a time.
test_preds = [embed(x) for x in test_tokens]

In [17]:
# For each test example keep the column indices of its 3 highest scores
# (argpartition does not order the 3 among themselves).
ind = np.argpartition(test_preds, -3, axis=1)[:,-3:]

In [18]:
# Nested-list copy of ind, so its entries can be replaced by label strings.
list_ind = list([list(x) for x in ind])

In [19]:
# Decode every top-3 class id in `ind` back to its label string.
# The result is built fresh from `ind` rather than rewriting list_ind in place,
# so re-running this cell does not raise KeyError on already-decoded entries.
list_ind = [[reverse_mapping[class_id] for class_id in row] for row in ind]

In [20]:
# True where an example's class id (looked up from its string label in
# test_labels) is among that example's top-3 indices.
top_k_match = [(label_mapping[test_labels[i]] in ind[i]) for i in range(len(test_labels))]

In [21]:
# Fraction of test examples whose true label is in the top 3.
np.mean(top_k_match)

0.8945763034982366

In [26]:
# Index of the single highest score per row of p.predictions, kept as an
# (n, 1) column, then decoded to label strings through reverse_mapping.
top1 = np.argpartition(p.predictions, -1, axis=1)[:, -1:]
list_top1 = [reverse_mapping[row[0]] for row in top1]

NameError: name 'p' is not defined

In [75]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that sklearn.metrics is loaded.
import sklearn.metrics
import pandas as pd
# Rows are true labels and columns are predicted labels, both in
# label_mapping key order.
# NOTE(review): decoded_label_ids is not defined in any visible cell;
# presumably it holds the true labels decoded to strings — confirm and
# define it before this cell.
class_names = list(label_mapping.keys())
conf_counts = sklearn.metrics.confusion_matrix(decoded_label_ids, list_top1, labels=class_names)
conf_matrix = pd.DataFrame(conf_counts, index=class_names, columns=['predicted ' + x for x in class_names])

In [76]:
# Display the confusion-matrix DataFrame.
conf_matrix

Unnamed: 0,predicted 205-1003-b,predicted 401-1006-b,predicted 108-1035-a,predicted 301-1051-a,predicted 404-1012-c,predicted 402-1005-f,predicted 405-1004-b,predicted 401-1010-a,predicted 306-1023-a,predicted 108-1036-a,...,predicted 108-1035-c,predicted 108-1044-c,predicted 404-1012-a,predicted 306-1023-c,predicted 301-1016-c,predicted 401-1001-b,predicted 205-1003-a,predicted 108-1035-e,predicted 306-1023-d,predicted 402-1005-b
205-1003-b,180,39,0,0,0,0,10,0,0,0,...,5,0,0,0,0,0,16,0,0,0
401-1006-b,26,1480,0,0,23,0,99,0,0,0,...,110,0,0,0,0,0,21,0,8,2
108-1035-a,0,27,24,0,2,0,1,0,0,0,...,48,0,0,0,0,0,1,0,0,0
301-1051-a,4,78,1,0,0,0,3,0,0,0,...,19,0,0,0,0,0,9,0,2,0
404-1012-c,0,44,0,0,230,0,31,0,0,0,...,4,0,0,0,0,0,0,0,0,0
402-1005-f,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
405-1004-b,8,129,0,0,12,0,233,0,0,0,...,28,0,0,0,0,0,5,0,0,1
401-1010-a,0,16,0,0,6,0,24,0,0,0,...,1,0,0,0,0,0,0,0,0,3
306-1023-a,0,7,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
108-1036-a,0,14,0,0,0,0,2,0,0,0,...,34,0,1,0,0,0,5,0,4,0


In [53]:
# Spot-check the first entry of decoded_label_ids.
# NOTE(review): decoded_label_ids is not defined in any visible cell.
decoded_label_ids[0]

'205-1003-b'

In [64]:
# Same top-3 membership test as top_k_match, but taking the true class ids
# from p.label_ids instead of looking them up from test_labels.
old_top_k_match = [(p.label_ids[i] in ind[i]) for i in range(len(p.label_ids))]

In [65]:
# Top-3 hit rate when the true ids come from p.label_ids.
np.mean(old_top_k_match)

0.6724811743399104

In [58]:
# Print the first position at which the two hit lists disagree, then stop.
for position, new_hit in enumerate(top_k_match):
    if new_hit != old_top_k_match[position]:
        print(position)
        break

5


In [59]:
# Hit flag at index 5 when the true id comes from test_labels.
top_k_match[5]

False

In [60]:
# Hit flag at index 5 when the true id comes from p.label_ids.
old_top_k_match[5]

True

In [63]:
# Recompute the index-5 membership test directly from the current p and ind.
p.label_ids[5] in ind[5]

False

In [62]:
# Top-3 class indices for the example at index 5.
ind[5]

array([ 6,  1, 26])