In [1]:
%load_ext autoreload
%autoreload 2

import sys, os
# Put the parent of sys.path[0] on the import path -- presumably so the
# project-local modules imported further down (data_parsing, training_utils,
# utils, model) can be found; confirm against the repository layout.
# The membership check keeps this cell idempotent: re-running it no longer
# piles duplicate entries onto sys.path.
parent_dir = os.path.dirname(sys.path[0])
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from tqdm.notebook import tqdm
from segtok import tokenizer

In [None]:
from data_parsing import load_dataset
# Relative path to the JSONL training file handed to load_dataset.
dataset_path = "../yelp_review_training_dataset.jsonl"
data = load_dataset(dataset_path)

In [None]:
from training_utils import split_train_validation
# The second argument (0.01) is presumably the fraction of examples held out
# for validation -- TODO confirm against training_utils.split_train_validation.
split = split_train_validation(data, 0.01)
train_x, valid_x, train_y, valid_y = split

In [None]:
# Sanity check: report the size of each split, in the order
# train inputs, train labels, validation inputs, validation labels.
for split_part in (train_x, train_y, valid_x, valid_y):
    print(len(split_part))

In [None]:
from transformers import DistilBertTokenizerFast
# NOTE: this rebinds `tokenizer`, which the first cell imported from segtok;
# from here on the name refers to the DistilBERT fast tokenizer.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)

In [None]:
def numerize_texts(texts, text_tokenizer):
    """Tokenize each text and collect its token ids and attention mask.

    Each text is tokenized on its own with ``truncation=True`` and
    ``padding="max_length"``. Index ``[0]`` then selects the single encoding
    produced for that one-text call.

    Args:
        texts: Iterable of raw review strings.
        text_tokenizer: Callable tokenizer whose result, indexed with ``[0]``,
            exposes ``.ids`` and ``.attention_mask`` (as the fast tokenizer
            created earlier in this notebook is used here).

    Returns:
        A ``(ids, masks)`` pair of lists with one entry per input text, in
        input order.
    """
    ids = []
    masks = []
    for text in tqdm(texts):
        encoding = text_tokenizer(text, truncation=True, padding="max_length")[0]
        ids.append(encoding.ids)
        masks.append(encoding.attention_mask)
    return ids, masks


# Same preprocessing for both splits; previously this loop was written out twice.
train_x_numerized, train_x_mask = numerize_texts(train_x, tokenizer)
valid_x_numerized, valid_x_mask = numerize_texts(valid_x, tokenizer)

In [None]:
import numpy as np

# Convert every split from Python lists to numpy arrays, rebinding the same
# names. Training arrays first, then validation arrays.
train_x_numerized, train_x_mask, train_y = (
    np.array(values) for values in (train_x_numerized, train_x_mask, train_y)
)
valid_x_numerized, valid_x_mask, valid_y = (
    np.array(values) for values in (valid_x_numerized, valid_x_mask, valid_y)
)

In [2]:
from utils import memo_load
# The lambda reads the six arrays from the notebook namespace only when it is
# called. memo_load presumably returns a stored copy saved under the key
# "training_data" and calls the lambda only when no stored copy exists --
# TODO confirm in utils.memo_load.
# On a fresh kernel where the preprocessing cells above were skipped, calling
# the lambda would raise NameError, so a stored copy must already be available.
training_arrays = memo_load(
    lambda: (train_x_numerized, train_x_mask, train_y, valid_x_numerized, valid_x_mask, valid_y),
    "training_data",
)
(
    train_x_numerized,
    train_x_mask,
    train_y,
    valid_x_numerized,
    valid_x_mask,
    valid_y,
) = training_arrays

In [3]:
from model import ReviewPredictionModel
import torch as th
import torch.optim as optim

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")
print(device)

# Training hyperparameters, read by the training-loop cell below.
batch_size, epochs = 8, 10

# Drop any model left over from an earlier run of this cell before building a
# new one -- presumably so its memory can be reclaimed first.
model_to_train = None
# NOTE(review): the meaning of the constructor arguments (0, 256) is defined
# in model.ReviewPredictionModel and is not visible from this notebook.
model_to_train = ReviewPredictionModel(0, 256)
model_to_train.to(device)
optimizer = optim.Adam(model_to_train.parameters(), lr=1e-5)

cuda


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [None]:
from training_utils import run_training_loop

# Group the arrays by split; they are unpacked below in the positional order
# run_training_loop is called with: train ids, train mask, train labels,
# then the same three for validation.
train_arrays = (train_x_numerized, train_x_mask, train_y)
valid_arrays = (valid_x_numerized, valid_x_mask, valid_y)

# Train the model and keep the two accuracy histories the loop returns.
training_accuracies, validation_accuracies = run_training_loop(
    model_to_train,
    optimizer,
    device,
    batch_size,
    epochs,
    *train_arrays,
    *valid_arrays,
)

Epoch: 0 Iteration: 66030 Loss: 0.396 Validation Loss: 0.812 Accuracy: 0.643 Validation Accuracy: 0.523: 100%|██████████| 66031/66031 [49:03<00:00, 22.43it/s]
Epoch: 1 Iteration: 66030 Loss: 0.432 Validation Loss: 0.823 Accuracy: 0.490 Validation Accuracy: 0.541: 100%|██████████| 66031/66031 [49:07<00:00, 22.40it/s]
Epoch: 2 Iteration: 36300 Loss: 0.281 Validation Loss: 0.502 Accuracy: 0.625 Validation Accuracy: 0.588:  55%|█████▍    | 36309/66031 [26:56<21:04, 23.50it/s]

In [None]:
import matplotlib.pyplot as plt
# X positions: one per recorded training accuracy.
train_steps = range(len(training_accuracies))
# Validation points are spaced 10 apart on the x axis -- presumably because
# validation accuracy is recorded once every 10 training iterations; confirm
# in training_utils.run_training_loop.
valid_steps = [10 * step for step in range(len(validation_accuracies))]

plt.plot(train_steps, training_accuracies, label="Training Accuracy")
plt.plot(valid_steps, validation_accuracies, label="Validation Accuracy")
plt.xlabel("Training Iteration")
plt.ylabel("Accuracy")
plt.legend()