In [None]:
# Mount Google Drive at /drive inside the Colab VM. The setup cell that follows
# symlinks the project folder /drive/MyDrive/cs182_project from this mount point.
from google.colab import drive
drive.mount('/drive')

In [None]:
%%bash
# Replace the `content` entry in the parent of the starting directory with a
# symlink to the project folder on the mounted Drive, then step into it.
# NOTE(review): presumably the cell starts in Colab's /content, so this swaps
# /content itself for the Drive project - confirm before running elsewhere.
# NOTE(review): there is no `set -e`, so each command runs even if an earlier one failed.
cd ..
rm -rf content
ln -s /drive/MyDrive/cs182_project content
cd content

# Install the Python dependencies listed in the project's requirements.txt.
pip install -r requirements.txt

# RESTART YOUR RUNTIME NOW!

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Notebook progress bar used by the tokenization loops.
from tqdm.notebook import tqdm
# NOTE(review): the name `tokenizer` is rebound to a DistilBertTokenizerFast
# instance in a later cell, which shadows this segtok import from then on.
from segtok import tokenizer

In [None]:
# Key Hyperparameters
# Fixed sequence length: used as the tokenizer's max_length and passed to the
# model constructor.
max_tokenized_length = 64

# Which data sources are mixed into the training set by the data-assembly cell.
enable_orig = False   # training part of the original dataset split
enable_aug = False    # new_data.json + new_data2.json
enable_aug3 = True    # new_data3.json

# With every source disabled the training set would be empty; fail here, before
# any data is loaded or an experiment directory is created.
assert enable_orig or enable_aug or enable_aug3, (
    "enable at least one of enable_orig / enable_aug / enable_aug3"
)

batch_size = 8
epochs = 5

# Derive the numeric parts of the id from the values above so the experiment
# name cannot drift from the settings actually used.
# NOTE: the trailing dataset tag ("aug3-only") is still maintained by hand and
# must be kept in sync with the enable_* flags.
experiment_id = f"classification-bert-{max_tokenized_length}-tokens-{epochs}-epochs-aug3-only"

In [None]:
import os

# Directory for this run; later cells pass paths under it to memo_load and
# plt.savefig.
experiment_dir = f"completed-experiments/{experiment_id}"
# Refuse to reuse an existing directory (presumably so results of a finished
# experiment are not overwritten). The message tells the user how to proceed
# instead of failing with a bare AssertionError.
assert not os.path.exists(experiment_dir), (
    f"{experiment_dir} already exists; choose a new experiment_id or remove "
    "the directory before re-running"
)
os.makedirs(experiment_dir)

In [None]:
from data_parsing import load_dataset, load_gen_dataset
# Load the original Yelp review training set; `data` is split into training and
# validation parts in the next cell. load_gen_dataset is imported here for the
# optional generated datasets read by that same cell.
data = load_dataset("./yelp_review_training_dataset.jsonl")

In [None]:
from training_utils import split_train_validation

# Split the loaded dataset into training and validation parts.
# NOTE(review): 0.01 is presumably the fraction held out for validation -
# confirm in training_utils.
train_x, valid_x, train_y, valid_y = split_train_validation(data, 0.01)

# The validation split is always kept; the original training split is
# discarded when that source is disabled.
if not enable_orig:
    train_x, train_y = [], []

# Optional generated datasets: element 0 of every record is appended to
# train_x and element 1 to train_y.
if enable_aug:
    aug_data = load_gen_dataset("./new_data.json") + load_gen_dataset("./new_data2.json")
    train_x += [record[0] for record in aug_data]
    train_y += [record[1] for record in aug_data]

if enable_aug3:
    aug_data3 = load_gen_dataset("./new_data3.json")
    train_x += [record[0] for record in aug_data3]
    train_y += [record[1] for record in aug_data3]

In [None]:
# Sanity check: sizes of the training inputs/labels and validation
# inputs/labels, one number per line in that order.
print(len(train_x), len(train_y), len(valid_x), len(valid_y), sep="\n")

In [None]:
from transformers import DistilBertTokenizerFast
# Load the pretrained 'distilbert-base-uncased' fast tokenizer.
# NOTE(review): this rebinds `tokenizer`, replacing the object imported from
# segtok under the same name in an earlier cell.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
def _encode_texts(texts):
    """Tokenize each text on its own and collect token ids and attention masks.

    Every text is truncated or padded to ``max_tokenized_length``. Index ``[0]``
    picks the first encoding out of the tokenizer result, and its ``ids`` and
    ``attention_mask`` are appended to the two returned lists.

    Returns:
        (ids, masks): two lists with one entry per input text, in input order.
    """
    ids, masks = [], []
    for text in tqdm(texts):
        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_tokenized_length,
        )[0]
        ids.append(encoding.ids)
        masks.append(encoding.attention_mask)
    return ids, masks


# Training texts first, then validation texts (one progress bar each).
train_x_numerized, train_x_mask = _encode_texts(train_x)
valid_x_numerized, valid_x_mask = _encode_texts(valid_x)

In [None]:
import numpy as np
from utils import memo_load

# Convert the six Python lists (token ids, attention masks, labels for the
# training and validation sets) to NumPy arrays in one pass.
(
    train_x_numerized, train_x_mask, train_y,
    valid_x_numerized, valid_x_mask, valid_y,
) = (
    np.array(values)
    for values in (
        train_x_numerized, train_x_mask, train_y,
        valid_x_numerized, valid_x_mask, valid_y,
    )
)

# Route the arrays through memo_load, keyed by a path inside the experiment
# directory.
# NOTE(review): presumably memo_load stores the callable's result at that path
# and returns it (or returns a previously stored copy) - confirm in utils.
(
    train_x_numerized, train_x_mask, train_y,
    valid_x_numerized, valid_x_mask, valid_y,
) = memo_load(
    lambda: (train_x_numerized, train_x_mask, train_y, valid_x_numerized, valid_x_mask, valid_y),
    f"{experiment_dir}/training_data",
)

In [None]:
import torch as th
import torch.optim as optim
from model import ReviewPredictionModel

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
print(device)

# Drop any reference left by a previous run of this cell before building a new
# model (presumably so the old model can be freed first - confirm).
model_to_train = None
# NOTE(review): the meaning of the first constructor argument (0) is not
# visible in this notebook; the second is the fixed token length.
model_to_train = ReviewPredictionModel(0, max_tokenized_length)
model_to_train.to(device)
optimizer = optim.Adam(params=model_to_train.parameters(), lr=1e-5)

In [None]:
from training_utils import run_training_loop
from utils import memo_load

# Train the model; the call returns the recorded training and validation
# accuracies.
training_accuracies, validation_accuracies = run_training_loop(
    model_to_train,
    optimizer,
    device,
    batch_size,
    epochs,
    train_x_numerized,
    train_x_mask,
    train_y,
    valid_x_numerized,
    valid_x_mask,
    valid_y,
    model_id=experiment_id,
)

# Route both accuracy histories through memo_load, keyed by a path inside the
# experiment directory.
# NOTE(review): presumably this persists them next to the other run artifacts -
# confirm in utils.
training_accuracies, validation_accuracies = memo_load(
    lambda: (training_accuracies, validation_accuracies),
    f"{experiment_dir}/training_validation_accuracies",
)

In [None]:
import matplotlib.pyplot as plt

# Training accuracy: one point per recorded entry, x = entry index.
plt.plot(
    range(len(training_accuracies)),
    training_accuracies,
    label="Training Accuracy",
)
# Validation accuracy: entries are spaced 100 apart on the x axis.
# NOTE(review): assumes validation runs every 100 training iterations -
# confirm against run_training_loop.
plt.plot(
    [100 * step for step in range(len(validation_accuracies))],
    validation_accuracies,
    label="Validation Accuracy",
)
plt.xlabel("Training Iteration")
plt.ylabel("Accuracy")
plt.legend()
# Store the figure alongside the other artifacts of this experiment.
plt.savefig(f"{experiment_dir}/training-plot.png")

In [None]:
# Mount Google Drive at /drive inside the Colab VM. The setup cell that follows
# symlinks the project folder /drive/MyDrive/cs182_project from this mount point.
from google.colab import drive
drive.mount('/drive')

In [None]:
%%bash
# Replace the `content` entry in the parent of the starting directory with a
# symlink to the project folder on the mounted Drive, then step into it.
# NOTE(review): presumably the cell starts in Colab's /content, so this swaps
# /content itself for the Drive project - confirm before running elsewhere.
# NOTE(review): there is no `set -e`, so each command runs even if an earlier one failed.
cd ..
rm -rf content
ln -s /drive/MyDrive/cs182_project content
cd content

# Install the Python dependencies listed in the project's requirements.txt.
pip install -r requirements.txt

# RESTART YOUR RUNTIME NOW!

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Notebook progress bar used by the tokenization loops.
from tqdm.notebook import tqdm
# NOTE(review): the name `tokenizer` is rebound to a DistilBertTokenizerFast
# instance in a later cell, which shadows this segtok import from then on.
from segtok import tokenizer

In [None]:
# Key Hyperparameters
# Fixed sequence length: used as the tokenizer's max_length and passed to the
# model constructor.
max_tokenized_length = 64

# Which data sources are mixed into the training set by the data-assembly cell.
enable_orig = False   # training part of the original dataset split
enable_aug = False    # new_data.json + new_data2.json
enable_aug3 = True    # new_data3.json

# With every source disabled the training set would be empty; fail here, before
# any data is loaded or an experiment directory is created.
assert enable_orig or enable_aug or enable_aug3, (
    "enable at least one of enable_orig / enable_aug / enable_aug3"
)

batch_size = 8
epochs = 5

# Derive the numeric parts of the id from the values above so the experiment
# name cannot drift from the settings actually used.
# NOTE: the trailing dataset tag ("aug3-only") is still maintained by hand and
# must be kept in sync with the enable_* flags.
experiment_id = f"classification-bert-{max_tokenized_length}-tokens-{epochs}-epochs-aug3-only"

In [None]:
import os

# Directory for this run; later cells pass paths under it to memo_load and
# plt.savefig.
experiment_dir = f"completed-experiments/{experiment_id}"
# Refuse to reuse an existing directory (presumably so results of a finished
# experiment are not overwritten). The message tells the user how to proceed
# instead of failing with a bare AssertionError.
# NOTE(review): an identical cell with the same experiment_id appears earlier
# in this notebook; running both in one session trips this check because the
# earlier cell creates the directory.
assert not os.path.exists(experiment_dir), (
    f"{experiment_dir} already exists; choose a new experiment_id or remove "
    "the directory before re-running"
)
os.makedirs(experiment_dir)

In [None]:
from data_parsing import load_dataset, load_gen_dataset
# Load the original Yelp review training set; `data` is split into training and
# validation parts in the next cell. load_gen_dataset is imported here for the
# optional generated datasets read by that same cell.
data = load_dataset("./yelp_review_training_dataset.jsonl")

In [None]:
from training_utils import split_train_validation

# Split the loaded dataset into training and validation parts.
# NOTE(review): 0.01 is presumably the fraction held out for validation -
# confirm in training_utils.
train_x, valid_x, train_y, valid_y = split_train_validation(data, 0.01)

# The validation split is always kept; the original training split is
# discarded when that source is disabled.
if not enable_orig:
    train_x, train_y = [], []

# Optional generated datasets: element 0 of every record is appended to
# train_x and element 1 to train_y.
if enable_aug:
    aug_data = load_gen_dataset("./new_data.json") + load_gen_dataset("./new_data2.json")
    train_x += [record[0] for record in aug_data]
    train_y += [record[1] for record in aug_data]

if enable_aug3:
    aug_data3 = load_gen_dataset("./new_data3.json")
    train_x += [record[0] for record in aug_data3]
    train_y += [record[1] for record in aug_data3]

In [None]:
# Sanity check: sizes of the training inputs/labels and validation
# inputs/labels, one number per line in that order.
print(len(train_x), len(train_y), len(valid_x), len(valid_y), sep="\n")

In [None]:
from transformers import DistilBertTokenizerFast
# Load the pretrained 'distilbert-base-uncased' fast tokenizer.
# NOTE(review): this rebinds `tokenizer`, replacing the object imported from
# segtok under the same name in an earlier cell.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [None]:
def _encode_texts(texts):
    """Tokenize each text on its own and collect token ids and attention masks.

    Every text is truncated or padded to ``max_tokenized_length``. Index ``[0]``
    picks the first encoding out of the tokenizer result, and its ``ids`` and
    ``attention_mask`` are appended to the two returned lists.

    Returns:
        (ids, masks): two lists with one entry per input text, in input order.
    """
    ids, masks = [], []
    for text in tqdm(texts):
        encoding = tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=max_tokenized_length,
        )[0]
        ids.append(encoding.ids)
        masks.append(encoding.attention_mask)
    return ids, masks


# Training texts first, then validation texts (one progress bar each).
train_x_numerized, train_x_mask = _encode_texts(train_x)
valid_x_numerized, valid_x_mask = _encode_texts(valid_x)

In [None]:
import numpy as np
from utils import memo_load

# Convert the six Python lists (token ids, attention masks, labels for the
# training and validation sets) to NumPy arrays in one pass.
(
    train_x_numerized, train_x_mask, train_y,
    valid_x_numerized, valid_x_mask, valid_y,
) = (
    np.array(values)
    for values in (
        train_x_numerized, train_x_mask, train_y,
        valid_x_numerized, valid_x_mask, valid_y,
    )
)

# Route the arrays through memo_load, keyed by a path inside the experiment
# directory.
# NOTE(review): presumably memo_load stores the callable's result at that path
# and returns it (or returns a previously stored copy) - confirm in utils.
(
    train_x_numerized, train_x_mask, train_y,
    valid_x_numerized, valid_x_mask, valid_y,
) = memo_load(
    lambda: (train_x_numerized, train_x_mask, train_y, valid_x_numerized, valid_x_mask, valid_y),
    f"{experiment_dir}/training_data",
)

In [None]:
import torch as th
import torch.optim as optim
from model import ReviewPredictionModel

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
print(device)

# Drop any reference left by a previous run of this cell before building a new
# model (presumably so the old model can be freed first - confirm).
model_to_train = None
# NOTE(review): the meaning of the first constructor argument (0) is not
# visible in this notebook; the second is the fixed token length.
model_to_train = ReviewPredictionModel(0, max_tokenized_length)
model_to_train.to(device)
optimizer = optim.Adam(params=model_to_train.parameters(), lr=1e-5)

In [None]:
from training_utils import run_training_loop
from utils import memo_load

# Train the model; the call returns the recorded training and validation
# accuracies.
training_accuracies, validation_accuracies = run_training_loop(
    model_to_train,
    optimizer,
    device,
    batch_size,
    epochs,
    train_x_numerized,
    train_x_mask,
    train_y,
    valid_x_numerized,
    valid_x_mask,
    valid_y,
    model_id=experiment_id,
)

# Route both accuracy histories through memo_load, keyed by a path inside the
# experiment directory.
# NOTE(review): presumably this persists them next to the other run artifacts -
# confirm in utils.
training_accuracies, validation_accuracies = memo_load(
    lambda: (training_accuracies, validation_accuracies),
    f"{experiment_dir}/training_validation_accuracies",
)

In [None]:
import matplotlib.pyplot as plt

# Training accuracy: one point per recorded entry, x = entry index.
plt.plot(
    range(len(training_accuracies)),
    training_accuracies,
    label="Training Accuracy",
)
# Validation accuracy: entries are spaced 100 apart on the x axis.
# NOTE(review): assumes validation runs every 100 training iterations -
# confirm against run_training_loop.
plt.plot(
    [100 * step for step in range(len(validation_accuracies))],
    validation_accuracies,
    label="Validation Accuracy",
)
plt.xlabel("Training Iteration")
plt.ylabel("Accuracy")
plt.legend()
# Store the figure alongside the other artifacts of this experiment.
plt.savefig(f"{experiment_dir}/training-plot.png")