In [None]:
from function_library import *

In [None]:
# Download the GloVe word vectors
url = "http://nlp.stanford.edu/data/glove.6B.zip"
output = "glove.6B.zip"


def download_progress(block_num, block_size, total_size):
    """Report hook for ``urllib.request.urlretrieve`` that prints progress in place.

    Args:
        block_num: Number of blocks transferred so far.
        block_size: Size of one block in bytes.
        total_size: Total size of the download in bytes; values <= 0 are
            treated as "size unknown".

    The line is rewritten in place using a leading carriage return and no
    trailing newline.
    """
    downloaded = block_num * block_size
    if total_size <= 0:
        # No usable total: a percentage would be negative or divide by zero,
        # so report the raw byte count instead.
        print(f"\rDownloading: {downloaded} bytes", end='')
        return
    # The final block is usually only partially filled, so the raw ratio can
    # exceed 100%; clamp it.
    progress = min(downloaded / total_size * 100, 100.0)
    print(f"\rDownloading: {progress:.2f}%", end='')

# Fetch the archive only when it is not already on disk, so re-running the
# notebook does not repeat the download. The data is written to a temporary
# ".part" file and renamed afterwards, so an interrupted download is never
# mistaken for a complete archive on the next run.
if not os.path.exists(output):
    partial_download = output + ".part"
    urllib.request.urlretrieve(url, partial_download, reporthook=download_progress)
    os.replace(partial_download, output)
    print()  # end the carriage-return progress line

# Use the custom ZipFileWithProgress class
with ZipFileWithProgress(output, 'r') as zip_ref:
    zip_ref.extractall("glove.6B")

# List the extracted files
print(os.listdir("glove.6B"))

In [None]:
# Load glove vectors from file
glove_file = "glove.6B/glove.6B.300d.txt"
glove_vectors = load_glove_vectors(glove_file)

# Get vocabulary from glove and create embeddings
vocab, inverse_vocab = get_vocabulary_from_glove(glove_vectors)
embedding_dim = 300
vocab_size = len(glove_vectors) + 2
embedding = nn.Embedding(vocab_size, embedding_dim)

# Copy each GloVe vector into its row of the embedding matrix. The first two
# rows are skipped (presumably reserved for special tokens - confirm against
# get_vocabulary_from_glove).
#
# The assignment must target `embedding.weight` itself. Indexing first
# (`embedding.weight[i].data = ...`) only rebinds the data of a temporary
# tensor and leaves the real weights at their random initial values.
# `torch.no_grad()` is needed because the weight is a leaf tensor that
# requires grad and is being modified in place.
with torch.no_grad():
    for i, word in enumerate(inverse_vocab[2:], start=2):
        embedding.weight[i] = torch.as_tensor(glove_vectors[word])

print("Embedding layer created with shape:", embedding.weight.shape)

In [None]:
# Load the reviews and clean them in one chain, so re-running the cell always
# starts from the raw CSV and no column is assigned into a filtered slice
# (which triggers pandas' SettingWithCopyWarning).
#   1. drop rows with any missing value
#   2. drop Score == 3 rows
#   3. binarize the rest: scores 1-2 -> 0, scores 4-5 -> 1
df = (
    pd.read_csv('Reviews.csv')
    .dropna()
    .loc[lambda frame: frame['Score'] != 3]
    .assign(Score=lambda frame: frame['Score'].replace({1: 0, 2: 0, 4: 1, 5: 1}))
)
df.head()

In [None]:
# Split dataframe in 2: Test and Train
# X represents the words, y represents the Scores
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Score'], test_size=0.2, random_state=33)

# Sort the classes so the label -> index mapping is deterministic; iteration
# order of a plain set is not guaranteed.
classes = sorted(set(y_train))
class_to_index = {label: position for position, label in enumerate(classes)}

# Turn the Scores into column tensors of class indices, shape (n_samples, 1)
y_train_bin = torch.tensor([class_to_index[y] for y in y_train]).unsqueeze(1)
y_test_bin = torch.tensor([class_to_index[z] for z in y_test]).unsqueeze(1)

# Train the tokenizer
tokenizer = MyTokenizer(sentence_length=100)
tokenizer.fit(X_train)

# Create training batches
# NOTE(review): batch_size=568000 presumably exceeds the number of training
# rows, making every epoch a single full batch - confirm this is intended.
dataset = TextDataset(list(X_train), y_train_bin, tokenizer)
dataloader = DataLoader(dataset, batch_size=568000, shuffle=True)

In [None]:
# Build the classifier from a single keyword dictionary, then create the loss
# and the optimizer that updates the classifier's parameters.
model_kwargs = dict(
    vocab_size=tokenizer.vocab_size,
    embedding_layer=embedding,
    embedding_dim=300,
    hidden_dim=500,
    output_dim=1,
    n_special_tokens=2,
    n_layers=3,
)
model = MyOtherClassifier(**model_kwargs)

# Binary cross entropy computed directly from the model's raw outputs (logits)
loss_fn = nn.BCEWithLogitsLoss()

# lr is the learning rate (the step size, "alpha")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [None]:
# Train for the necessary amount of epochs it takes to stabilize.
# `losses` collects the mean batch loss of every epoch for plotting.
losses = []
model.train()  # make sure the model is in training mode before optimizing
for epoch in tqdm(range(500)):
    epoch_loss = 0
    for batch in dataloader:
        X_train_vect, y_train_vect = batch
        optimizer.zero_grad()
        # Named `logits` rather than `output`: `output` already holds the
        # GloVe zip path from the download cell and must not be overwritten
        # with a tensor.
        logits = model(X_train_vect)
        # BCEWithLogitsLoss needs float targets
        loss = loss_fn(logits, y_train_vect.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    losses.append(epoch_loss / len(dataloader))

In [None]:
# Persist the trained weights and the last input batch seen during training
torch.save(model.state_dict(), "model.pt")
torch.save(X_train_vect, "vector.pt")

# Draw the per-epoch loss curve on an explicit figure/axes pair, write it to
# disk, then display it
fig, ax = plt.subplots(figsize=(3, 2))
ax.plot(losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
fig.savefig('graph.png')
plt.show()

In [None]:
# Rebuild the classifier with the same hyperparameters used for training and
# restore the weights saved in "model.pt".
saved_model = MyOtherClassifier(vocab_size=tokenizer.vocab_size,
                          embedding_layer=embedding,
                          embedding_dim=300, 
                          hidden_dim=500,
                          output_dim=1,
                          n_special_tokens=2,
                          n_layers=3)

saved_model.load_state_dict(torch.load("model.pt", weights_only=True))
saved_model.eval()

# Load the saved input batch with the same restricted unpickler used for the
# weights above.
my_vector = torch.load("vector.pt", weights_only=True)

# Inference only: disable autograd so no computation graph is built for the
# forward pass over the saved batch.
with torch.no_grad():
    compilation_output = saved_model(my_vector)

In [None]:
# The model is trained with nn.BCEWithLogitsLoss, so its outputs are raw
# logits, not probabilities. Convert them with a sigmoid before applying the
# 0.5 decision threshold (equivalent to testing logit > 0).
classification = torch.sigmoid(compilation_output) > 0.5

# Round-trip the boolean predictions through disk, then convert to a list
torch.save(classification, "classification.pt")
my_classification = torch.load("classification.pt")
classification_list = my_classification.tolist()
print(classification_list[0])

In [None]:
# Accuracy of the predictions on the saved batch.
#
# The predictions were computed from `X_train_vect`, the last batch drawn from
# a DataLoader created with shuffle=True. That batch's rows are NOT in the
# same order as `y_train`, so the predictions are scored against
# `y_train_vect`, the labels delivered together with that batch.
y_train_list = y_train.to_list()  # kept for any later cell that uses it
batch_labels = y_train_vect.reshape(-1).tolist()

# Normalize predictions to plain 0/1 ints; entries are either [bool] (column
# output) or already scalar.
classification_list = [
    int(pred[0]) if isinstance(pred, list) else int(pred)
    for pred in classification_list
]

if len(classification_list) != len(batch_labels):
    raise ValueError(
        f"{len(classification_list)} predictions but {len(batch_labels)} labels"
    )

score_counter = len(classification_list)
score_total_accuracy = sum(
    1 for pred, label in zip(classification_list, batch_labels) if pred == label
)
print(score_total_accuracy/score_counter)
