In [1]:
from google.colab import drive
import pandas as pd
import torch
import spacy
import torch.nn as nn

import locale

def get_preferred_encoding(do_set_locale=True):
  """Report UTF-8 as the preferred encoding, regardless of the real locale.

  Args:
    do_set_locale: Accepted only so the signature stays call-compatible with
      `locale.getpreferredencoding`; the value is never read.

  Returns:
    The constant string 'UTF-8'.
  """
  return 'UTF-8'


# Replace the stdlib lookup for the rest of this kernel session.
# NOTE(review): presumably a workaround for Colab cells that insist on a
# UTF-8 locale -- confirm it is still needed.
locale.getpreferredencoding = get_preferred_encoding

In [2]:
# Install the spaCy pipeline package that a later cell loads with
# spacy.load('en_core_web_md'). The installer output warns that the runtime
# may need a restart before the freshly installed package can be loaded.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m65.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
# Mount Google Drive under /content/drive so the dataset path used by the
# loading cell resolves; force_remount=True asks Colab to redo the mount.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Number of jokes (leading rows of the CSV) used by the rest of the notebook.
NUM_JOKES = 50

csv_file = pd.read_csv('/content/drive/MyDrive/Proiect NLP/Preprocessed-Datasets/shortjokes/shortjokes.csv')

# Keep only the joke text column, truncated to the first NUM_JOKES rows.
data = csv_file['Body'].iloc[:NUM_JOKES].to_numpy()

# Prefer the GPU when one is available.
# NOTE(review): no cell visible in this notebook moves the model or the
# embeddings onto `device` -- confirm whether that is intended.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [4]:
# Load the spaCy pipeline installed above. Despite the variable name this is
# a spaCy pipeline object, used below only to tokenize text and to read the
# per-token `.vector` values and `vocab.vectors_length`.
llm = spacy.load('en_core_web_md')

In [5]:
def tokenize_and_embed(jokes, llm):
  """Tokenize every joke and collect one vector per token.

  Args:
    jokes: Sized iterable of joke strings (`len(jokes)` is used in the
      progress message).
    llm: Callable applied to each joke; its result is iterated and every
      item must expose `.text` and `.vector`.

  Returns:
    A pair `(tokens, embeddings)` of parallel lists with one entry per joke:
    `tokens[i]` is the list of token texts of joke `i`, and `embeddings[i]`
    is the tensor obtained by stacking that joke's token vectors on dim 0.
  """
  all_texts, all_vectors = [], []

  for position, joke in enumerate(jokes, start=1):
    doc = llm(joke)

    all_texts.append([tok.text for tok in doc])

    per_token_vectors = [torch.tensor(tok.vector) for tok in doc]
    all_vectors.append(torch.stack(per_token_vectors))

    # Emit a progress line every 250 jokes.
    if position % 250 == 0:
      print(f'Joke {position}/{len(jokes)} tokenized and embedded')

  return all_texts, all_vectors

In [6]:
# Run the spaCy pipeline over the selected jokes once; `tokens` and
# `embeddings` are parallel per-joke lists consumed by training below.
tokens, embeddings = tokenize_and_embed(data, llm)

In [7]:
class RNNModel(nn.Module):
  """Stacked GRU followed by a linear projection applied at every time step.

  The GRU is created with `batch_first=True`, so inputs are laid out as
  (batch, sequence, input_size).

  Args:
    input_size: Width of each input vector.
    hidden_size: Width of the GRU hidden state.
    output_size: Width of each projected output vector.
    num_layers: Number of stacked GRU layers. Defaults to 3, the value that
      was previously hard-coded.
  """

  def __init__(self, input_size, hidden_size, output_size, num_layers=3):
    super().__init__()

    self.input_size = input_size
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.num_layers = num_layers

    self.rnn = nn.GRU(
        input_size=self.input_size,
        hidden_size=self.hidden_size,
        num_layers=self.num_layers,
        batch_first=True
    )

    self.output_layer = nn.Linear(
        in_features=self.hidden_size,
        out_features=self.output_size
    )

  def forward(self, input, hidden_state=None):
    """Run the GRU over `input` and project every time step.

    Args:
      input: Tensor of shape (batch, sequence, input_size).
      hidden_state: Hidden state to resume from, or None (the default) to
        let the GRU start from its zero initial state.

    Returns:
      A pair `(output, hidden_state)`: the projected GRU outputs for every
      time step and the hidden state returned by the GRU.
    """
    rnn_output, hidden_state = self.rnn(input, hidden_state)
    output = self.output_layer(rnn_output)

    return output, hidden_state

In [8]:
# Width of the spaCy vectors; the model both consumes and predicts vectors
# of this width, so it is used for the input and the output size alike.
EMBEDDING_SIZE = llm.vocab.vectors_length

rnn_model = RNNModel(
    input_size=EMBEDDING_SIZE,
    hidden_size=512,
    output_size=EMBEDDING_SIZE,
)

In [9]:
def train_model(model, tokens, embeddings, learning_rate, num_epochs):
  """Train `model` to predict each token's embedding from the preceding tokens.

  Every joke is fed as one teacher-forced sequence: the inputs are the
  embeddings of all tokens but the last, the targets are the same embeddings
  shifted by one position, and the loss is the MSE between the model outputs
  and the targets. Gradients therefore flow through the whole sequence, and
  one optimizer step is taken per joke (the earlier per-token loop detached
  the hidden state before every step, so nothing was back-propagated through
  time). Fewer, larger updates are made per epoch, so more epochs may be
  needed than before.

  Args:
    model: Module called as `model(inputs, hidden_state)` with `inputs` of
      shape (1, sequence, embedding) and returning `(outputs, hidden_state)`.
    tokens: List with one list of token texts per joke; only the lengths are
      used.
    embeddings: List parallel to `tokens`; `embeddings[i]` holds one row per
      token of joke `i`.
    learning_rate: Learning rate passed to Adam.
    num_epochs: Number of passes over all jokes.

  Returns:
    None. Progress and the mean per-joke loss of each epoch are printed.
  """
  model.train()

  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  # Train on whatever device the model already lives on, so the function
  # works whether or not the caller moved the model to a GPU.
  model_device = next(model.parameters()).device

  for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_trained_jokes = 0

    for joke_idx, joke in enumerate(tokens):
      num_tokens = len(joke)

      # At least two tokens are needed to form one (current -> next) pair.
      if num_tokens >= 2:
        joke_embeddings = embeddings[joke_idx][:num_tokens].to(model_device)

        inputs = joke_embeddings[:-1].unsqueeze(0)
        targets = joke_embeddings[1:].unsqueeze(0)

        optimizer.zero_grad()
        # Each joke starts from a fresh (None) hidden state.
        output, _ = model(inputs, None)
        loss = nn.functional.mse_loss(output, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_trained_jokes += 1

      print(f'Trained on joke {joke_idx + 1}/{len(tokens)}')

    # Average over the jokes that actually produced a loss; guards against
    # dividing by zero when there was nothing to train on.
    mean_loss = epoch_loss / num_trained_jokes if num_trained_jokes else float('nan')
    print(f"Epoch {epoch + 1}/{num_epochs} Loss: {mean_loss}")



In [10]:
# Train for a single epoch with learning rate 0.001.
train_model(rnn_model, tokens, embeddings, 0.001, 1)

Trained on joke 1/50
Trained on joke 2/50
Trained on joke 3/50
Trained on joke 4/50
Trained on joke 5/50
Trained on joke 6/50
Trained on joke 7/50
Trained on joke 8/50
Trained on joke 9/50
Trained on joke 10/50
Trained on joke 11/50
Trained on joke 12/50
Trained on joke 13/50
Trained on joke 14/50
Trained on joke 15/50
Trained on joke 16/50
Trained on joke 17/50
Trained on joke 18/50
Trained on joke 19/50
Trained on joke 20/50
Trained on joke 21/50
Trained on joke 22/50
Trained on joke 23/50
Trained on joke 24/50
Trained on joke 25/50
Trained on joke 26/50
Trained on joke 27/50
Trained on joke 28/50
Trained on joke 29/50
Trained on joke 30/50
Trained on joke 31/50
Trained on joke 32/50
Trained on joke 33/50
Trained on joke 34/50
Trained on joke 35/50
Trained on joke 36/50
Trained on joke 37/50
Trained on joke 38/50
Trained on joke 39/50
Trained on joke 40/50
Trained on joke 41/50
Trained on joke 42/50
Trained on joke 43/50
Trained on joke 44/50
Trained on joke 45/50
Trained on joke 46/

In [13]:
def find_most_similar_token(current_token_embedding, llm):
  """Return the text of the vocabulary entry closest to an embedding.

  Closeness is cosine similarity, computed in one batched call against every
  entry of `llm.vocab` that reports `has_vector`.

  Args:
    current_token_embedding: Tensor holding exactly one embedding; any shape
      is accepted and flattened, e.g. (1, 1, D) or (D,).
    llm: Object whose `vocab` is iterable and yields entries exposing
      `has_vector`, `vector` and `text`.
      NOTE(review): only entries yielded by iterating `llm.vocab` are
      candidates -- confirm this covers the intended vocabulary.

  Returns:
    The `text` of the most similar entry (the first one on ties), or None
    when no entry has a vector.

  Raises:
    ValueError: If the embedding width differs from the vocabulary vectors.
  """
  # Flatten without relying on a notebook-level size constant, and drop
  # autograd history and device placement: the lookup is a plain CPU search.
  query = current_token_embedding.detach().reshape(-1).cpu()

  candidates = [lexeme for lexeme in llm.vocab if lexeme.has_vector]
  if not candidates:
    return None

  candidate_matrix = torch.stack(
      [torch.tensor(lexeme.vector) for lexeme in candidates]
  ).to(query.dtype)

  # Fail loudly instead of letting a width-1 query broadcast silently.
  if candidate_matrix.shape[1] != query.numel():
    raise ValueError(
        f'embedding has {query.numel()} values but vocabulary vectors have '
        f'{candidate_matrix.shape[1]}'
    )

  similarities = nn.functional.cosine_similarity(
      candidate_matrix, query.unsqueeze(0), dim=1
  )

  # argmax yields the first maximal index, matching a strict `>` linear scan.
  best_index = int(torch.argmax(similarities))
  return candidates[best_index].text


def generate_joke(model, llm, joke_num_tokens):
  """Generate text by repeatedly feeding the model its own output.

  Generation starts from an all-zero embedding and a fresh hidden state. At
  every step the model output becomes the next input, and the vocabulary
  entry closest to that output is appended to the joke.

  Args:
    model: Module called as `model(embedding, hidden_state)` with an
      embedding of shape (1, 1, width) and returning
      `(next_embedding, hidden_state)`. It is switched to eval mode.
    llm: spaCy pipeline; `llm.vocab.vectors_length` gives the embedding
      width and the object is forwarded to `find_most_similar_token`.
    joke_num_tokens: Number of generation steps to run.

  Returns:
    The generated tokens joined by single spaces. Steps for which no
    vocabulary entry could be found are skipped.
  """
  model.eval()

  # Read the width from the pipeline rather than from a notebook global.
  embedding_size = llm.vocab.vectors_length

  # Create the seed on the model's device so generation also works on a GPU.
  first_parameter = next(model.parameters(), None)
  if first_parameter is not None:
    model_device = first_parameter.device
  else:
    model_device = torch.device('cpu')

  joke = []
  current_hidden_state = None
  current_token_embedding = torch.zeros(1, 1, embedding_size, device=model_device)

  # Inference only: without no_grad every step would extend one autograd
  # graph spanning the entire generated sequence.
  with torch.no_grad():
    for _ in range(joke_num_tokens):
      current_token_embedding, current_hidden_state = model(
          current_token_embedding, current_hidden_state
      )

      # The vocabulary lookup compares against CPU vectors.
      token_text = find_most_similar_token(current_token_embedding.cpu(), llm)
      if token_text is not None:
        joke.append(token_text)

  return ' '.join(joke)

In [14]:
# Generate 100 tokens with the trained model and print them under a header.
joke = generate_joke(rnn_model, llm, 100)
print('Answer:', joke, sep='\n\n')

Answer:

if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if if
