In [12]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import Counter
import re

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [13]:
class RNNNumPy:
    def __init__(self, vocab_size, hidden_size):
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        self.W_hx = np.random.randn(hidden_size, vocab_size) * 0.01

        self.W_hh = np.random.randn(hidden_size, hidden_size) * 0.01

        self.W_hy = np.random.randn(vocab_size, hidden_size) * 0.01
        
        self.b_h = np.zeros((hidden_size, 1)) 
        self.b_y = np.zeros((vocab_size, 1))  
        
    def forward(self, inputs):
        h_states = {}
        outputs = {}
        h_states[-1] = np.zeros((self.hidden_size, 1))
        
        loss = 0
        for t, word_idx in enumerate(inputs):
            x_t = np.zeros((self.vocab_size, 1))
            x_t[word_idx] = 1
            
            h_states[t] = np.tanh(np.dot(self.W_hx, x_t) + np.dot(self.W_hh, h_states[t-1]) + self.b_h)
            
            y_t = np.dot(self.W_hy, h_states[t]) + self.b_y

            outputs[t] = np.exp(y_t) / np.sum(np.exp(y_t))
            
        return outputs, h_states

In [14]:
df = pd.read_csv('/kaggle/input/poems-100-csv/poems-100 - poems-100.csv')
text_data = " ".join(df['text'].dropna().tolist())


def tokenize(text):
    text = text.lower()
    words = re.findall(r'\b\w+\b|[.,!?;]', text)
    return words

tokens = tokenize(text_data)

word_counts = Counter(tokens)
vocab = sorted(word_counts, key=word_counts.get, reverse=True)
vocab_size = len(vocab)

word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for idx, word in enumerate(vocab)}

print(f"Total tokens: {len(tokens)}")
print(f"Vocabulary size: {vocab_size}")

seq_length = 20
data_x = []
data_y = []

for i in range(0, len(tokens) - seq_length):
    seq_in = tokens[i : i + seq_length]
    seq_out = tokens[i + 1 : i + seq_length + 1] # Predict the next word in the sequence
    data_x.append([word2idx[word] for word in seq_in])
    data_y.append([word2idx[word] for word in seq_out])

X_tensor = torch.tensor(data_x, dtype=torch.long)
Y_tensor = torch.tensor(data_y, dtype=torch.long)

from torch.utils.data import TensorDataset, DataLoader
dataset = TensorDataset(X_tensor, Y_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

Total tokens: 29175
Vocabulary size: 5163


In [15]:
class RNNOneHot(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers=1):
        super(RNNOneHot, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.rnn = nn.RNN(input_size=vocab_size, hidden_size=hidden_size, 
                          num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, hidden):
        x_one_hot = F.one_hot(x, num_classes=self.vocab_size).float()
        
        out, hidden = self.rnn(x_one_hot, hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

hidden_dim = 128
model_onehot = RNNOneHot(vocab_size, hidden_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer_onehot = optim.Adam(model_onehot.parameters(), lr=0.005)

epochs = 50
print("Training--One-Hot--RNN")
for epoch in range(epochs):
    model_onehot.train()
    total_loss = 0
    
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        hidden = model_onehot.init_hidden(batch_x.size(0))
        
        optimizer_onehot.zero_grad()
        output, hidden = model_onehot(batch_x, hidden)
        
        loss = criterion(output.view(-1, vocab_size), batch_y.view(-1))
        loss.backward()
        optimizer_onehot.step()
        
        total_loss += loss.item()
        
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")

Training--One-Hot--RNN
Epoch [1/50], Loss: 5.5285
Epoch [2/50], Loss: 1.8528
Epoch [3/50], Loss: 0.6141
Epoch [4/50], Loss: 0.4359
Epoch [5/50], Loss: 0.3842
Epoch [6/50], Loss: 0.3610
Epoch [7/50], Loss: 0.3471
Epoch [8/50], Loss: 0.3404
Epoch [9/50], Loss: 0.3346
Epoch [10/50], Loss: 0.3294
Epoch [11/50], Loss: 0.3259
Epoch [12/50], Loss: 0.3232
Epoch [13/50], Loss: 0.3195
Epoch [14/50], Loss: 0.3185
Epoch [15/50], Loss: 0.3169
Epoch [16/50], Loss: 0.3138
Epoch [17/50], Loss: 0.3119
Epoch [18/50], Loss: 0.3110
Epoch [19/50], Loss: 0.3097
Epoch [20/50], Loss: 0.3074
Epoch [21/50], Loss: 0.3069
Epoch [22/50], Loss: 0.3064
Epoch [23/50], Loss: 0.3052
Epoch [24/50], Loss: 0.3041
Epoch [25/50], Loss: 0.3035
Epoch [26/50], Loss: 0.3026
Epoch [27/50], Loss: 0.3017
Epoch [28/50], Loss: 0.3006
Epoch [29/50], Loss: 0.3002
Epoch [30/50], Loss: 0.2998
Epoch [31/50], Loss: 0.2987
Epoch [32/50], Loss: 0.2989
Epoch [33/50], Loss: 0.2982
Epoch [34/50], Loss: 0.2979
Epoch [35/50], Loss: 0.2977
Epoch 

In [16]:
class RNNEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=1):
        super(RNNEmbedding, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, 
                          num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, x, hidden):
        embedded = self.embedding(x)
        out, hidden = self.rnn(embedded, hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)

embed_dim = 64
model_embed = RNNEmbedding(vocab_size, embed_dim, hidden_dim).to(device)
optimizer_embed = optim.Adam(model_embed.parameters(), lr=0.005)

print("\n Training--Embedding--RNN")
for epoch in range(epochs):
    model_embed.train()
    total_loss = 0
    
    for batch_x, batch_y in dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        hidden = model_embed.init_hidden(batch_x.size(0))
        
        optimizer_embed.zero_grad()
        output, hidden = model_embed(batch_x, hidden)
        
        loss = criterion(output.view(-1, vocab_size), batch_y.view(-1))
        loss.backward()
        optimizer_embed.step()
        
        total_loss += loss.item()
        
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {total_loss/len(dataloader):.4f}")


 Training--Embedding--RNN
Epoch [1/50], Loss: 4.1066
Epoch [2/50], Loss: 1.4739
Epoch [3/50], Loss: 0.8581
Epoch [4/50], Loss: 0.6478
Epoch [5/50], Loss: 0.5510
Epoch [6/50], Loss: 0.5016
Epoch [7/50], Loss: 0.4713
Epoch [8/50], Loss: 0.4517
Epoch [9/50], Loss: 0.4390
Epoch [10/50], Loss: 0.4271
Epoch [11/50], Loss: 0.4209
Epoch [12/50], Loss: 0.4147
Epoch [13/50], Loss: 0.4094
Epoch [14/50], Loss: 0.4037
Epoch [15/50], Loss: 0.4003
Epoch [16/50], Loss: 0.3984
Epoch [17/50], Loss: 0.3941
Epoch [18/50], Loss: 0.3912
Epoch [19/50], Loss: 0.3897
Epoch [20/50], Loss: 0.3864
Epoch [21/50], Loss: 0.3860
Epoch [22/50], Loss: 0.3823
Epoch [23/50], Loss: 0.3824
Epoch [24/50], Loss: 0.3802
Epoch [25/50], Loss: 0.3785
Epoch [26/50], Loss: 0.3771
Epoch [27/50], Loss: 0.3768
Epoch [28/50], Loss: 0.3741
Epoch [29/50], Loss: 0.3748
Epoch [30/50], Loss: 0.3738
Epoch [31/50], Loss: 0.3718
Epoch [32/50], Loss: 0.3717
Epoch [33/50], Loss: 0.3698
Epoch [34/50], Loss: 0.3706
Epoch [35/50], Loss: 0.3691
Ep

In [17]:
def generate_text(model, start_text, num_words, is_one_hot=False):
    model.eval()
    words = tokenize(start_text)
    
    hidden = model.init_hidden(1)
    
    for _ in range(num_words):
        x_idx = [word2idx.get(w, word2idx[vocab[0]]) for w in words[-seq_length:]]
        x_tensor = torch.tensor([x_idx], dtype=torch.long).to(device)
        
        with torch.no_grad():
            output, hidden = model(x_tensor, hidden)
            
            last_word_logits = output[0, -1, :]

            probs = F.softmax(last_word_logits, dim=0).cpu().numpy()
            predicted_idx = np.random.choice(len(probs), p=probs)
            
            words.append(idx2word[predicted_idx])
            
    return " ".join(words)

seed_text = "the rose is red"
print("\nOne-Hot Generation:")
print(generate_text(model_onehot, seed_text, num_words=30, is_one_hot=True))

print("\nEmbedding Generation:")
print(generate_text(model_embed, seed_text, num_words=30, is_one_hot=False))


One-Hot Generation:
the rose is red , the violet s blue , sugar is sweet , and so are you . how do i love thee ? let me count the ways . i love thee

Embedding Generation:
the rose is red , the violet s blue , sugar is sweet , and so in man , o lord , to him , in vain the ocean settling in hollows and the
