In [1]:
import torch
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
import re
import requests

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
url = "https://cs.stanford.edu/people/karpathy/char-rnn/warpeace_input.txt"
response = requests.get(url)
text = response.text

# Add a space at the end of each line
text = "\n".join(line + " " for line in text.splitlines())

In [3]:
text = re.sub('[^a-zA-Z0-9 \.]', '', text)
text = text.lower().strip()
words = re.findall(r'\b\w+\b', text)
vocab = sorted(set(words))  # stores unique words in the training vocabulary
#text
print(words[:35])
print(len(words))

['well', 'prince', 'so', 'genoa', 'and', 'lucca', 'are', 'now', 'just', 'family', 'estates', 'of', 'the', 'buonapartes', 'but', 'i', 'warn', 'you', 'if', 'you', 'dont', 'tell', 'me', 'that', 'this', 'means', 'war', 'if', 'you', 'still', 'try', 'to', 'defend', 'the', 'infamies']
561460


  text = re.sub('[^a-zA-Z0-9 \.]', '', text)


In [4]:
# Create word-to-index and index-to-word mappings
stoi = {s: i+1 for i, s in enumerate(vocab)}  # '+1' to reserve index 0 for padding
stoi['.'] = 0  # Reserve index 0 for padding or end token
itos = {i: s for s, i in stoi.items()}
len(itos)

20068

In [6]:
block_size = 15  
X, Y = [], []   
i=0

context = [0] * block_size

for word in words + ['.']:  # Include end-of-sequence marker
    i=i+1
    ix = stoi[word]  # Get the index of the current word
    X.append(context)  # Append the current context to X
    Y.append(ix)       # Append the index of the current word to Y (target)

    if i<100:
        print(' '.join(itos[i] if i in itos else '.' for i in context), '--->', itos[ix])

    # Update context by sliding the window to include the next word index
    context = context[1:] + [ix]  # Move the window without reinitializing

# Convert lists to tensors and move to GPU if available
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

. . . . . . . . . . . . . . . ---> well
. . . . . . . . . . . . . . well ---> prince
. . . . . . . . . . . . . well prince ---> so
. . . . . . . . . . . . well prince so ---> genoa
. . . . . . . . . . . well prince so genoa ---> and
. . . . . . . . . . well prince so genoa and ---> lucca
. . . . . . . . . well prince so genoa and lucca ---> are
. . . . . . . . well prince so genoa and lucca are ---> now
. . . . . . . well prince so genoa and lucca are now ---> just
. . . . . . well prince so genoa and lucca are now just ---> family
. . . . . well prince so genoa and lucca are now just family ---> estates
. . . . well prince so genoa and lucca are now just family estates ---> of
. . . well prince so genoa and lucca are now just family estates of ---> the
. . well prince so genoa and lucca are now just family estates of the ---> buonapartes
. well prince so genoa and lucca are now just family estates of the buonapartes ---> but
well prince so genoa and lucca are now just family estates o

In [6]:
X.shape, Y.shape

(torch.Size([561461, 15]), torch.Size([561461]))

In [7]:
X[:15]

tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0, 19475],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0, 19475, 13578],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0, 19475, 13578, 16347],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0, 19475, 13578, 16347,  7493],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
         19475, 13578, 16347,  7493,   723],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0, 19475,
         13578, 16347,  7493,   723, 10582],
        [    0,     0,     0,     0,     0,     0,     0,     0, 19475, 13578,
         16347,  7493,   723, 10582,   994],
        

In [8]:
# Hyperparameters
block_size = 15  # How many words to use as context
embedding_dim = 32  # Size of word embeddings
hidden_size = 1024  # Size of hidden layer in MLP

In [9]:
class NextWord(nn.Module):
  def __init__(self, block_size, vocab_size, emb_dim, hidden_size):
    super().__init__()
    self.emb = nn.Embedding(vocab_size, emb_dim)
    self.lin1 = nn.Linear(block_size * emb_dim, hidden_size)
    self.lin2 = nn.Linear(hidden_size, vocab_size)

  def forward(self, x):
    x = self.emb(x)
    x = x.view(x.shape[0], -1)
    x = torch.tanh(self.lin1(x))
    x = self.lin2(x)
    return x

In [10]:
# Initialize model
vocab_size = len(stoi)
model = NextWord(block_size, vocab_size, embedding_dim, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
loss_fn = nn.CrossEntropyLoss()

In [11]:
model

NextWord(
  (emb): Embedding(20068, 32)
  (lin1): Linear(in_features=480, out_features=1024, bias=True)
  (lin2): Linear(in_features=1024, out_features=20068, bias=True)
)

In [12]:
def predict_next_words(model, stoi, itos, input_words, device, num_words=5):

    model.eval()  # Set model to evaluation mode
    context = [stoi.get(word, 0) for word in input_words[-block_size:]]
    sentence = input_words.copy()
    
    for _ in range(num_words):
        input_seq = torch.tensor(context, dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_seq)
            _, predicted = torch.max(output, dim=1)
            next_word = itos[predicted.item()]
            sentence.append(next_word)
            
            # Update context for the next prediction
            context = context[1:] + [predicted.item()]
    
    return ' '.join(sentence)

# Example usage of sentence generation for untrained model
input_words = ['','','','','','','','','','','this', 'is', 'a', 'sample', 'sentence']
print("Generated sentence:", predict_next_words(model, stoi, itos, input_words, device, num_words=15).strip())

Generated sentence: this is a sample sentence fix wantedwhen babyhood snapped ultimately attracted kamenka sonyas rakes robespierre earthly workman profound subjectthe petted


In [13]:
batch_size = 4096
print_every = 25
for epoch in range(501):
    for i in range(0, X.shape[0], batch_size):
        x = X[i:i+batch_size]
        y = Y[i:i+batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    if epoch % 10 ==0:
        print(epoch)
    if epoch % print_every == 0:
        print(epoch, loss.item())

0
0 7.209328651428223
10
20
25 1.5761239528656006
30
40
50
50 0.7671380639076233
60
70
75 0.440592885017395
80
90
100
100 0.2596025764942169
110
120
125 0.16512581706047058
130
140
150
150 0.11494843661785126
160
170
175 0.08619396388530731
180
190
200
200 0.06839853525161743
210
220
225 0.05643048882484436
230
240
250
250 0.04772890731692314
260
270
275 0.04104917496442795
280
290
300
300 0.03577254340052605
310
320
325 0.03156164288520813
330
340
350
350 0.02814546972513199
360
370
375 0.02529189921915531
380
390
400
400 0.022877603769302368
410
420
425 0.020785942673683167
430
440
450
450 0.018967704847455025
460
470
475 0.01738348975777626
480
490
500
500 0.01598852314054966


In [14]:
# Example usage of sentence generation for trained model
input_words = ['','','','','','','','','','','this', 'is', 'a', 'sample', 'sentence']
print("Generated sentence:", predict_next_words(model, stoi, itos, input_words, device, num_words=25).strip())

Generated sentence: this is a sample sentence of him he had made from the colonel of which had not merely passed up the hill and the battle of the regiment which had


In [15]:
# Assuming the model is already trained
model_path = 'next_word_model_32_15_tanh_500.pth'
torch.save(model.state_dict(), model_path)

In [16]:
import os
os.chdir(r'/kaggle/working')

In [17]:
from IPython.display import FileLink
FileLink(r'next_word_model_32_15_tanh_500.pth')      