In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Toy training corpus: eight short, already-lowercase sentences.
# Order matters downstream (it fixes the order of the training pairs),
# so the sentences are only grouped with comments, never reordered.
corpus = [
    # "i love to ..." sentences sharing a three-word prefix
    "i love to eat pizza",
    "i love to eat pasta",
    "i love to write code",
    # sentences about rnn
    "i learn rnn",
    "rnn is simple",
    "rnn is cool",
    # sentences about programming
    "i like programming",
    "programming is fun",
]

In [3]:
# Every distinct lowercase, whitespace-separated token in the corpus
words = {
    token
    for sentence in corpus
    for token in sentence.lower().split()
}

# Alphabetical order gives each word a stable position (and thus a stable index)
vocab = sorted(words)
vocab_size = len(vocab)

print("Vocabulary:", vocab)
print("Vocabulary size:", vocab_size)

Vocabulary: ['code', 'cool', 'eat', 'fun', 'i', 'is', 'learn', 'like', 'love', 'pasta', 'pizza', 'programming', 'rnn', 'simple', 'to', 'write']
Vocabulary size: 16


In [6]:
# Two-way lookup between a word and its position in the sorted vocabulary
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

print("Word to Index:", word_to_idx)

Word to Index: {'code': 0, 'cool': 1, 'eat': 2, 'fun': 3, 'i': 4, 'is': 5, 'learn': 6, 'like': 7, 'love': 8, 'pasta': 9, 'pizza': 10, 'programming': 11, 'rnn': 12, 'simple': 13, 'to': 14, 'write': 15}


In [7]:
# Each sentence is expanded into (prefix, next word) training pairs.
# For "i love to eat pizza" this gives:
#   (["i"],                      "love")
#   (["i", "love"],              "to")
#   (["i", "love", "to"],        "eat")
#   (["i", "love", "to", "eat"], "pizza")

sequences = []

for sentence in corpus:
    tokens = sentence.lower().split()
    # split_at is the index of the target word; everything before it is context
    sequences.extend(
        (tokens[:split_at], tokens[split_at])
        for split_at in range(1, len(tokens))
    )

print("Total training sequences:", len(sequences))
print("Sample sequence:", sequences[0])

Total training sequences: 22
Sample sequence: (['i'], 'love')


In [8]:
class WordRNN(nn.Module):
    """Next-word predictor: embedding -> single RNN layer -> linear projection.

    ``vocab_size`` sizes both the embedding table and the output logits,
    ``embed_size`` is the width of the word vectors fed to the RNN, and
    ``hidden_size`` is the width of the RNN state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()

        # Word index -> dense vector lookup
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # batch_first=True: tensors are laid out (batch, seq, feature)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)

        # Maps an RNN state to one logit per vocabulary entry
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-word logits for a batch of index sequences.

        ``x`` holds word indices shaped (batch_size, seq_len); the result is
        shaped (batch_size, vocab_size). Only the RNN output at the last
        time step is projected.
        """
        embedded = self.embedding(x)           # (batch_size, seq_len, embed_size)
        step_outputs, _ = self.rnn(embedded)   # (batch_size, seq_len, hidden_size)
        final_step = step_outputs[:, -1, :]    # (batch_size, hidden_size)
        return self.fc(final_step)             # (batch_size, vocab_size)

In [9]:
# Hyperparameters (everything a reader might want to tune lives here)
SEED = 42
EMBED_SIZE = 10
HIDDEN_SIZE = 32
LEARNING_RATE = 0.01
EPOCHS = 200

# Seed torch's RNG *before* the model is constructed so the random weight
# initialisation is the same on every Restart & Run All.
torch.manual_seed(SEED)

model = WordRNN(vocab_size, EMBED_SIZE, HIDDEN_SIZE)
# CrossEntropyLoss takes the raw logits produced by the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [10]:
print("Starting training...")

# autocomplete() leaves the model in eval mode; switch back explicitly so
# re-running this cell after a prediction still trains in training mode.
model.train()

# The word -> index -> tensor conversion does not depend on the epoch,
# so build the tensors once instead of once per epoch.
training_pairs = [
    (
        torch.tensor([word_to_idx[w] for w in context]).unsqueeze(0),  # (1, seq_len)
        torch.tensor([word_to_idx[target]]),                           # (1,)
    )
    for context, target in sequences
]

for epoch in range(EPOCHS):
    total_loss = 0.0

    # One optimizer step per (context, target) pair, i.e. batch size 1
    for context_tensor, target_tensor in training_pairs:
        # Forward pass
        optimizer.zero_grad()
        output = model(context_tensor)

        # Loss and backprop
        loss = criterion(output, target_tensor)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-pair loss every 20 epochs
    if (epoch + 1) % 20 == 0:
        print(f"Epoch [{epoch+1}/{EPOCHS}], Loss: {total_loss/len(sequences):.4f}")

print("Training finished.")

Starting training...
Epoch [20/200], Loss: 0.5291
Epoch [40/200], Loss: 0.5123
Epoch [60/200], Loss: 0.5040
Epoch [80/200], Loss: 0.5016
Epoch [100/200], Loss: 0.4962
Epoch [120/200], Loss: 0.4957
Epoch [140/200], Loss: 0.4886
Epoch [160/200], Loss: 0.4906
Epoch [180/200], Loss: 0.4853
Epoch [200/200], Loss: 0.4893
Training finished.


In [11]:
def autocomplete(model, seed_text, n_words=3):
    """Greedily extend ``seed_text`` by up to ``n_words`` predicted words.

    The seed is lowercased and split on whitespace. At each step the whole
    word list so far is converted to indices with the notebook-level
    ``word_to_idx`` mapping, fed to ``model`` as a (1, seq_len) tensor, and
    the arg-max index is mapped back through ``idx_to_word`` and appended.

    Args:
        model: Module called with the index tensor; its output is arg-maxed
            along dim 1. It is switched to eval mode and left that way.
        seed_text: Starting text.
        n_words: Maximum number of words to append (default 3).

    Returns:
        The lowercased seed words plus the generated words, joined by single
        spaces. An empty or whitespace-only seed returns ``""``. If a word is
        missing from ``word_to_idx``, a message is printed and the words
        collected so far are returned.
    """
    model.eval()

    generated_words = seed_text.lower().split()

    # An empty word list would become an empty float32 tensor, which the
    # embedding lookup rejects with an opaque RuntimeError. Report it the same
    # way an unknown word is reported and return without calling the model.
    if not generated_words:
        print("Seed text is empty; nothing to complete.")
        return ""

    with torch.no_grad():
        for _ in range(n_words):
            try:
                context_idxs = [word_to_idx[w] for w in generated_words]
            except KeyError as e:
                print(f"Word '{e.args[0]}' not in vocabulary.")
                break

            # Explicit integer dtype: embedding lookups require integer indices
            context_tensor = torch.tensor(context_idxs, dtype=torch.long).unsqueeze(0)
            output = model(context_tensor)

            # Greedy decoding: always take the highest-scoring word
            pred_idx = torch.argmax(output, dim=1).item()
            generated_words.append(idx_to_word[pred_idx])

    return " ".join(generated_words)

In [12]:
print("\n--- Autocomplete Examples ---")

# (seed text, number of words to generate) for each demo line
examples = [
    ("i love", 2),
    ("rnn is", 2),
    ("i learn", 1),
    ("programming is", 1),
]

for seed, n_new in examples:
    print(f"{seed} →", autocomplete(model, seed, n_new))


--- Autocomplete Examples ---
i love → i love to eat
rnn is → rnn is simple programming
i learn → i learn rnn
programming is → programming is fun
