In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

# Tiny in-memory corpus the language model is trained on
text = """
Natural language processing is a field of artificial intelligence.
It focuses on the interaction between computers and humans in natural language.
The goal of NLP is to enable computers to understand, interpret, and generate human language.
RNNs are commonly used in NLP for tasks like language modeling and sentiment analysis.
"""

# Tokenize: lowercase, then split on whitespace (newlines included).
# Punctuation stays attached to its word, so "language." and "language"
# are distinct vocabulary entries.
words = text.lower().split()
vocab = sorted(set(words))
word_to_idx = {token: position for position, token in enumerate(vocab)}
idx_to_word = dict(enumerate(vocab))

# Build (context, target) pairs for next-word prediction:
# each run of `sequence_length` consecutive words predicts the word after it.
sequence_length = 3  # Number of previous words to use
input_sequences = [
    [word_to_idx[token] for token in words[start:start + sequence_length]]
    for start in range(len(words) - sequence_length)
]
output_words = [word_to_idx[token] for token in words[sequence_length:]]

# Index tensors: X is [num pairs, sequence_length], y is [num pairs]
X = torch.tensor(input_sequences, dtype=torch.long)
y = torch.tensor(output_words, dtype=torch.long)

# Define a language model using RNN
class LanguageModel(nn.Module):
    """Next-word predictor: embedding -> single-layer LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct token indices; also the width of the
            output scores.
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: the LSTM consumes and emits [batch, seq, features]
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sequence):
        """Score every vocabulary entry as the next token of each sequence.

        Args:
            sequence: Token indices of shape [batch size, seq length].

        Returns:
            Unnormalised scores of shape [batch size, vocab size].
        """
        # [batch size, seq length, embedding dim]
        token_vectors = self.embedding(sequence)

        # [batch size, seq length, hidden dim]; the final (h, c) state is unused
        per_step_states, _ = self.rnn(token_vectors)

        # Only the last time step summarises the whole context:
        # [batch size, hidden dim]
        final_state = per_step_states[:, -1]

        # [batch size, vocab size]
        return self.fc(final_state)

# Model hyperparameters
vocab_size = len(vocab)
embedding_dim = 50
hidden_dim = 100

model = LanguageModel(vocab_size, embedding_dim, hidden_dim)

# Cross-entropy over the vocabulary scores, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: each epoch is a single gradient step over all (X, y) pairs
num_epochs = 200

for epoch in range(num_epochs):
    # Clear gradients accumulated by the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    outputs = model(X)
    loss = criterion(outputs, y)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    # Report the loss on every 50th epoch
    is_report_epoch = (epoch + 1) % 50 == 0
    if is_report_epoch:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Generate text using the trained model
def generate_text(model, seed_text, word_to_idx, idx_to_word, max_length=20,
                  context_length=None):
    """Greedily extend ``seed_text`` one word at a time using ``model``.

    At every step the last ``context_length`` word indices are fed to the
    model and the highest-scoring word is appended to the result.

    Args:
        model: Callable module mapping a ``[1, seq length]`` long tensor of
            word indices to per-word scores (argmax is taken over dim 1).
            It is switched to eval mode and left in that mode.
        seed_text: Prompt to continue; lowercased and split on whitespace.
        word_to_idx: Mapping from word to integer index.
        idx_to_word: Inverse mapping from integer index to word.
        max_length: Number of words to generate.
        context_length: Number of most recent words fed to the model. When
            ``None`` the module-level ``sequence_length`` (the window used to
            build the training pairs) is used.

    Returns:
        The lowercased seed words followed by the generated words, joined by
        single spaces.

    Raises:
        ValueError: If ``seed_text`` contains no words or ``context_length``
            is smaller than 1.
    """
    if context_length is None:
        # Default to the window size the training pairs were built with.
        context_length = sequence_length
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    words = seed_text.lower().split()
    if not words:
        # An empty window would otherwise fail inside the model with an
        # unrelated-looking error.
        raise ValueError("seed_text must contain at least one word")

    model.eval()

    # Unknown seed words fall back to a random index. The range is sized from
    # the mapping that was passed in (not a module-level vocabulary), and a
    # random number is drawn only when a word is actually unknown.
    # Assumes word_to_idx values are the contiguous range 0..len-1.
    num_known_words = len(word_to_idx)
    current_sequence = [
        word_to_idx[word] if word in word_to_idx
        else random.randrange(num_known_words)
        for word in words[-context_length:]
    ]

    generated_words = list(words)

    # Inference only: one no_grad scope around the whole loop.
    with torch.no_grad():
        for _ in range(max_length):
            sequence_tensor = torch.tensor([current_sequence], dtype=torch.long)
            output = model(sequence_tensor)

            # Greedy decoding: take the highest-scoring word.
            predicted_idx = output.argmax(dim=1).item()
            generated_words.append(idx_to_word[predicted_idx])

            # Slide the window. A seed shorter than context_length grows
            # until the window is full instead of staying short forever.
            current_sequence = (current_sequence + [predicted_idx])[-context_length:]

    return ' '.join(generated_words)

# Seed phrases used to exercise the trained language model
seed_texts = [
    "natural language processing is",
    "the goal of nlp",
    "computers and humans",
]

# Continue each seed with the default generation length and show the
# result next to its prompt.
for seed in seed_texts:
    generated = generate_text(
        model=model,
        seed_text=seed,
        word_to_idx=word_to_idx,
        idx_to_word=idx_to_word,
    )
    print(f"\nSeed: '{seed}'")
    print(f"Generated: '{generated}'")

Epoch [50/200], Loss: 0.0003
Epoch [100/200], Loss: 0.0001
Epoch [150/200], Loss: 0.0001
Epoch [200/200], Loss: 0.0001

Seed: 'natural language processing is'
Generated: 'natural language processing is a field of artificial intelligence. it focuses on the interaction between computers and humans in natural language. the goal of'

Seed: 'the goal of nlp'
Generated: 'the goal of nlp is to enable computers to understand, interpret, and generate human language. rnns are commonly used in nlp for tasks like'

Seed: 'computers and humans'
Generated: 'computers and humans in natural language. the goal of nlp is to enable computers to understand, interpret, and generate human language. rnns are'
