# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import zipfile
import requests
from io import BytesIO
import re

# Download the text8 archive and read its contents into memory.
# The zip is fetched into RAM; nothing is written to disk.
url = 'http://mattmahoney.net/dc/text8.zip'
# A timeout keeps a stalled connection from hanging the script indefinitely.
response = requests.get(url, timeout=60)
# Fail here with the HTTP status rather than later with an opaque BadZipFile
# when the server answers with an error page instead of the archive.
response.raise_for_status()
with zipfile.ZipFile(BytesIO(response.content)) as z:
    # The first member of the archive is taken to be the corpus file.
    text = z.read(z.namelist()[0]).decode('utf-8')

# Tokenize the text and build the vocabulary.
# Tokens are runs of word characters taken from the lower-cased text.
words = re.findall(r'\b\w+\b', text.lower())
word_counts = Counter(words)
# Most frequent word first; the sort is stable, so words with equal counts
# stay in first-seen order.
vocab = [
    token
    for token, _ in sorted(
        word_counts.items(), key=lambda pair: pair[1], reverse=True
    )
]
# Ids are assigned from 1 upward, so id 0 never maps to a word.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
int_to_vocab = dict(zip(vocab_to_int.values(), vocab_to_int))
encoded = [vocab_to_int[token] for token in words]

# Prepare the dataset: each run of seq_length consecutive word ids is one
# input, and the id immediately after the run is its target.
seq_length = 50
# NOTE: every window is materialized as its own list, so memory use grows
# with seq_length times the length of `encoded`.
sequences = [
    encoded[start:start + seq_length]
    for start in range(len(encoded) - seq_length)
]
# The target of window `start` is encoded[start + seq_length], i.e. the
# encoded ids from position seq_length onward, in order.
targets = encoded[seq_length:]


class TextDataset(Dataset):
    """Dataset pairing each stored sequence with the target at the same index.

    Item ``idx`` is built from ``sequences[idx]`` and ``targets[idx]``; both
    are converted to ``torch.long`` tensors when the item is requested.
    """

    def __init__(self, sequences, targets):
        # Only references are kept; tensor conversion happens per item.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The sample count is the number of stored sequences.
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, target)`` for ``idx`` as ``torch.long`` tensors."""
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        target = torch.tensor(self.targets[idx], dtype=torch.long)
        return sequence, target


# Wrap the windows in a dataset and serve them in shuffled batches of 64.
dataset = TextDataset(sequences=sequences, targets=targets)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

# Define the LSTM model
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear layer over the last time step.

    Args:
        vocab_size: Number of embedding rows and of output scores.
        embed_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        # nn.Module has to be initialised before submodules are assigned.
        # The original called `_init_` (single underscores), which does not
        # exist and raised AttributeError on construction.
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return next-token scores for a batch of id sequences.

        Args:
            x: Integer tensor of token ids with the batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            One row of ``vocab_size`` scores per batch item.
        """
        embedded = self.embedding(x)
        outputs, _ = self.lstm(embedded)
        # Only the final time step's output feeds the prediction layer.
        return self.fc(outputs[:, -1, :])

# Model hyperparameters and instantiation.
vocab_size = 1 + len(vocab_to_int)  # one extra slot for the padding token
embed_size, hidden_size, num_layers = 128, 256, 2
model = LSTMModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

# Training the model
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

# Put the model in training mode explicitly before optimising.
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    num_batches = 0
    # The batch names are distinct from the module-level `targets` list so
    # that list is not rebound to a tensor by this loop.
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than only the final
    # mini-batch's loss; an empty loader yields NaN instead of a NameError.
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    print(f'Epoch {epoch+1}/{epochs}, Loss: {mean_loss}')

# Text generation function
def generate_text(model, start_text, length):
    """Greedily extend ``start_text`` by ``length`` words.

    The prompt is tokenised the same way as the training corpus (lower-cased,
    runs of word characters). At each step the last ``seq_length`` words are
    fed to ``model`` and the highest-scoring word is appended.

    Args:
        model: Callable returning one row of per-id scores for a
            ``(1, n)`` tensor of word ids.
        start_text: Prompt; every word must be in ``vocab_to_int``.
        length: Number of words to generate.

    Returns:
        The prompt words followed by the generated words, space-separated.

    Raises:
        ValueError: If ``start_text`` contains no words.
        KeyError: If a prompt word is not in the vocabulary.
    """
    model.eval()
    # Normalise the prompt like the corpus so capitalised or punctuated
    # prompts resolve to vocabulary entries.
    words = re.findall(r'\b\w+\b', start_text.lower())
    if not words:
        raise ValueError('start_text must contain at least one word')
    unknown = [w for w in words if w not in vocab_to_int]
    if unknown:
        raise KeyError(f'words not in vocabulary: {unknown}')

    with torch.no_grad():
        for _ in range(length):
            window = [vocab_to_int[w] for w in words[-seq_length:]]
            x = torch.tensor([window], dtype=torch.long)
            # The whole window is re-read from a fresh state on every step.
            # The original passed (None, None) as the LSTM state, which
            # nn.LSTM rejects, and carrying state while re-feeding the
            # window would process the same words twice.
            logits = model(x)
            # Word ids start at 1, so id 0 has no entry in int_to_vocab;
            # mask it so argmax cannot select it.
            logits[:, 0] = float('-inf')
            word_id = logits.argmax(dim=1).item()
            words.append(int_to_vocab[word_id])

    return ' '.join(words)

# Generate 100 words of sample text from a fixed prompt and print the result.
start_text = 'anarchism originated as a term of abuse'
generated_text = generate_text(model, start_text, length=100)
print(generated_text)