In [1]:
import re
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset

class TextDataset(Dataset):
    def __init__(self, text, block_size):
        words = re.sub('[^a-zA-Z0-9 \.]', '', text).lower().split()
        self.vocab = sorted(set(words))
        self.word_to_idx = {word: i for i, word in enumerate(self.vocab)}
        self.idx_to_word = {i: word for word, i in self.word_to_idx.items()}
        self.data = [self.word_to_idx[word] for word in words]
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        return (
            torch.tensor(self.data[idx: idx + self.block_size], dtype=torch.long),
            torch.tensor(self.data[idx + self.block_size], dtype=torch.long),
        )

# Example usage:
# dataset = TextDataset(your_text_data, block_size=5)
# loader = DataLoader(dataset, batch_size=32, shuffle=True)


  words = re.sub('[^a-zA-Z0-9 \.]', '', text).lower().split()


In [4]:
import torch.nn as nn
import torch.nn.functional as F

class TextGeneratorMLP(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_size):
        super(TextGeneratorMLP, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.fc1 = nn.Linear(emb_dim * block_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)  # Flatten the embeddings
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Example usage:
# model = TextGeneratorMLP(vocab_size=len(dataset.vocab), emb_dim=32, hidden_size=1024)


In [5]:
def train_model(model, loader, num_epochs=10, learning_rate=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        total_loss = 0
        for X_batch, y_batch in loader:
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(loader):.4f}')

# Example usage:
# train_model(model, loader, num_epochs=10, learning_rate=0.001)


In [6]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def visualize_embeddings(model, dataset, num_words=100):
    embeddings = model.embedding.weight.data[:num_words].numpy()
    tsne = TSNE(n_components=2)
    reduced = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 10))
    for i, label in enumerate(dataset.vocab[:num_words]):
        x, y = reduced[i]
        plt.scatter(x, y)
        plt.annotate(label, (x, y))
    plt.show()

# Example usage:
# visualize_embeddings(model, dataset)


In [7]:
import streamlit as st

# Define the main app function
def main():
    st.title("Next-Word Prediction with MLP")
    
    # User inputs
    block_size = st.slider("Context Length", min_value=1, max_value=10, value=5)
    emb_dim = st.slider("Embedding Dimension", min_value=8, max_value=128, value=32)
    hidden_size = st.slider("Hidden Layer Size", min_value=64, max_value=2048, value=1024)
    seed = st.number_input("Random Seed", min_value=1, max_value=100, value=42)

    # Model and dataset setup with the user's configuration
    dataset = TextDataset(your_text_data, block_size)
    model = TextGeneratorMLP(vocab_size=len(dataset.vocab), emb_dim=emb_dim, hidden_size=hidden_size)
    model.eval()

    # Predict the next word
    user_input = st.text_input("Input Seed Text", "")
    if user_input:
        words = user_input.lower().split()
        input_ids = [dataset.word_to_idx.get(word, 0) for word in words[-block_size:]]
        input_ids = torch.tensor(input_ids).unsqueeze(0)

        with torch.no_grad():
            output = model(input_ids)
            next_word_idx = output.argmax(dim=-1).item()
            next_word = dataset.idx_to_word[next_word_idx]
        
        st.write(f"Predicted Next Word: **{next_word}**")

# Run the Streamlit app
# Run this command in your terminal:
# `streamlit run your_script.py`

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'streamlit'