In [1]:
import requests
import nltk
import re
from nltk.corpus import stopwords
import numpy as np

In [2]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [7]:
# Load the new dataset: Alice's Adventures in Wonderland from Project Gutenberg
url = "https://www.gutenberg.org/files/11/11-0.txt"  # Alice's Adventures in Wonderland
response = requests.get(url)
data = response.text

In [8]:
# Save the dataset to a file
with open("alice_dataset.txt", "w", encoding="utf-8") as file:
    file.write(data)

# Load the dataset from file
with open('alice_dataset.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [9]:
# Tokenization
tokens = nltk.word_tokenize(text)

# Lowercasing
tokens = [token.lower() for token in tokens]


In [10]:
# Removing punctuation and special characters
tokens = [re.sub(r'\W+', '', token) for token in tokens if re.sub(r'\W+', '', token)]

# Removing stop words (optional)
stop_words = set(stopwords.words('english'))
tokens = [token for token in tokens if token not in stop_words]

# Add a special token for unknown words
tokens.append('')

In [11]:
# Numericalization
vocab = list(set(tokens))
word2index = {word: i for i, word in enumerate(vocab)}
index2word = {i: word for i, word in enumerate(vocab)}

# Creating sequences
sequence_length = 5
sequences = []
for i in range(len(tokens) - sequence_length):
    sequences.append(tokens[i:i + sequence_length])


In [12]:
# Convert sequences to numerical indices
input_sequences = []
for sequence in sequences:
    input_sequences.append([word2index[word] for word in sequence])

# Convert to numpy array
input_sequences = np.array(input_sequences)

print(f"Total sequences: {len(input_sequences)}")

Total sequences: 12360


In [13]:
import torch
import torch.nn as nn

class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, prev_state):
        x = self.embedding(x)
        x, state = self.lstm(x, prev_state)
        x = self.fc(x)
        return x, state

    def init_state(self, batch_size=1):
        return (torch.zeros(2, batch_size, self.lstm.hidden_size),
                torch.zeros(2, batch_size, self.lstm.hidden_size))

In [14]:
import torch.optim as optim

# Hyperparameters
embedding_dim = 50
hidden_dim = 100
vocab_size = len(vocab)
batch_size = 32
epochs = 15

# Model, loss function, optimizer
model = LanguageModel(vocab_size, embedding_dim, hidden_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, input_sequences, criterion, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for i in range(0, len(input_sequences) - batch_size, batch_size):
            inputs = torch.tensor(input_sequences[i:i + batch_size, :-1], dtype=torch.long)
            targets = torch.tensor(input_sequences[i:i + batch_size, 1:], dtype=torch.long)

            optimizer.zero_grad()
            state_h, state_c = model.init_state(batch_size)
            outputs, _ = model(inputs, (state_h, state_c))
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / (len(input_sequences) // batch_size)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

In [15]:
# Train the model
train_model(model, input_sequences, criterion, optimizer, epochs)

Epoch 1/15, Loss: 7.3981
Epoch 2/15, Loss: 6.8018
Epoch 3/15, Loss: 6.6632
Epoch 4/15, Loss: 6.5446
Epoch 5/15, Loss: 6.3856
Epoch 6/15, Loss: 6.1572
Epoch 7/15, Loss: 5.8600
Epoch 8/15, Loss: 5.5359
Epoch 9/15, Loss: 5.2207
Epoch 10/15, Loss: 4.9215
Epoch 11/15, Loss: 4.6406
Epoch 12/15, Loss: 4.3785
Epoch 13/15, Loss: 4.1347
Epoch 14/15, Loss: 3.9073
Epoch 15/15, Loss: 3.6949


In [16]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'model.pth')
print("Model saved successfully.")

Model saved successfully.


In [17]:
def generate_text(model, start_text, max_length=50):
    model.eval()
    words = start_text.split()
    state_h, state_c = model.init_state(batch_size=1)

    for _ in range(max_length):
        # Use the word2index dictionary with a fallback to  if the word is not found
        x = torch.tensor([[word2index.get(w, word2index['']) for w in words]], dtype=torch.long)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(index2word[word_index])

    return ' '.join(words)

# Test the model with a sample input
start_text = "harry potter is"
generated_text = generate_text(model, start_text)
print("Generated Text:")
print(generated_text)

Generated Text:
harry potter is instantly _have_ directions came frogfootman oh get hands said hatter feel know _was_ said king _began_ guilt dinah heavy soldiers shouted king queen reading finish appeared poor think felt bright eager many powdered day leave left great ii _are_ later mock fast mouths eyes else mock turtle slates remember interrupting


In [18]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[

In [19]:
import streamlit as st

# Define the model class (this should match your trained model's class)
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, prev_state):
        x = self.embedding(x)
        x, state = self.lstm(x, prev_state)
        x = self.fc(x)
        return x, state

    def init_state(self, batch_size=1):
        return (torch.zeros(2, batch_size, self.lstm.hidden_size),
                torch.zeros(2, batch_size, self.lstm.hidden_size))

# Load the trained model
model = LanguageModel(vocab_size=len(vocab), embedding_dim=50, hidden_dim=100)
model.load_state_dict(torch.load('model.pth'))
model.eval()

# Define the text generation function
def generate_text(model, start_text, max_length=50):
    model.eval()
    words = start_text.split()
    state_h, state_c = model.init_state(batch_size=1)

    for _ in range(max_length):
        # Use the word2index dictionary with a fallback to  if the word is not found
        x = torch.tensor([[word2index.get(w, word2index['']) for w in words]], dtype=torch.long)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))
        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(index2word[word_index])

    return ' '.join(words)

# Streamlit app interface
st.title("Text Generation with Language Model")
st.write("Enter a text prompt and the model will generate a continuation of the text.")

# Input box for user to type in a text prompt
user_input = st.text_input("Enter text prompt:", "Harry Potter is")

# Generate text based on user input
if user_input:
    with st.spinner('Generating text...'):
        generated_text = generate_text(model, user_input)
        st.success("Generated Text:")
        st.write(generated_text)



  model.load_state_dict(torch.load('model.pth'))
2025-01-26 18:44:36.895 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-01-26 18:44:36.938 Session state does not function when running a script without `streamlit run`
