In [1]:
!pip install notebook
!pip install torch torchvision torchaudio torchtext

Collecting jedi>=0.16 (from ipython>=5.0.0->ipykernel->notebook)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.19.1
Collecting torchtext
  Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp310-cp310-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m971.3 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [3]:
# Colab-only step: `files.upload()` is an interactive call, so this cell cannot
# run unattended or outside Google Colab.
# NOTE(review): presumably this saves the chosen file into the working
# directory (the cell output reports "Saving Training Data.txt") -- confirm.
from google.colab import files
uploaded = files.upload()

Saving Training Data.txt to Training Data.txt


In [4]:
import io
# Read the whole corpus into memory as a single UTF-8 decoded string.
with io.open('Training Data.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Print a short preview so the loaded content can be sanity-checked by eye.
preview = text[:500]
print("First 500 characters of the dataset:")
print(preview)

First 500 characters of the dataset:
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus condimentum sagittis lacus, laoreet luctus ligula laoreet ut. Vestibulum ullamcorper accumsan velit vel vehicula. Proin tempor lacus arcu. Nunc at elit condimentum, semper nisi et, condimentum mi. In venenatis blandit nibh at sollicitudin. Vestibulum dapibus mauris at orci maximus pellentesque. Nullam id elementum ipsum. Suspendisse cursus lobortis viverra. Proin et erat at mauris tincidunt porttitor vitae ac dui.

Donec vulputate 


In [5]:
import re
import numpy as np
from collections import Counter

# Cleaning and tokenizing the text
def clean_text(text):
    """Normalise raw text into a flat list of lowercase word tokens.

    The text is lowercased, every character that is neither ``a``-``z`` nor
    whitespace is deleted (not replaced by a space, so ``"co-op"`` becomes
    ``"coop"``), and the remainder is split on whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of word strings; empty when nothing survives the filtering.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.split()

# Tokenise the full corpus once; `words` is the flat, ordered word list that
# the sequence building and vocabulary steps below both read from.
words = clean_text(text)
print(f"Total words in the dataset: {len(words)}")

# Create N-gram sequences (N previous words, 1 target word)
def create_sequences(words, N=5):
    """Slide a window of ``N + 1`` consecutive words across ``words``.

    Every window holds ``N`` context words followed by the word that comes
    immediately after them (the prediction target). Consecutive windows
    overlap: each one starts one word later than the previous one.

    Args:
        words: Ordered sequence of tokens.
        N: Number of context words per window.

    Returns:
        A list of slices of ``words``, one per end position; empty when
        ``words`` has no more than ``N`` items.
    """
    window = N + 1
    # `end` is the exclusive end index of each window (target word included).
    return [words[end - window:end] for end in range(window, len(words) + 1)]

# Each sequence produced below holds N context words plus 1 target word,
# i.e. N + 1 items per row.
N = 5  # number of context words fed to the model per example
sequences = create_sequences(words, N)
print(f"Total sequences: {len(sequences)}")

# Build the vocabulary: every distinct word gets a positive integer id in
# first-seen order. Id 0 is deliberately left unassigned (reserved for padding).
vocab = Counter(words)
word_to_idx = {word: position for position, word in enumerate(vocab, start=1)}
# Reverse lookup, used to turn predicted ids back into words.
idx_to_word = dict((position, word) for word, position in word_to_idx.items())

def encode_sequences(sequences, word_to_idx):
    """Translate word windows into a NumPy array of integer ids.

    Args:
        sequences: Iterable of word sequences.
        word_to_idx: Mapping from word to integer id.

    Returns:
        ``np.array`` built from the per-sequence id lists (one row per
        sequence when all sequences have the same length).

    Raises:
        KeyError: If a word is missing from ``word_to_idx``.
    """
    id_rows = [[word_to_idx[token] for token in window] for window in sequences]
    return np.array(id_rows)

# Integer-encode every window; each row is N context ids followed by the
# target id (the last column is split off later by TextDataset).
encoded_sequences = encode_sequences(sequences, word_to_idx)
print(f"Encoded sequences shape: {encoded_sequences.shape}")

Total words in the dataset: 75003
Total sequences: 74998
Encoded sequences shape: (74998, 6)


In [6]:
from sklearn.model_selection import train_test_split

# Split the data into training (70%), validation (20%), and test (10%)
# Done in two cuts: 30% is held out first, then test_size=0.33 of that
# held-out part becomes the test set (about 10% of all rows) and the rest is
# validation (about 20%). random_state pins both cuts for reproducibility.
# NOTE(review): the rows are overlapping word windows and train_test_split
# shuffles by default, so validation/test windows can share most of their words
# with training windows -- confirm this leakage is acceptable for the reported
# accuracy.
train_data, temp_data = train_test_split(encoded_sequences, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.33, random_state=42)

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class
class TextDataset(Dataset):
    """Dataset over a 2-D array of encoded word windows.

    Each row of ``data`` is split into its leading columns (the model input)
    and its final column (the target id). Both parts are stored as
    ``torch.long`` tensors.
    """

    def __init__(self, data):
        context_columns = data[:, :-1]  # everything except the last column
        target_column = data[:, -1]     # the last column only
        self.data = torch.tensor(context_columns, dtype=torch.long)
        self.targets = torch.tensor(target_column, dtype=torch.long)

    def __len__(self):
        # Number of rows (examples).
        return self.data.shape[0]

    def __getitem__(self, idx):
        sample = self.data[idx]
        label = self.targets[idx]
        return sample, label

# Dataloaders: one per split, all with the same batch size. Only the training
# split is reshuffled; validation and test are iterated in a fixed order.
batch_size = 64
train_loader, val_loader, test_loader = (
    DataLoader(TextDataset(split), batch_size=batch_size, shuffle=reshuffle)
    for split, reshuffle in ((train_data, True), (val_data, False), (test_data, False))
)

In [8]:
import torch.nn as nn

class LSTMLanguageModel(nn.Module):
    """Next-word model: embedding -> single LSTM -> linear output layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output scores produced per example.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Score the next word for a batch of id sequences.

        ``x`` is indexed batch-first. Only the LSTM output at the final time
        step is passed to the linear layer, so the result has one row of
        ``output_dim`` scores per input sequence.
        """
        token_vectors = self.embedding(x)
        hidden_states, _final_state = self.lstm(token_vectors)
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Hyperparameters
# These are module-level names: train_model() reads `vocab_size`, and the
# experiment cells further down rebind `embedding_dim` / `hidden_dim`.
vocab_size = len(word_to_idx) + 1  # word ids start at 1, so +1 makes room for the reserved id 0
embedding_dim = 128
hidden_dim = 256
output_dim = vocab_size  # one output score per vocabulary id

# Instantiate the model
model = LSTMLanguageModel(vocab_size, embedding_dim, hidden_dim, output_dim)

In [9]:
import torch.optim as optim

# Loss function and optimizer
# Both are module-level objects that train() reads as globals. The optimizer is
# bound to the parameters of the `model` instance that exists when this cell
# runs, so it must be re-created whenever `model` is re-instantiated.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train(model, train_loader, val_loader, epochs=10):
    """Train ``model`` and report losses after every epoch.

    Reads the module-level ``criterion`` and ``optimizer`` rather than taking
    them as arguments. After each pass over ``train_loader`` the model is
    switched to eval mode, scored on ``val_loader`` without gradients, and
    switched back to train mode. The printed values are the summed batch
    losses divided by the number of batches in each loader.

    Args:
        model: Module to optimise; left in train mode on return.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``.
    """

    def summed_validation_loss():
        # Gradient-free pass over the validation set; restores train mode after.
        model.eval()
        with torch.no_grad():
            total = sum(
                (criterion(model(val_inputs), val_targets).item()
                 for val_inputs, val_targets in val_loader),
                0.0,
            )
        model.train()
        return total

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        val_loss = summed_validation_loss()

        print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {running_loss/len(train_loader)}, Validation Loss: {val_loss/len(val_loader)}")

# Train the model
# Runs the baseline configuration; per-epoch training and validation losses
# are printed by train().
train(model, train_loader, val_loader, epochs=10)

Epoch [1/10], Training Loss: 4.0209695134180325, Validation Loss: 3.0515655046802457
Epoch [2/10], Training Loss: 1.7995070269133955, Validation Loss: 1.090729338385291
Epoch [3/10], Training Loss: 0.4263016787084244, Validation Loss: 0.2389125434397641
Epoch [4/10], Training Loss: 0.10154119737437232, Validation Loss: 0.07832261684776867
Epoch [5/10], Training Loss: 0.047261647682695966, Validation Loss: 0.042495823125907425
Epoch [6/10], Training Loss: 0.031655981821369895, Validation Loss: 0.031728573431560804
Epoch [7/10], Training Loss: 0.025050541954330565, Validation Loss: 0.025902221587060365
Epoch [8/10], Training Loss: 0.0216142586730171, Validation Loss: 0.021704404411632253
Epoch [9/10], Training Loss: 0.019574140663622865, Validation Loss: 0.02092929833797517
Epoch [10/10], Training Loss: 0.017930781897684916, Validation Loss: 0.01921191047857342


In [10]:
def evaluate(model, test_loader):
    """Print the top-1 accuracy of ``model`` over ``test_loader``.

    The model is put into eval mode (and left there) and scored without
    gradient tracking. A prediction counts as correct when the index of the
    highest score equals the target. Nothing is returned; the accuracy is
    printed as a percentage with two decimals.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        test_loader: Iterable of ``(inputs, targets)`` batches.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            predicted = torch.max(model(inputs), 1).indices
            hits += (predicted == targets).sum().item()
            seen += targets.size(0)
    accuracy = 100 * hits / seen
    print(f"Test Accuracy: {accuracy:.2f}%")

# Run evaluation
# Scores the model trained above on the held-out test split and prints the
# accuracy.
evaluate(model, test_loader)

Test Accuracy: 99.30%


In [11]:
def train_model(embedding_dim, hidden_dim, learning_rate, batch_size, epochs):
    """Build a fresh ``LSTMLanguageModel``, train it, and return it.

    Unlike ``train``, everything needed is created locally: the model, the
    loss, an Adam optimizer with ``learning_rate``, and train/validation
    loaders with ``batch_size``. The data itself comes from the module-level
    ``train_data`` / ``val_data`` and the vocabulary size from ``vocab_size``.
    After each epoch the summed batch losses divided by the number of batches
    are printed for both splits.

    Args:
        embedding_dim: Embedding size passed to the model.
        hidden_dim: LSTM hidden size passed to the model.
        learning_rate: Learning rate for Adam.
        batch_size: Batch size for the train and validation loaders.
        epochs: Number of passes over the training data.

    Returns:
        The trained model (in eval mode whenever at least one epoch ran).
    """
    # Construction order is kept model -> loss -> optimizer -> loaders.
    model = LSTMLanguageModel(vocab_size, embedding_dim, hidden_dim, vocab_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_loader = DataLoader(TextDataset(train_data), batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TextDataset(val_data), batch_size=batch_size)

    def training_pass():
        # One optimisation pass over the training set; returns the summed loss.
        model.train()
        total = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            total += batch_loss.item()
        return total

    def validation_pass():
        # Gradient-free pass over the validation set; returns the summed loss.
        model.eval()
        total = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                total += criterion(model(inputs), targets).item()
        return total

    for epoch in range(epochs):
        running_loss = training_pass()
        val_loss = validation_pass()

        print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {running_loss/len(train_loader)}, Validation Loss: {val_loss/len(val_loader)}")

    return model

# Function to evaluate accuracy
def evaluate_model(model, test_loader):
    """Return the top-1 accuracy of ``model`` over ``test_loader`` in percent.

    The model is switched to eval mode first, so the result does not depend
    on the mode the caller left it in (mode-dependent layers such as dropout
    would otherwise distort the score). The model is left in eval mode,
    matching ``evaluate``. Scoring runs without gradient tracking.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        test_loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        Accuracy as a float in ``[0, 100]``; ``0.0`` when the loader yields no
        examples (instead of dividing by zero).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            # Index of the highest score along the class dimension.
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    if total == 0:
        # Nothing to score: report 0.0 rather than raising ZeroDivisionError.
        return 0.0
    return 100 * correct / total

In [None]:
# Learning-rate sweep: train one fresh model per rate with every other setting
# held fixed, then report its accuracy.
# The assignments below rebind module-level names set in earlier cells
# (`batch_size`, `embedding_dim`, `hidden_dim`), and the loop rebinds `model`,
# so after this cell `model` is the network trained with the last rate.
learning_rates = [0.01, 0.001, 0.0001]
batch_size = 64
embedding_dim = 128
hidden_dim = 256
epochs = 5

for lr in learning_rates:
    print(f"\nExperimenting with Learning Rate: {lr}")
    model = train_model(embedding_dim, hidden_dim, lr, batch_size, epochs)
    # NOTE(review): configurations are compared on the test split; if the best
    # one is then selected, the test score is no longer an unbiased estimate --
    # consider comparing on the validation split instead.
    test_accuracy = evaluate_model(model, test_loader)
    print(f"Test Accuracy with Learning Rate {lr}: {test_accuracy}%")


Experimenting with Learning Rate: 0.01
Epoch [1/5], Training Loss: 2.892352076598991, Validation Loss: 1.6303590000686
Epoch [2/5], Training Loss: 1.1250609138415588, Validation Loss: 1.0443021276239621
Epoch [3/5], Training Loss: 0.8843255243042936, Validation Loss: 0.9138378072845734
Epoch [4/5], Training Loss: 0.8282726956270499, Validation Loss: 0.940524202913551
Epoch [5/5], Training Loss: 0.8215757545251648, Validation Loss: 0.8666604156211272
Test Accuracy with Learning Rate 0.01: 78.1952861952862%

Experimenting with Learning Rate: 0.001
Epoch [1/5], Training Loss: 4.01746449458904, Validation Loss: 3.042187265420364
Epoch [2/5], Training Loss: 1.7847572594844758, Validation Loss: 1.080606292364961
Epoch [3/5], Training Loss: 0.41981839229796314, Validation Loss: 0.23838399293816695
Epoch [4/5], Training Loss: 0.1017581250532173, Validation Loss: 0.07397659454431574
Epoch [5/5], Training Loss: 0.04677816220852983, Validation Loss: 0.04174769230155369
Test Accuracy with Learnin

In [14]:
# Batch-size sweep: train one fresh model per batch size with every other
# setting held fixed, then report its accuracy.
# `batch` only affects the train/validation loaders built inside train_model();
# `test_loader` is the module-level loader created earlier and is not rebuilt.
# The loop rebinds `model`, so later cells see the network trained with the
# last batch size.
batch_sizes = [32, 64, 128]
learning_rate = 0.001
embedding_dim = 128
hidden_dim = 256
epochs = 5

for batch in batch_sizes:
    print(f"\nExperimenting with Batch Size: {batch}")
    model = train_model(embedding_dim, hidden_dim, learning_rate, batch, epochs)
    # NOTE(review): as in the learning-rate sweep, configurations are compared
    # on the test split -- confirm that is intended.
    test_accuracy = evaluate_model(model, test_loader)
    print(f"Test Accuracy with Batch Size {batch}: {test_accuracy}%")


Experimenting with Batch Size: 32
Epoch [1/5], Training Loss: 3.7076539138989215, Validation Loss: 2.3700188385733103
Epoch [2/5], Training Loss: 1.0469843016336924, Validation Loss: 0.412513990839154
Epoch [3/5], Training Loss: 0.13901504186719274, Validation Loss: 0.0717561066316409
Epoch [4/5], Training Loss: 0.041682738731322964, Validation Loss: 0.035863324547959964
Epoch [5/5], Training Loss: 0.027377910108125604, Validation Loss: 0.025622359762102427
Test Accuracy with Batch Size 32: 99.42087542087542%

Experimenting with Batch Size: 64
Epoch [1/5], Training Loss: 4.032072605953147, Validation Loss: 3.0568531913272405
Epoch [2/5], Training Loss: 1.8025340218636934, Validation Loss: 1.0900447396403652
Epoch [3/5], Training Loss: 0.4264572784576869, Validation Loss: 0.23876656149908648
Epoch [4/5], Training Loss: 0.10222241523404504, Validation Loss: 0.07280017310848176
Epoch [5/5], Training Loss: 0.04691633337089989, Validation Loss: 0.042840370379577754
Test Accuracy with Batch

In [18]:
def predict_next_word(model, seed_text, word_to_idx, idx_to_word, max_len=5, context_size=None):
    """Greedily extend ``seed_text`` by ``max_len`` predicted words.

    At every step the current words are encoded, fed to ``model``, and the
    highest-scoring word is appended. Despite the name, this generates
    ``max_len`` words, not just one.

    Args:
        model: Module mapping a ``(1, seq_len)`` tensor of word ids to a
            ``(1, vocab)`` tensor of scores. It is put into eval mode.
        seed_text: Whitespace-separated starting words.
        word_to_idx: Mapping from word to id. Words are looked up exactly
            first, then lowercased; words still unknown are encoded as 0.
        idx_to_word: Mapping from id back to word.
        max_len: Number of words to generate.
        context_size: When given, only the last ``context_size`` words are fed
            to the model at each step, e.g. the window length used in
            training. ``None`` (default) feeds every word so far.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If words are requested but ``seed_text`` contains none, or
            if ``context_size`` is given and is less than 1.
    """
    unknown_idx = 0  # id used for out-of-vocabulary seed words

    if context_size is not None and context_size < 1:
        raise ValueError("context_size must be a positive integer or None")

    model.eval()
    words = seed_text.split()
    if max_len > 0 and not words:
        # A zero-length sequence gives the model nothing to condition on.
        raise ValueError("seed_text must contain at least one word")

    def lookup(word):
        # Exact match first, so any case-sensitive vocabulary keeps working.
        if word in word_to_idx:
            return word_to_idx[word]
        return word_to_idx.get(word.lower(), unknown_idx)

    with torch.no_grad():
        for _ in range(max_len):
            context = words if context_size is None else words[-context_size:]
            encoded = torch.tensor([[lookup(w) for w in context]], dtype=torch.long)
            logits = model(encoded)[0].clone()
            if unknown_idx not in idx_to_word:
                # Id 0 has no word attached; never let it win the argmax,
                # otherwise the idx_to_word lookup below raises KeyError.
                logits[unknown_idx] = float('-inf')
            predicted_idx = torch.argmax(logits).item()
            words.append(idx_to_word[predicted_idx])
    return ' '.join(words)
# Demo: extend a seed phrase using whichever network `model` currently refers
# to (the experiment cells above rebind `model`).
seed_text = "just like that is"
print(predict_next_word(model, seed_text, word_to_idx, idx_to_word))

just like that is placerat fermentum metus erat a
