In [None]:
from google.colab import files

# Open Colab's upload widget and keep whatever it returns in `uploaded`.
# NOTE(review): the recorded output shows each file being saved with a " (1)"
# suffix (e.g. "train (1).txt"), which suggests same-named files were already
# present. The later open('train.txt') calls would then read the older copies
# rather than this upload -- confirm those are the intended data.
uploaded = files.upload()

Saving test.txt to test (1).txt
Saving train.txt to train (1).txt
Saving val.txt to val (1).txt


In [29]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.optim as optim
from google.colab import files

In [30]:
# Read the dataset
def read_lines(path):
    """Return every line of the text file at `path` as a list of strings.

    Trailing newline characters are kept, exactly as `file.readlines()`
    returns them.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to a
    # platform/locale-dependent default, so the same file could decode
    # differently (or fail) on another machine.
    with open(path, 'r', encoding='utf-8') as file:
        return file.readlines()


train_sentences = read_lines('train.txt')
val_sentences = read_lines('val.txt')
test_sentences = read_lines('test.txt')

In [31]:
# Preprocessing: Tokenizing, clipping sentences, and adding EOL padding
# Every processed sentence is clipped or padded to exactly this many tokens.
MAX_LENGTH = 20
# Padding token appended to short sentences; the generation code also uses it
# as the stop signal and as the fallback for words missing from the vocabulary.
EOL = "<eol>"

def preprocess_sentences(sentences, max_length=None, pad_token=None):
    """Tokenize sentences and force each one to a fixed number of tokens.

    Each sentence is lowercased and split with `word_tokenize`, then clipped
    to `max_length` tokens or right-padded with `pad_token` up to that length.

    Args:
        sentences: iterable of raw sentence strings.
        max_length: target token count per sentence. Defaults to the
            module-level `MAX_LENGTH` (looked up at call time).
        pad_token: token used for right-padding. Defaults to the module-level
            `EOL` (looked up at call time).

    Returns:
        A list with one list of `max_length` tokens per input sentence.

    Raises:
        ValueError: if `max_length` is negative.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if pad_token is None:
        pad_token = EOL
    if max_length < 0:
        # A negative bound would make the slice below clip from the end instead.
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    processed_sentences = []
    for sentence in sentences:
        # Tokenize and lowercase, keeping at most `max_length` tokens.
        tokens = word_tokenize(sentence.lower())[:max_length]
        # Right-pad; the multiplier is 0 when the sentence is already full length.
        tokens.extend([pad_token] * (max_length - len(tokens)))
        processed_sentences.append(tokens)
    return processed_sentences

# Tokenize / clip / pad all three splits in one pass.
# Note: this rebinds the raw line lists to token lists, so re-running this cell
# without first re-running the file-reading cell fails (`sentence.lower()` is
# then called on a list).
train_sentences, val_sentences, test_sentences = (
    preprocess_sentences(split)
    for split in (train_sentences, val_sentences, test_sentences)
)

In [32]:
# Prepare vocabulary
# Combine all sentences from training, validation, and test sets
# NOTE(review): building the vocabulary from validation/test tokens exposes
# held-out words to the model; kept as-is because there is no unknown-word
# token to map unseen words to.
all_sentences = train_sentences + val_sentences + test_sentences
vocab = {word for sentence in all_sentences for word in sentence}
# Generation looks the padding token up by name, so guarantee it has an index
# even if no sentence happened to be short enough to get padded.
vocab.add(EOL)

# Number the words in sorted order. Iterating the set directly gives an order
# that changes between interpreter sessions (string hashes are randomized),
# which would make saved weights or indices unusable after a kernel restart.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

In [33]:
# Create a dataset class
class NextWordPredictionDataset(Dataset):
    """Dataset of (input, target) index sequences for next-word prediction.

    For a tokenized sentence `w0 .. wn`, the input is the indices of
    `w0 .. w(n-1)` and the target is the indices of `w1 .. wn`, i.e. the same
    sentence shifted left by one token.

    Args:
        sentences: sequence of tokenized sentences (each a list of words).
        word_to_index: mapping from word to integer index. Every word in
            `sentences` must be present; a missing word raises `KeyError`
            when the item is fetched.
    """

    def __init__(self, sentences, word_to_index):
        self.sentences = sentences
        self.word_to_index = word_to_index

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the `(input, target)` pair of int64 tensors for sentence `idx`."""
        indices = [self.word_to_index[word] for word in self.sentences[idx]]
        # Force int64: for a one-token sentence both slices are empty, and
        # torch.tensor([]) would otherwise infer float32, which nn.Embedding
        # rejects as an index tensor.
        input_sequence = torch.tensor(indices[:-1], dtype=torch.long)
        target_sequence = torch.tensor(indices[1:], dtype=torch.long)
        return input_sequence, target_sequence

# Wrap each split in a dataset that shares the same word -> index mapping.
train_dataset, val_dataset, test_dataset = (
    NextWordPredictionDataset(split, word_to_index)
    for split in (train_sentences, val_sentences, test_sentences)
)

# Only the training batches are shuffled; validation and test keep file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [34]:
# Building the LSTM model
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: number of embedding rows and of output logits per position.
        embedding_dim: size of each word embedding.
        hidden_dim: LSTM hidden-state size.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,  # tensors are laid out batch-first
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return one logit vector of size `vocab_size` per input position."""
        embedded = self.embedding(x)
        # The LSTM also returns its final (hidden, cell) state; only the
        # per-step outputs are needed here.
        per_step_hidden, _state = self.lstm(embedded)
        return self.fc(per_step_hidden)

In [35]:
# Hyperparameters
embedding_dim, hidden_dim, num_layers = 100, 128, 2
vocab_size = len(word_to_index)  # one output class per vocabulary entry

# Build the network from the hyperparameters above
model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)

# Training objective and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [36]:
# Training the model
def _batch_loss(model, inputs, targets, device):
    """Return the loss of `model` on one batch, flattened over all positions."""
    inputs, targets = inputs.to(device), targets.to(device)
    output = model(inputs)
    # Take the class count from the logits themselves rather than from a
    # global, so the helper works for any vocabulary size.
    output = output.reshape(-1, output.size(-1))
    return criterion(output, targets.reshape(-1))


def train_model(model, train_loader, val_loader, num_epochs=10):
    """Train `model` and print the mean train/validation loss after each epoch.

    Uses the module-level `criterion` and `optimizer`. Batches are moved to
    whatever device the model's parameters are on, so the same code runs on
    CPU or GPU without edits.

    Args:
        model: network mapping an index tensor to per-position logits.
        train_loader: iterable of `(inputs, targets)` training batches.
        val_loader: iterable of `(inputs, targets)` validation batches.
        num_epochs: number of full passes over `train_loader`.

    Returns:
        None. The model is left in eval mode after the final validation pass.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            loss = _batch_loss(model, inputs, targets, device)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # Print epoch loss (max(..., 1) avoids dividing by zero on an empty loader)
        print(f'Epoch {epoch + 1}, Loss: {total_loss / max(len(train_loader), 1)}')

        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                val_loss += _batch_loss(model, inputs, targets, device).item()
        print(f'Validation Loss: {val_loss / max(len(val_loader), 1)}')

In [26]:
# Start training
# NOTE(review): this cell's recorded execution count (26) is lower than that of
# the cells that define and instantiate the model (34-35), so the losses logged
# below come from an earlier run. If `model` was re-created afterwards without
# re-running this cell, later cells are using untrained weights -- confirm by
# restarting the kernel and running the notebook top to bottom.
train_model(model, train_loader, val_loader)

Epoch 1, Loss: 5.136407536506653
Validation Loss: 4.634415581112816
Epoch 2, Loss: 4.4130806627273556
Validation Loss: 4.327412760446942
Epoch 3, Loss: 4.156963615894318
Validation Loss: 4.199228025618053
Epoch 4, Loss: 4.011617884635926
Validation Loss: 4.133322628717574
Epoch 5, Loss: 3.905031321525574
Validation Loss: 4.095317378876701
Epoch 6, Loss: 3.8162808027267454
Validation Loss: 4.073888983045306
Epoch 7, Loss: 3.7375731387138367
Validation Loss: 4.063295814726088
Epoch 8, Loss: 3.6646124806404115
Validation Loss: 4.069407330618964
Epoch 9, Loss: 3.595582187652588
Validation Loss: 4.065436196705652
Epoch 10, Loss: 3.527846902370453
Validation Loss: 4.075001311680627


In [37]:
# Predicting next word in the sentence
def predict_next_word(model, sentence, word_to_index, index_to_word, max_length=20):
    """Greedily extend `sentence` one word at a time using `model`.

    The prompt is lowercased and tokenized, then the most likely next word is
    appended repeatedly until the sentence has `max_length` tokens or the
    model predicts the `EOL` token.

    Args:
        model: network mapping a `(1, seq_len)` index tensor to logits of
            shape `(1, seq_len, vocab_size)`.
        sentence: prompt text.
        word_to_index: word -> index mapping; must contain `EOL`, which is
            also used as the index for words not in the mapping.
        index_to_word: inverse mapping, index -> word.
        max_length: maximum number of tokens in the returned sentence.

    Returns:
        The prompt tokens plus the generated words, joined by single spaces.
        An empty prompt yields an empty string.
    """
    model.eval()
    words = word_tokenize(sentence.lower())
    if not words:
        # Nothing to condition on: a zero-length sequence makes the model raise.
        return ''

    eol_index = word_to_index[EOL]
    # Run on whatever device the model lives on instead of assuming CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        while len(words) < max_length:
            # Feed at most the last (max_length - 1) tokens as context.
            context = words[-(max_length - 1):]
            input_sequence = [word_to_index.get(word, eol_index) for word in context]
            input_sequence = torch.tensor(
                input_sequence, dtype=torch.long, device=device
            ).unsqueeze(0)

            output = model(input_sequence)

            # Only the logits at the final position decide the next word.
            predicted_word_idx = output[0, -1].argmax(dim=-1).item()
            predicted_word = index_to_word[predicted_word_idx]

            if predicted_word == EOL:
                break

            words.append(predicted_word)

    return ' '.join(words)

In [38]:
# Complete a short prompt with the model and show the result
partial_sentence = "I drink tea"
predicted_sentence = predict_next_word(
    model=model,
    sentence=partial_sentence,
    word_to_index=word_to_index,
    index_to_word=index_to_word,
)
print(predicted_sentence)

i drink tea barely entertainments contributions contributions catholic unhealthy cu cu arguments pollicino pollicino zombie zombie hurtled gripped quest haggle
