In [1]:
from google.colab import files

uploaded = files.upload()

Saving test.txt to test.txt
Saving train.txt to train.txt
Saving val.txt to val.txt


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize

In [5]:
!pip install nltk
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
# Read dataset
with open('train.txt', 'r') as file:
    train_sentences = file.readlines()
with open('val.txt', 'r') as file:
    val_sentences = file.readlines()
with open('test.txt', 'r') as file:
    test_sentences = file.readlines()

In [6]:
# Preprocessing
MAX_LENGTH = 20
EOL = "<eol>"

def preprocess_sentences(sentences):
    processed_sentences = []
    for sentence in sentences:
        words = word_tokenize(sentence.lower())  # Tokenize and lowercase
        if len(words) > MAX_LENGTH:
            words = words[:MAX_LENGTH]
        elif len(words) < MAX_LENGTH:
            words += [EOL] * (MAX_LENGTH - len(words))
        processed_sentences.append(words)
    return processed_sentences

train_sentences = preprocess_sentences(train_sentences)
val_sentences = preprocess_sentences(val_sentences)
test_sentences = preprocess_sentences(test_sentences)

In [17]:
# Vocabulary
vocab = set([word for sentence in train_sentences + val_sentences for word in sentence]) # Combine train and val sentences
word_to_index = {word: idx for idx, word in enumerate(vocab)}
index_to_word = {idx: word for word, idx in word_to_index.items()}
vocab_size = len(word_to_index)

In [18]:
# Dataset Class
class NextWordPredictionDataset(Dataset):
    def __init__(self, sentences, word_to_index):
        self.sentences = sentences
        self.word_to_index = word_to_index

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        input_sequence = [self.word_to_index[word] for word in sentence[:-1]]
        target_sequence = [self.word_to_index[word] for word in sentence[1:]]
        return torch.tensor(input_sequence), torch.tensor(target_sequence)

train_dataset = NextWordPredictionDataset(train_sentences, word_to_index)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = NextWordPredictionDataset(val_sentences, word_to_index)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [19]:
# Transformer Model
class TransformerLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, nhead, num_encoder_layers, hidden_dim):
        super(TransformerLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_encoder = nn.Embedding(MAX_LENGTH, embedding_dim)
        self.transformer = nn.Transformer(
            d_model=embedding_dim,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            batch_first=True
        )
        self.fc = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        x = self.embedding(x) + self.pos_encoder(positions)
        x = self.transformer(x, x)
        logits = self.fc(x)
        return logits

In [20]:
# Model Parameters
embedding_dim = 128
hidden_dim = 256
num_encoder_layers = 3
nhead = 8

model = TransformerLanguageModel(vocab_size, embedding_dim, nhead, num_encoder_layers, hidden_dim)

In [21]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [22]:
# Training the Model
def train_model(model, train_loader, val_loader, num_epochs=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            output = model(inputs)

            output = output.view(-1, vocab_size)
            targets = targets.view(-1)

            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}')

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                output = model(inputs)
                output = output.view(-1, vocab_size)
                targets = targets.view(-1)
                val_loss += criterion(output, targets).item()
        print(f'Validation Loss: {val_loss / len(val_loader)}')

In [23]:
# Train the model
train_model(model, train_loader, val_loader)

Epoch 1, Loss: 5.380984558105469
Validation Loss: 5.163858769431947
Epoch 2, Loss: 5.139534201622009
Validation Loss: 5.2098576454889205
Epoch 3, Loss: 5.2361633577346804
Validation Loss: 5.201630811842661
Epoch 4, Loss: 5.266215858459472
Validation Loss: 5.381108813815647
Epoch 5, Loss: 5.275724738121033
Validation Loss: 5.387431984856015
Epoch 6, Loss: 5.2673503704071045
Validation Loss: 5.388023134261843
Epoch 7, Loss: 5.2625789022445675
Validation Loss: 5.39472106146434
Epoch 8, Loss: 5.256971749305725
Validation Loss: 5.4079967226300925
Epoch 9, Loss: 5.251948892593384
Validation Loss: 5.415240325624981
Epoch 10, Loss: 5.247565874099731
Validation Loss: 5.416885300288125


In [26]:
def predict_next_word(model, sentence, word_to_index, index_to_word, max_length=20):
    model.eval()
    words = word_tokenize(sentence.lower())
    while len(words) < max_length:
        input_sequence = [word_to_index.get(word, word_to_index[EOL]) for word in words[-(max_length-1):]]
        # Use CPU instead of GPU
        input_sequence = torch.tensor(input_sequence).unsqueeze(0).cpu() # Changed .cuda() to .cpu()

        with torch.no_grad():
            output = model(input_sequence)
        predicted_word_idx = output.argmax(dim=2)[0, -1].item()
        predicted_word = index_to_word[predicted_word_idx]

        if predicted_word == EOL:
            break

        words.append(predicted_word)

    return ' '.join(words)

In [27]:
# Test the model
partial_sentence = "I drink tea"
predicted_sentence = predict_next_word(model, partial_sentence, word_to_index, index_to_word)
print(predicted_sentence)

i drink tea i i i i i i i i i i i i i i i i i
