In [1]:
import pandas as pd
import numpy as np
import string

import warnings
warnings.filterwarnings("ignore")

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def read_file_to_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = sent_tokenize(text)
    return sentences
all_headlines = read_file_to_sentences("/kaggle/input/corpus2/game_of_thrones.txt")
len(all_headlines)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


24248

In [3]:
all_headlines[:10]

['A Song of Ice and Fire\n\nA Game of Thrones\n\nPROLOGUE\n\nWe should start back, Gared urged as the woods began to grow dark around them.',
 'The wildlings are dead.',
 'Do the dead frighten you?',
 'Ser Waymar Royce asked with just the hint of a smile.',
 'Gared did not rise to the bait.',
 'He was an old man, past fifty, and he had seen the lordlings come and go.',
 'Dead is dead, he said.',
 'We have no business with the dead.',
 'Are they dead?',
 'Royce asked softly.']

In [4]:
import string

def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
    txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt

corpus = [clean_text(x) for x in all_headlines]
corpus[:10]

['a song of ice and fire\n\na game of thrones\n\nprologue\n\nwe should start back gared urged as the woods began to grow dark around them',
 'the wildlings are dead',
 'do the dead frighten you',
 'ser waymar royce asked with just the hint of a smile',
 'gared did not rise to the bait',
 'he was an old man past fifty and he had seen the lordlings come and go',
 'dead is dead he said',
 'we have no business with the dead',
 'are they dead',
 'royce asked softly']

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
token_list = tokenizer.texts_to_sequences(["I am happy to see you here today"])[0]
print(token_list)

check=[]

for i in range(1, len(token_list)):
  n_gram_sequence = token_list[:i+1]
  check.append(n_gram_sequence)

[15, 187, 2292, 3, 77, 10, 83, 831]


In [6]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)

inp_sequences[:10], total_words

([[4, 1031],
  [4, 1031, 5],
  [4, 1031, 5, 553],
  [4, 1031, 5, 553, 2],
  [4, 1031, 5, 553, 2, 256],
  [4, 1031, 5, 553, 2, 256, 4],
  [4, 1031, 5, 553, 2, 256, 4, 1293],
  [4, 1031, 5, 553, 2, 256, 4, 1293, 5],
  [4, 1031, 5, 553, 2, 256, 4, 1293, 5, 2085],
  [4, 1031, 5, 553, 2, 256, 4, 1293, 5, 2085, 7573]],
 12116)

In [7]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
predictors,label,len(label[0]),max_sequence_len

(array([[   0,    0,    0, ...,    0,    0,    4],
        [   0,    0,    0, ...,    0,    4, 1031],
        [   0,    0,    0, ...,    4, 1031,    5],
        ...,
        [   0,    0,    0, ...,   10,   11,    7],
        [   0,    0,    0, ...,   11,    7, 1382],
        [   0,    0,    0, ...,    7, 1382, 1026]], dtype=int32),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32),
 12116,
 131)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np

# Define the dataset
class TextDataset(Dataset):
    def __init__(self, predictors, labels):
        self.predictors = predictors
        self.labels = labels

    def __len__(self):
        return len(self.predictors)

    def __getitem__(self, idx):
        return self.predictors[idx], self.labels[idx]

# Define the model class
class TextGenerationModel(nn.Module):
    def __init__(self, total_words, embedding_dim, rnn_units):
        super(TextGenerationModel, self).__init__()
        self.embedding = nn.Embedding(total_words, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, rnn_units, batch_first=True)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(rnn_units, total_words)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.rnn(x)
        x = self.dropout(x[:, -1, :])  # Only take the output from the last time step
        x = self.fc(x)
        return x

# Function to train the model
def train_model(model, dataset, epochs, batch_size, learning_rate, patience, device):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=patience, min_lr=1e-5)

    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (data, target) in enumerate(dataloader):
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)

            # Flatten the output and target for the loss function
            output = output.view(-1, output.size(-1))
            target = target.view(-1)
            
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        avg_epoch_loss = epoch_loss / len(dataloader)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_epoch_loss:.4f}')
        scheduler.step(avg_epoch_loss)

# Parameters
total_words = 10000  # Example value; use the actual value from your data
embedding_dim = 32
rnn_units = 200
max_sequence_len = 50  # Example value; use the actual value from your data
batch_size = 32
learning_rate = 0.001
epochs = 100
patience = 5

# Example data (replace with actual data)
predictors = np.random.randint(0, total_words, (1000, max_sequence_len-1))  # Dummy data
labels = np.random.randint(0, total_words, 1000)  # Dummy data

# Convert data to tensors
predictors = torch.tensor(predictors, dtype=torch.long)
labels = torch.tensor(labels, dtype=torch.long)

# Check tensor shapes and device
print(f"Predictors shape: {predictors.shape}, Labels shape: {labels.shape}")
print(f"Predictors device: {predictors.device}, Labels device: {labels.device}")

# Prepare dataset
dataset = TextDataset(predictors, labels)

# Initialize and train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextGenerationModel(total_words, embedding_dim, rnn_units).to(device)

# Train the model
train_model(model, dataset, epochs, batch_size, learning_rate, patience, device)


Predictors shape: torch.Size([1000, 49]), Labels shape: torch.Size([1000])
Predictors device: cpu, Labels device: cpu
Epoch 1/100, Loss: 9.2294
Epoch 2/100, Loss: 7.9520
Epoch 3/100, Loss: 7.0736
Epoch 4/100, Loss: 6.5865
Epoch 5/100, Loss: 6.1579
Epoch 6/100, Loss: 5.6549
Epoch 7/100, Loss: 5.0389
Epoch 8/100, Loss: 4.3272
Epoch 9/100, Loss: 3.5775
Epoch 10/100, Loss: 2.8011
Epoch 11/100, Loss: 2.0762
Epoch 12/100, Loss: 1.4474
Epoch 13/100, Loss: 0.9579
Epoch 14/100, Loss: 0.6314
Epoch 15/100, Loss: 0.4311
Epoch 16/100, Loss: 0.3157
Epoch 17/100, Loss: 0.2334
Epoch 18/100, Loss: 0.1845
Epoch 19/100, Loss: 0.1508
Epoch 20/100, Loss: 0.1265
Epoch 21/100, Loss: 0.1079
Epoch 22/100, Loss: 0.0921
Epoch 23/100, Loss: 0.0818
Epoch 24/100, Loss: 0.0743
Epoch 25/100, Loss: 0.0635
Epoch 26/100, Loss: 0.0591
Epoch 27/100, Loss: 0.0538
Epoch 28/100, Loss: 0.0492
Epoch 29/100, Loss: 0.0449
Epoch 30/100, Loss: 0.0409
Epoch 31/100, Loss: 0.0372
Epoch 32/100, Loss: 0.0353
Epoch 33/100, Loss: 0.0325


Lets train our model now

In [None]:
model.fit(predictors, label, epochs=8)

Epoch 1/8
Epoch 2/8

In [17]:
import torch
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def generate_text(seed_text, next_words, model, tokenizer, max_sequence_len, device):
    model.eval()  # Set the model to evaluation mode
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        token_list = torch.tensor(token_list, dtype=torch.long).to(device)
        
        with torch.no_grad():
            predicted_probs = model(token_list)
        
        predicted = torch.argmax(predicted_probs, dim=-1).item()  # Get the index of the highest probability

        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text.title()

# Ensure the model, tokenizer, and device are already defined
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
print(generate_text("Have ", 5, model, tokenizer, max_sequence_len, device))
print(generate_text("Did", 5, model, tokenizer, max_sequence_len, device))
print(generate_text("We", 6, model, tokenizer, max_sequence_len, device))
print(generate_text(" All the bodies", 3, model, tokenizer, max_sequence_len, device))
print(generate_text("Especially", 4, model, tokenizer, max_sequence_len, device))
print(generate_text("The young knight", 6, model, tokenizer, max_sequence_len, device))

Have  Fiery Bends Sneaking Eighth Favored
Did Magister Mornings Humpbacked Stalking Debt
We Glistened Bends Startledand Shields Hallooed Claws
 All The Bodies Favored Outrange Obey
Especially Tale List Tide Impressive
The Young Knight Partner Fashioned Helplessly Lemon Reward Notched


In [29]:
print(generate_text("Nobody will feel", 4, model, tokenizer, max_sequence_len, device))

Nobody Will Feel Lolling Outside Silverpale Leffords
