In [2]:
import pandas as pd
import numpy as np
import torch

In [3]:
df = pd.read_csv('./data/large_text_dataset.csv')
df.shape

(1, 1)

In [4]:
text = df['text'][0]

In [5]:
sentences = text.split('\n')

In [6]:
import re

In [7]:
processed_sentences = []

for sentence in sentences:
    sentence = sentence.lower()
    sentence = sentence.strip()
    sentence = re.sub(r'[.,!-?]+', '', sentence)
    sentence = sentence.replace('â€”', ' ')
    if len(sentence):
        processed_sentences.append(sentence)

In [8]:
processed_sentences

['call me ishmael some years ago never mind how long precisely having little or no money in my purse',
 'and nothing particular to interest me on shore i thought i would sail about a little and see the watery',
 'part of the world it is a way i have of driving off the spleen and regulating the circulation whenever',
 'i find myself growing grim about the mouth whenever it is a damp drizzly november in my soul whenever',
 'i find myself involuntarily pausing before coffin warehouses and bringing up the rear of every funeral',
 'i meet and especially whenever my hypos get such an upper hand of me that it requires a strong moral',
 'principle to prevent me from deliberately stepping into the street and methodically knocking peoples',
 'hats off then i account it high time to get to sea as soon as i can',
 'this is my substitute for pistol and ball with a philosophical flourish cato throws himself upon his sword',
 'i quietly take to the ship there is nothing surprising in this if they but

In [9]:
import nltk

In [10]:
from nltk import word_tokenize

In [11]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ojas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Ojas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
vocab = {'<unk>': 0}

for sentence in processed_sentences:
    tokens = word_tokenize(sentence)
    for token in tokens:
        if token not in vocab:
            vocab[token] = len(vocab)

In [13]:
vocab

{'<unk>': 0,
 'call': 1,
 'me': 2,
 'ishmael': 3,
 'some': 4,
 'years': 5,
 'ago': 6,
 'never': 7,
 'mind': 8,
 'how': 9,
 'long': 10,
 'precisely': 11,
 'having': 12,
 'little': 13,
 'or': 14,
 'no': 15,
 'money': 16,
 'in': 17,
 'my': 18,
 'purse': 19,
 'and': 20,
 'nothing': 21,
 'particular': 22,
 'to': 23,
 'interest': 24,
 'on': 25,
 'shore': 26,
 'i': 27,
 'thought': 28,
 'would': 29,
 'sail': 30,
 'about': 31,
 'a': 32,
 'see': 33,
 'the': 34,
 'watery': 35,
 'part': 36,
 'of': 37,
 'world': 38,
 'it': 39,
 'is': 40,
 'way': 41,
 'have': 42,
 'driving': 43,
 'off': 44,
 'spleen': 45,
 'regulating': 46,
 'circulation': 47,
 'whenever': 48,
 'find': 49,
 'myself': 50,
 'growing': 51,
 'grim': 52,
 'mouth': 53,
 'damp': 54,
 'drizzly': 55,
 'november': 56,
 'soul': 57,
 'involuntarily': 58,
 'pausing': 59,
 'before': 60,
 'coffin': 61,
 'warehouses': 62,
 'bringing': 63,
 'up': 64,
 'rear': 65,
 'every': 66,
 'funeral': 67,
 'meet': 68,
 'especially': 69,
 'hypos': 70,
 'get': 71,

In [14]:
len(vocab)

346

In [15]:
numeric_sentences = []

for sentence in processed_sentences:
    numeric_sentence = []
    for token in word_tokenize(sentence):
        numeric_sentence.append(vocab[token])
    numeric_sentences.append(numeric_sentence)        

In [16]:
numeric_sentences

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20,
  21,
  22,
  23,
  24,
  2,
  25,
  26,
  27,
  28,
  27,
  29,
  30,
  31,
  32,
  13,
  20,
  33,
  34,
  35],
 [36,
  37,
  34,
  38,
  39,
  40,
  32,
  41,
  27,
  42,
  37,
  43,
  44,
  34,
  45,
  20,
  46,
  34,
  47,
  48],
 [27, 49, 50, 51, 52, 31, 34, 53, 48, 39, 40, 32, 54, 55, 56, 17, 18, 57, 48],
 [27, 49, 50, 58, 59, 60, 61, 62, 20, 63, 64, 34, 65, 37, 66, 67],
 [27,
  68,
  20,
  69,
  48,
  18,
  70,
  71,
  72,
  73,
  74,
  75,
  37,
  2,
  76,
  39,
  77,
  32,
  78,
  79],
 [80, 23, 81, 2, 82, 83, 84, 85, 34, 86, 20, 87, 88, 89],
 [90, 44, 91, 27, 92, 39, 93, 94, 23, 71, 23, 95, 96, 97, 96, 27, 98],
 [99,
  40,
  18,
  100,
  101,
  102,
  20,
  103,
  104,
  32,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112],
 [27,
  113,
  114,
  23,
  34,
  115,
  116,
  40,
  21,
  117,
  17,
  99,
  118,
  119,
  120,
  121,
  39,
  122,
  123,
  124,
  17,
  125],
 [126, 4, 94, 14, 127, 128

In [17]:
sequences = []

for sequence in numeric_sentences:
    for i in range(1, len(sequence) + 1):
        sentence = sequence[:i+1]
        sequences.append(sentence)

In [18]:
sequences

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 2, 3, 4, 5, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21],
 [20, 21, 22],
 [20, 21, 22, 23],
 [20, 21, 22, 23, 24],
 [20, 21, 22, 23, 24, 2],
 [20, 21, 22, 23, 24, 2, 25],
 [20, 21, 22, 23, 24, 2, 25, 26],
 [20, 21, 22, 23, 24, 2, 25, 26, 27],
 [20, 21, 22, 23, 24, 2, 25, 26, 27, 28],
 [20, 21, 22, 23

In [19]:
sequence_sizes = []

for sequence in sequences:
    sequence_sizes.append(len(sequence))
    
sequence_size = np.max(sequence_sizes) 

In [20]:
sequence_size

np.int64(22)

In [21]:
padded_sequences = []

for sequence in sequences:
    padded_sequence = torch.tensor(([0]*(sequence_size - len(sequence))) + sequence, dtype=torch.long)
    padded_sequences.append(padded_sequence)

In [22]:
len(padded_sequences[10])

22

In [23]:
padded_sequences = torch.stack(padded_sequences)

In [24]:
X = padded_sequences[:,:-1]
y = padded_sequences[:,1:].clone()

In [25]:
X.shape,y.shape

(torch.Size([48720, 21]), torch.Size([48720, 21]))

In [26]:
y.shape

torch.Size([48720, 21])

In [27]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [28]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]        

In [29]:
dataset = CustomDataset(X,y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [37]:
class GRU(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,100)
        self.gru = nn.GRU(100, 150, batch_first=True, num_layers=3)
        self.fc = nn.Linear(150, vocab_size)
    
    def forward(self,x):
        embedding = self.embedding(x)
        gru_out, hidden_states = self.gru(embedding)
        output = self.fc(gru_out)
        return output

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [39]:
num_epochs = 50
learning_rate = 0.001

In [44]:
model = GRU(len(vocab))

In [45]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [46]:
model.to(device)

GRU(
  (embedding): Embedding(346, 100)
  (gru): GRU(100, 150, num_layers=3, batch_first=True)
  (fc): Linear(in_features=150, out_features=346, bias=True)
)

In [47]:
for _ in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    epoch_acc = 0.0
    batch_count = 0
    
    for batch_inputs, batch_labels in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(batch_inputs) 
        
        outputs = outputs.reshape(-1, len(vocab))
        batch_labels = batch_labels.reshape(-1).long()

        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        
        
        pred = torch.argmax(outputs, dim=-1)
        correct = (pred == batch_labels).float().sum()
        total = torch.numel(batch_labels)
        acc = (correct / total) 
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    batch_count += 1
    avg_loss = epoch_loss / 1523
    avg_acc = (epoch_acc / 1523) * 100
    
    print(f"Batch: {batch_count}\tAccuracy: {avg_acc}\tLoss: {avg_loss}")       

Batch: 1	Accuracy: 89.77777463037249	Loss: 0.5994778240267101
Batch: 1	Accuracy: 94.4467592301998	Loss: 0.31141375358450674
Batch: 1	Accuracy: 94.46591004993441	Loss: 0.30967894255214823
Batch: 1	Accuracy: 94.47519227382938	Loss: 0.30832177762600127
Batch: 1	Accuracy: 94.47704872173928	Loss: 0.3076009356263274
Batch: 1	Accuracy: 94.47763507802942	Loss: 0.306872864772987
Batch: 1	Accuracy: 94.48848057951649	Loss: 0.30634963939006754
Batch: 1	Accuracy: 94.45056982206343	Loss: 0.307037413609474
Batch: 1	Accuracy: 94.4825204555251	Loss: 0.30551572176319003
Batch: 1	Accuracy: 94.48173872280184	Loss: 0.30493734680111284
Batch: 1	Accuracy: 94.47450832716353	Loss: 0.3046096080396806
Batch: 1	Accuracy: 94.48437687995319	Loss: 0.30394108680542253
Batch: 1	Accuracy: 94.48408377225354	Loss: 0.3045430779652774
Batch: 1	Accuracy: 94.48681959805329	Loss: 0.30399706439211116
Batch: 1	Accuracy: 94.48926224570798	Loss: 0.30329902361508504
Batch: 1	Accuracy: 94.48701495491825	Loss: 0.30388406594176864
Ba