In [1]:
import numpy as np
import pandas as pd

import torch.nn as nn
import torch
import re
from torchtext.data import Field
import spacy
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda")

print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device()) 
print(torch.cuda.device(0))

  from .autonotebook import tqdm as notebook_tqdm


True
1
0
<torch.cuda.device object at 0x00000274113AD648>


In [2]:
INPUT = "data/quotes.csv"
# INPUT = "/kaggle/input/quotes-500k/quotes.csv"

ds = pd.read_csv(INPUT)

ds.head()

ds = ds.drop(columns=["author", "category"], axis=1)

ds = np.array(ds)
ds = ds.T[0].astype(str)

ds = ds[0:1000]

ds = np.char.lower(ds)

ds = np.array(list(map(lambda x: re.sub("[^a-z0-9\s]+", "", x), ds)))

print(ds.shape)
print(ds[0:3])

(1000,)
['im selfish impatient and a little insecure i make mistakes i am out of control and at times hard to handle but if you cant handle me at my worst then you sure as hell dont deserve me at my best'
 'youve gotta dance like theres nobody watchinglove like youll never be hurtsing like theres nobody listeningand live like its heaven on earth'
 'you know youre in love when you cant fall asleep because reality is finally better than your dreams']


In [3]:
print(ds.shape)

MAX_LENGTH = 150

def length_check(x):
    return len(x) < MAX_LENGTH

ds = np.array(list(filter(length_check, ds)))
print(ds.shape)

(1000,)
(555,)


In [4]:
# fifty_most_common_words = [ "the", "be", "of", "and", "a", "to", "in", "he", "have", "it", "that", "for", "they", "I", "with", "as", "not", "on", "she", "at", "by", "this", "we", "you", "do", "but", "from", "or", "which", "one", "would", "all", "will", "there", "say", "who", "make", "when", "can", "more", "if", "no", "man", "out", "other", "so", "what", "time", "up", "go"]

text = " ".join(ds)
# def remove_fifty_most_common_words_from_text(text):
#     for word in fifty_most_common_words:
#         text = text.replace(word, "")
#     return text

# text = remove_fifty_most_common_words_from_text(text)

In [5]:
max_tokens = 20000
max_len = 400

def letters(input):
    valids = []
    for character in input:
        if character.isalpha():
            valids.append(character)
    return ''.join(valids)


def more_than_once(input):
    if len(input) < 1:
        return False
    return text.count(input) > 6
words = text.split(" ")

words = list(set(words))
vocab = sorted(words)

vocab = list(map(letters, vocab))
vocab = list(filter(lambda x: len(x) > 0, vocab))
vocab = list(set(vocab))
vocab = sorted(vocab)

vocab = list(filter(more_than_once, vocab))

print(len(vocab))
print(vocab[0:100])
print(vocab[-10:])

301
['a', 'able', 'about', 'act', 'after', 'again', 'all', 'always', 'am', 'an', 'and', 'another', 'any', 'anything', 'are', 'arm', 'around', 'art', 'as', 'at', 'away', 'back', 'be', 'beautiful', 'because', 'become', 'been', 'being', 'best', 'bet', 'better', 'between', 'body', 'break', 'but', 'by', 'can', 'cannot', 'cant', 'care', 'cause', 'change', 'choice', 'choose', 'close', 'come', 'comes', 'could', 'dark', 'day', 'deep', 'did', 'die', 'do', 'doe', 'does', 'doesnt', 'dont', 'each', 'ear', 'ears', 'eat', 'else', 'end', 'ends', 'enough', 'even', 'ever', 'every', 'everything', 'eye', 'fall', 'fat', 'feel', 'fight', 'find', 'fire', 'for', 'forever', 'forget', 'friend', 'friends', 'from', 'full', 'get', 'give', 'go', 'god', 'going', 'good', 'great', 'had', 'hang', 'happiness', 'happy', 'has', 'hate', 'have', 'he', 'hear']
['worth', 'would', 'ye', 'yes', 'you', 'youll', 'your', 'youre', 'yours', 'yourself']


In [6]:
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')


def tokenizer_fn(s): 
    s = str(s)
    return [w.text.lower() for w in nlp(s)]

# Define the preprocessing steps for your text fields
tokenizer = Field(sequential=True, tokenize="basic_english", lower=True, batch_first=True, fix_length=max_len)

# quote = ds[0]
# quote = str(quote)

# tokenized = tokenizer.tokenize(quote)

# print(tokenized)

# tokenizer.build_vocab([tokenized])

# print(tokenizer.vocab.freqs.most_common(10))
# print(len(tokenizer.vocab))

# numericalized = tokenizer.process([tokenized])

# print(numericalized)

def tokenize(text):
    text = str(text)
    return tokenizer.tokenize(text)

dataset = list(map(tokenize, ds))

print(dataset[0:3])

tokenizer.build_vocab(dataset, max_size=max_tokens)

print(tokenizer.vocab.freqs.most_common(1000))

numericalized_dataset = tokenizer.process(dataset)

print(numericalized_dataset[0:3])

vocab_size = len(tokenizer.vocab)

print(vocab_size)

[['youve', 'gotta', 'dance', 'like', 'theres', 'nobody', 'watchinglove', 'like', 'youll', 'never', 'be', 'hurtsing', 'like', 'theres', 'nobody', 'listeningand', 'live', 'like', 'its', 'heaven', 'on', 'earth'], ['you', 'know', 'youre', 'in', 'love', 'when', 'you', 'cant', 'fall', 'asleep', 'because', 'reality', 'is', 'finally', 'better', 'than', 'your', 'dreams'], ['a', 'friend', 'is', 'someone', 'who', 'knows', 'all', 'about', 'you', 'and', 'still', 'loves', 'you']]
[('you', 379), ('love', 363), ('the', 356), ('to', 269), ('i', 227), ('is', 226), ('a', 198), ('and', 184), ('of', 178), ('in', 141), ('it', 140), ('that', 118), ('be', 98), ('for', 80), ('not', 78), ('are', 76), ('with', 75), ('my', 71), ('if', 70), ('when', 62), ('me', 62), ('your', 61), ('we', 61), ('but', 61), ('one', 60), ('all', 51), ('can', 51), ('have', 51), ('heart', 49), ('as', 48), ('someone', 46), ('its', 45), ('what', 45), ('like', 43), ('was', 43), ('who', 42), ('never', 41), ('loved', 41), ('there', 40), ('do

In [7]:
def split_input_sequence(x):
    input_text = x[:-1]
    target_text = x[1:]
    return input_text, target_text

# sample = numericalized_dataset[0]

# print(sample)

# print(split_input_sequence(sample))

dataset = list(map(split_input_sequence, numericalized_dataset))

# print(dataset.shape)

print(dataset[0:3])

[(tensor([ 347, 1034,  877,   35,  208,  232,  673,   35,  174,   39,   14, 1087,
          35,  208,  232, 1163,  165,   35,   33,  389,   56,  370,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1, 

In [9]:
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        # Process and return the sample
        return sample

dataset = MyDataset(dataset)

batch_size = 32

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                        generator=torch.Generator(device='cuda'),
                    )

for batch in dataloader:
    print(batch[0].shape)
    print(batch[1].shape)
    break

torch.Size([32, 399])
torch.Size([32, 399])


In [22]:
embedding_dim = 1024
rnn_hidden_dim = 2048

class QGT(nn.Module):
    def __init__(self):
        super(QGT, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.GRU(embedding_dim, rnn_hidden_dim, num_layers=1)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.relu(x)
        x = self.linear(x)
        return x
    

model = QGT()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

criterion = nn.CrossEntropyLoss()

num_epochs = 1000

for epoch in range(num_epochs):
    for i in range(100):
        batch = next(iter(dataloader))
        input_text = batch[0]
        target_text = batch[1]

        output = model(input_text)

        print(output.shape)
        print(target_text.shape)
        loss = criterion(output, target_text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % 100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))


torch.Size([32, 399, 1637])
torch.Size([32, 399])


RuntimeError: Expected target size [32, 1637], got [32, 399]