In [1]:
import numpy as np
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch import optim

### Let us check if the kernel is using GPU

In [2]:
# Detect CUDA once; later cells read this flag to decide whether tensors
# and the model are moved to the GPU.
train_on_gpu = torch.cuda.is_available()

print("GPU is available and will be used" if train_on_gpu else "CPU is available and will be used")


GPU is available and will be used


**Read File**
* This code opens the file in text mode and reads its entire contents into a single string.

In [3]:
# Read the whole lyrics corpus into one string. The context manager closes
# the file handle as soon as the read finishes (the name `file` stays bound
# to the now-closed handle, as before).
with open("../input/poetry/Kanye_West.txt") as file:
    data = file.read()

In [4]:
def get_stats(data):
    """Compute simple line/word statistics for a block of text.

    The text is split on newline characters, and every resulting piece counts
    as a line (empty ones included). Words are whitespace-separated tokens.

    Args:
        data (str): Raw text.

    Returns:
        tuple: (average number of words per line, number of lines).
    """
    lines = data.split("\n")
    total_words = sum(len(line.split()) for line in lines)
    return total_words / len(lines), len(lines)

def print_stats(average_per_line, line_size):
    """Print the two statistics produced by `get_stats`, one per line.

    Args:
        average_per_line: Average number of words per line.
        line_size: Number of lines.
    """
    print(f"The average words per line is: {average_per_line!s}")
    print(f"The amount of lines there are: {line_size!s}")
    
# Compute the corpus statistics once and print them.
average_per_line, line_size = get_stats(data)
print_stats(average_per_line, line_size)

The average words per line is: 8.298562893589537
The amount of lines there are: 6193


In [5]:
def preprocess_text(text):
    """Normalise raw text into a list of lower-case word tokens.

    Every character listed in `string.punctuation` is removed, the text is
    lower-cased and then split on whitespace. The first token is discarded.

    Args:
        text (str): Raw corpus text.

    Returns:
        list[str]: The tokens after the first one; an empty list when the
        text contains zero or one token.
    """
    # str.translate removes all punctuation in a single pass.
    cleaned = text.translate(str.maketrans("", "", punctuation)).lower()

    # The first token is skipped, as the pipeline has always done
    # (presumably a header/title word in the corpus file -- TODO confirm).
    # Slicing, unlike pop(0), does not raise IndexError on empty input.
    return cleaned.split()[1:]

# Token list for the whole corpus, as produced by preprocess_text.
split_data = preprocess_text(data)



In [20]:
def get_v_to_int(split_data):
    """Build word <-> integer lookup tables ordered by word frequency.

    The most frequent word gets id 0, the next one id 1, and so on. Words
    with equal counts keep the order in which they first appear.

    Args:
        split_data: Iterable of word tokens.

    Returns:
        tuple: (vocab_to_int, int_to_vocab) dictionaries.
    """
    # most_common() yields (word, count) pairs from most to least frequent.
    ranked_words = [word for word, _ in Counter(split_data).most_common()]

    int_to_vocab = dict(enumerate(ranked_words))
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

    return vocab_to_int, int_to_vocab

# Build both lookup tables (word -> id and id -> word) from the token list.
vocab_to_int, int_to_vocab = get_v_to_int(split_data)


In [7]:
def get_int_text(data_split, vocab=None):
    """Encode a sequence of word tokens as a list of integer ids.

    Args:
        data_split: Iterable of word tokens.
        vocab (dict, optional): Word -> id mapping. When omitted, the
            module-level `vocab_to_int` is used, which keeps existing
            one-argument calls working.

    Returns:
        list[int]: One id per token, in order.

    Raises:
        KeyError: If a token is missing from the mapping.
    """
    if vocab is None:
        # Fall back to the notebook-level table built by get_v_to_int.
        vocab = vocab_to_int
    return [vocab[word] for word in data_split]

# Encode the whole corpus as a list of vocabulary ids.
int_text = get_int_text(split_data)
        


### Let's define some labels and target Tensors
* We first work out how many full chunks of (sequence_length * batch_size) tokens fit into int_text.
* We then build the input (context) tensors and their target labels.

#### For Example
* [[Hello, my, really, long, name, is], [I, really, like, to, play]]
* [Sarthak, Ball]

* [[9, 78, 97, 32, 14, 90], [6, 7, 2, 1, 3, 6]] - These would be the context sequences (one row per sentence)
* [67, 23] - These would be the targets (the next word for each row)

In [8]:
def create_data_loader(integer_text, batch_size, sequence_length):
    """Build a DataLoader of (context window, next token) training pairs.

    The input is first truncated to a whole number of
    (batch_size * sequence_length) chunks. Each window of `sequence_length`
    consecutive tokens in the truncated text then becomes one input row,
    and the token right after the window becomes its target.

    Args:
        integer_text: Sequence of integer token ids.
        batch_size (int): Number of (context, target) pairs per batch.
        sequence_length (int): Number of tokens in each context window.

    Returns:
        DataLoader: Batches of `(data_x, data_y)` int64 tensors, where
        `data_x` has shape (batch, sequence_length) and `data_y` has shape
        (batch,). The loader is empty when the input is shorter than one
        full chunk. The final batch may hold fewer than `batch_size` pairs.
    """
    # Keep only as many tokens as fill complete chunks.
    tokens_per_chunk = batch_size * sequence_length
    usable_length = (len(integer_text) // tokens_per_chunk) * tokens_per_chunk
    tokens = list(integer_text[:usable_length])

    # Every window must be followed by a target token, so the last
    # `sequence_length` positions cannot start a window.
    number_of_targets = len(tokens) - sequence_length

    # Read from the (truncated) argument, not from a notebook-level global,
    # so the function works for any token list it is given.
    contexts = [tokens[start:start + sequence_length] for start in range(number_of_targets)]
    targets = [tokens[start + sequence_length] for start in range(number_of_targets)]

    # Explicit int64: embedding lookups and CrossEntropyLoss targets need
    # integer indices regardless of the platform's default integer width.
    # The reshape keeps a (0, sequence_length) shape when there are no windows.
    data_x = torch.tensor(contexts, dtype=torch.long).reshape(-1, sequence_length)
    data_y = torch.tensor(targets, dtype=torch.long)

    dataset = TensorDataset(data_x, data_y)
    return DataLoader(dataset, batch_size=batch_size)

In [9]:
# Smoke-test the loader: build one with small settings and show a batch.
testDataLoaderfunc = create_data_loader(int_text, batch_size = 10, sequence_length = 8)
batch_iter_test = iter(testDataLoaderfunc)

# Use the built-in next(); DataLoader iterators in current PyTorch releases
# no longer provide a .next() method.
sample_x, sample_y = next(batch_iter_test)
print(sample_x)
print("\n\n")
print(sample_y)





tensor([[ 239, 1096,  239,   21,   22,   75,   17,   16],
        [1096,  239,   21,   22,   75,   17,   16,  173],
        [ 239,   21,   22,   75,   17,   16,  173,   22],
        [  21,   22,   75,   17,   16,  173,   22,    2],
        [  22,   75,   17,   16,  173,   22,    2,   31],
        [  75,   17,   16,  173,   22,    2,   31, 1096],
        [  17,   16,  173,   22,    2,   31, 1096,  239],
        [  16,  173,   22,    2,   31, 1096,  239,    1],
        [ 173,   22,    2,   31, 1096,  239,    1,   25],
        [  22,    2,   31, 1096,  239,    1,   25,  240]])



tensor([ 173,   22,    2,   31, 1096,  239,    1,   25,  240, 1743])


In [10]:
class RNN (nn.Module):
    """Embedding -> multi-layer LSTM -> linear layer that scores the next word.

    Given a batch of token-id sequences, the model returns one score vector
    of size `output_size` per sequence, computed from the LSTM output at the
    last time step.
    """

    def __init__(self, vocab_size, embedding_dimensions, n_of_layers, hidden_dimensions, output_size, dropout = 0.25):
        """Create the layers.

        Args:
            vocab_size (int): Number of distinct token ids (embedding rows).
            embedding_dimensions (int): Size of each embedding vector.
            n_of_layers (int): Number of stacked LSTM layers.
            hidden_dimensions (int): LSTM hidden-state size.
            output_size (int): Number of scores produced per sequence.
            dropout (float): Dropout passed to `nn.LSTM` (applied between
                stacked layers).
        """
        super(RNN, self).__init__()

        self.output_size = output_size
        self.n_of_layers = n_of_layers
        self.hidden_dim = hidden_dimensions

        self.embedding_layer = nn.Embedding(vocab_size, embedding_dimensions)
        self.lstm = nn.LSTM(embedding_dimensions, hidden_dimensions, n_of_layers, dropout = dropout, batch_first = True)

        self.fc1 = nn.Linear(hidden_dimensions, output_size)

    def forward(self, context, hidden = None):
        """Score the next word for each sequence in the batch.

        Args:
            context: Integer tensor of token ids, shape (batch, sequence).
            hidden: Accepted for interface compatibility only. It is NOT
                passed to the LSTM, so every call starts from the LSTM's
                default (zero) initial state.

        Returns:
            tuple: (scores of shape (batch, output_size), the `(h_n, c_n)`
            state returned by the LSTM for this batch).
        """
        embeddings = self.embedding_layer(context)

        # NOTE(review): the incoming `hidden` is deliberately left unused,
        # exactly as before. Passing it in would also require every batch to
        # have the batch size it was created for.
        lstm_output, hidden = self.lstm(embeddings)

        # Only the last time step is returned, so project just that step
        # instead of running the linear layer over every position first.
        last_step = lstm_output[:, -1]
        output_word = self.fc1(last_step)

        return output_word, hidden

    def init_hidden(self, batch_size):
        """Return a zero `(h_0, c_0)` pair for the given batch size.

        Each tensor has shape (n_of_layers, batch_size, hidden_dim) and is
        created with the dtype and device of the model's parameters.
        """
        weight = next(self.parameters())
        state_shape = (self.n_of_layers, batch_size, self.hidden_dim)

        # new_zeros inherits dtype and device from `weight`, so no separate
        # CPU/GPU branch (or global flag) is needed.
        hidden = (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

        return hidden
    

In [11]:
def forward_and_back_prop(rnn, optimizer, criterion, inp, target, hidden):
    """Run one optimisation step on a single batch.

    Args:
        rnn: Model called as `rnn(inp, hidden)` and returning
            `(output, new_hidden)`.
        optimizer: Optimizer that updates `rnn`'s parameters.
        criterion: Loss function called as `criterion(output, target)`.
        inp: Input batch.
        target: Target batch.
        hidden: Iterable of state tensors carried over from the previous step.

    Returns:
        tuple: (loss of this batch as a Python float, the hidden state
        returned by the model).
    """
    # Keep the model on the same device as the batch. The caller moves the
    # batch to the GPU when one is used, so the model follows it there;
    # nothing is moved once the devices already agree.
    first_param = next(rnn.parameters(), None)
    if first_param is not None and first_param.device != inp.device:
        rnn.to(inp.device)

    # Cut the state off from the previous batch's graph so backward() does
    # not try to propagate through earlier steps.
    detached_hidden = tuple(state.detach() for state in hidden)

    optimizer.zero_grad()
    output_word, hidden = rnn(inp, detached_hidden)
    loss = criterion(output_word, target)
    loss.backward()

    optimizer.step()

    batch_loss = loss.item()
    return batch_loss, hidden
    

### Now we finish the training loop and set the hyperparameters

In [12]:
def train(rnn, batch_size, epochs, train_loader, optimizer, criterion):
    """Train `rnn` for a number of epochs and print the summed loss of each.

    Args:
        rnn: Model exposing `init_hidden(batch_size)`.
        batch_size (int): Batch size used to create the initial hidden state.
        epochs (int): Number of passes over `train_loader`.
        train_loader: Iterable yielding `(context, target)` batches.
        optimizer: Optimizer forwarded to `forward_and_back_prop`.
        criterion: Loss function forwarded to `forward_and_back_prop`.
    """
    for epoch_index in range(epochs):
        # A fresh hidden state and loss accumulator for every epoch.
        hidden = rnn.init_hidden(batch_size)
        train_loss = 0

        for context, target in train_loader:
            if train_on_gpu:
                context = context.cuda()
                target = target.cuda()

            batch_loss, hidden = forward_and_back_prop(rnn, optimizer, criterion, context, target, hidden)
            train_loss += batch_loss

        # The reported value is the sum of the batch losses, not their mean.
        print("Epoch: ", str(epoch_index), "Train Loss: ", str(train_loss))
            

In [13]:
# Model hyperparameters, listed in the order RNN's constructor takes them:
# vocab_size, embedding_dimensions, n_of_layers, hidden_dimensions, output_size, dropout = 0.25

vocab_size = len(vocab_to_int)
embedding_dimensions = 200
n_of_layers = 2
hidden_dimensions = 250
# One output score per vocabulary word.
output_size = vocab_size

lyric_rnn = RNN(vocab_size, embedding_dimensions, n_of_layers, hidden_dimensions, output_size, dropout = 0.25)


#### This is where we will define Training Hyperparameters 

In [14]:
# Optimisation settings: Adam over all model parameters, cross-entropy loss.
epochs = 15
lr = 0.001
rnn_optimizer = optim.Adam(lyric_rnn.parameters(), lr = lr)
criterion = nn.CrossEntropyLoss()

# Let us define a train_loader: contexts of `sequence_length` tokens,
# served `batch_size` at a time.
batch_size = 20
sequence_length = 7

train_loader = create_data_loader(int_text, batch_size, sequence_length)



### Let us train the model by passing all the hyperparameters to the train function

In [15]:
# Run the full training loop; train() prints one summed-loss line per epoch.
train(lyric_rnn, batch_size, epochs, train_loader, rnn_optimizer, criterion)

Epoch:  0 Train Loss:  16745.937193393707
Epoch:  1 Train Loss:  14668.418756723404
Epoch:  2 Train Loss:  13246.659477472305
Epoch:  3 Train Loss:  11990.370819330215
Epoch:  4 Train Loss:  10839.431100726128
Epoch:  5 Train Loss:  9849.846519231796
Epoch:  6 Train Loss:  8994.414194345474
Epoch:  7 Train Loss:  8296.985239446163
Epoch:  8 Train Loss:  7783.301341831684
Epoch:  9 Train Loss:  7185.863565444946
