In [31]:
import torch
import torch.nn as nn # the neural network package that contains functions for creating the neural network layers
import torch.nn.functional as F
import torch.optim as optim # a package that allows use to use an optimizer in order to update the parameters during training
from torch.utils.data import DataLoader # allows use to process the data in batches
from torch.nn.utils.rnn import pad_sequence # a function that zero-pads the sentences so they can have equal size in a batch
from collections import Counter
import pandas as pd
import pickle
torch.manual_seed(0) # set a random seed for reproducibility
from tqdm import tqdm

MODEL_CONSTANT = ['GRU', 'LSTM', 'BI-LSTM']

###Set important constants

In [32]:
is_google_colab = True
batch_size = 32
is_preprocessing = False #This toggle preprocessing for RNN model (FALSE = not preprocessing for text, but instead load from processed data saved from previous run)
model_type = MODEL_CONSTANT[2] # change this variable to correct model type
load_model = False # this is for saving/loading the model from the checkpoint when colab crashes

device = torch.device('cuda:0')
#device = torch.device('cpu')

# Below are the dataset file we will perform our training on:
data_file = "data_sample_2x.txt"
prep_file_name = data_file[:-4]


learning_rate = 0.0005 # Learning rate
n_epochs = 40 # the number of epochs to train
embed_dim = 300 # the size of the embedding
hidden_size = 450 # the size of the hidden state
num_layers = 1 # the number of layers in the RNN cell

In [33]:
# this is to connect this notebook to the contents of your Google Drive
# this is Ryoko's Google Drive filepath
# please specify your own, or we can (probably) share a folder for it
if is_google_colab:
    # files uploaded to Google Drive will not be deleted by inactivity,
    # but it does require an authorization code every time you use it
    from google.colab import drive
    filepath = r'/content/drive/My Drive/RetardBot/'
    drive.mount(r'/content/drive')
else:
    filepath = ''
print("Running model :", model_type)

load_file = 'temp_model_' + model_type + '_' + prep_file_name #file name if I'm going to temporary save/load the model

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Running model : BI-LSTM


###  Sentence boundaries

When dealing with language, it is good to know when a sentence starts and when it ends. That will help the model at the beginning of the prediction, when we don't have any previous words as context. For that purpose, we are going to pad each sentence with a start-of-sentence symbol _"&lt;s>"_ and an end-of-sentence symbol _"&lt;/s>"_. 

Since you already did a similar thing in the n-grams exercise, this function is already implemented for you.

In [34]:
def add_sentence_boundaries(data):
    """
    Takes the data, where each line is a sentence, appends <s> token at the beginning and </s> at the end of each sentence
    Example input: I live in Helsinki
    Example output: <s> I live in Helsinki </s>
    
    Arguments
    ---------
    data : list
            a list of sentences
    
    Returns
    -------
    res : list
            a list of sentences, where each sentence has <s> at the beginning and </s> at the end
    """
    #res = []
    #for sent in data:
    #    sent = '<s> ' + sent.rstrip() + ' </s>'
    #    res.append(sent)

    res = ['<s> ' + sent.rstrip() + ' </s>' for sent in data]
    
    return res

### Index dictionaries <a class="anchor" id="task_1_1"></a> 
Neural networks can't process words as raw strings. Due to that, we need to represent the words with numbers. The first step in doing that is creating two dictionaries: word2idx and idx2word.

The word2idx dictionary contains unique words as keys and unique indices for each of the words as values. <br>
The idx2word dictionary contains unique indices as keys and unique words for each of those indices as values. It is essentially a reversed word2dx, where the keys are the values and the values are the keys.

Example sentences: ["I look forward", "You look forward"] <br>
word2idx = {"I": 1, "look": 2, "forward": 3, "You": 4} <br>
idx2word = {1: "I", 2: "look", 3: "forward", 4: "You"} <br>

Write a function that creates two dictionaries: word2idx and idx2work. The dictionaries should contain all the unique words in the data. <b>The indices should start from 1 and not from 0<b>

In [35]:
def create_indices(data):
    """
    This function creates two dictionaries: word2idx and idx2word, containing each unique word in the dataset
    and its corresponding index.
    Remember that the starting index should be 1 and not 0
    
    Arguments
    ---------
    data - list
            a list of sentences, where each sentence starts with <s>
            and ends with </s> token
    
    Returns
    -------
    word2idx - dictionary
                a dictionary, where the keys are the words and the values are the indices
                
    idx2word - dictionary
                a dictionary, where the keys are the indices and the values are the words
    """
    
    # YOUR CODE HERE
    #raise NotImplementedError()
      # There's several method to to this, using Counter would make our model run faster
    # in preprocessing, otherwise it would take forever to run with full data
    # sklearn also offer similar function, just watch out for stopwords
    # sklearn.feature_extraction.text.CountVectorizer(min_df=1, stop_words=None)
    # word_tokenizer = vectorizer.build_tokenizer()
    # vectorizer.fit(data)
    # word2idx = vectorizer.vocabulary_

    data_list = ' '.join(data).split(' ')    
    data_unique = list(dict.fromkeys(data_list))
    word2idx = {word:data_unique.index(word)+1 for word in data_unique}
    idx2word = {value:key for key,value in word2idx.items()}
    
    return word2idx, idx2word

### Index data <a class="anchor" id="task_1_2"></a>
After we have created the word2idx and idx2word dictionaries, it is time to index the data. In other words, we need to replace each word in the data with its corresponding index.

Write a function that reads each sentence from the data and replaces each word in the sentence with its index from the word2idx dictionary.

In [36]:
def index_data(data, word2idx):
    """
    This function replaces each word in the data with its corresponding index
    
    Arguments
    ---------
    data - list
            a list of sentences, where each sentence starts with <s>
            and ends with </s> token
    
    word2idx - dict
            a dictionary where the keys are the unique words in the data
            and the values are the unique indices corresponding to the words%
    
    Returns
    -------
    data_indexed - list
                a list of sentences, where each word in the sentence is replaced with its index
    """
    
    data_indexed = []
    # YOUR CODE HERE
    #raise NotImplementedError()
    
    for sentence in data:
        sentence_index = [word2idx[word] for word in sentence.split(' ')]
        # Further improvement can be done here to remove the top loop
        # For now it's fast enough to ignore
        data_indexed.append(sentence_index)
    

    return data_indexed

### Convert sentences to tensors

This function converts each indexed sentence to a LongTensor data type. This is required in order to process it later using Pytorch.

You don't have to modify this function. It is already implemented for you.

In [37]:
def convert_to_tensor(data_indexed):
    """
    This function converts the indexed sentences to LongTensors
    
    Arguments
    ---------
    data_indexed - list
            a list of sentences, where each word in the sentence
            is replaced by its index
    
    Returns
    -------
    tensor_array - list
                a list of sentences, where each sentence
                is a LongTensor
    """
    tensor_array = [torch.LongTensor(sent) for sent in data_indexed]
        
    return tensor_array

### Combine features and labels in a tuple

This function combines each indexed sentence and its corresponding labels to a tuple. This will be beneficial for us when we zero-pad the data later, in order to make the batches have equal-length samples.

You don't have to modify this function. It is already implemented for you.

In [38]:
def combine_data(input_data, labels_data):
    """
    This function converts the input features and the labels into tuples
    where each tuple corresponds to one sentence in the format (features, labels)
    
    Arguments
    ---------
    input_data - list
            a list of tensors containing the training features
    
    labels_data - list
            a list of tensors containing the training labels
    
    Returns
    -------
    res - list
            a list of tuples, where each tuple corresponds to one sentece pair
            in the format (features, labels)
    """
    
    res = [(input_data[i], labels_data[i]) for i in range(len(input_data))]

    return res

### Remove extra data

Since we will be processing the data in equal batches during training, we need to make sure that each batch has equal number of sentences. In case the last batch contains less sentences than the batch size, that batch will be discarded.

This function discards the extra data that doesn't fit in a batch.

You don't have to modify this function. It is already implemented for you.

In [39]:
def remove_extra(data, batch_size):
    """
    This function removes the extra data that does not fit in a batch   
    
    Arguments
    ---------
    data - list
            a list of tuples, where each tuple corresponds to a
            sentence in a format (features, labels)
            
    batch_size - integer
                    the size of the batch
    
    
    Returns
    -------
    data - list
            a list of tuples, where each tuple corresponds to a
            sentence in a format (features, labels)
    """
    
    extra = len(data) % batch_size
    if extra != 0:
        data = data[:-extra][:]

    return data

### Zero-pad the data

In order to process the data in batches, we need to make sure that the sentences in each batch have equal lengths. Since we are working with sentences, each sentence in a batch can have different number of words. In this case, we need to  make the length of each sentence the same as the length of the longest sentence in that batch. We do that by adding zeros at the end of each sentence, until the sentence has equal length as the longest one in the batch.

This function implements the zero-padding.

You don't have to modify this function. It is already implemented for you.

In [40]:
def collate(list_of_samples):
    """
    This function zero-pads the training data in order to process the sentences
    in a batch during training
    
    Arguments
    ---------
    list_of_samples - list
                        a list of tuples, where each tuple corresponds to a
                        sentence in a format (features, labels)
    
    
    Returns
    -------
    pad_input_data - tensor
                        a tensor of input features equal to the batch size,
                        where features are zero-padded to have equal lengths
                        
    input_data_lengths - list
                        a list where each element is the length of the 
                        corresponding sentence
    
    pad_labels_data - tensor
                        a tensor of labels equal to the batch size,
                        where labels are zero-padded to have equal lengths
            
    """
    
    
    list_of_samples.sort(key=lambda x: len(x[0]), reverse=True)
    input_data, labels_data = zip(*list_of_samples)

    input_data_lengths = [len(seq) for seq in input_data]
    
    padding_value = 0

    # pad input
    pad_input_data = pad_sequence(input_data, padding_value=padding_value)
    
    # pad labels
    pad_labels_data = pad_sequence(labels_data, padding_value=padding_value)

    return pad_input_data, input_data_lengths, pad_labels_data

### Prepare features and labels <a class="anchor" id="task_1_3"></a> 
During training, the model takes an input word and outputs a prediction. We will need to compare this prediction to 'true label'. True label is just the next word in the text, but we will need to organize the data, so that every word in the text is considered as this 'true label'.

In the label sentence, every word is moved a step in time, and for the input sentence the last word is missing. 

Example sentence: oops i did it again <br>
INPUT: oops i did it <br>
LABEL: i did it again

Note: the first word in the sentence is start-of-sentence symbol and the last one is end-of-sentence symbol.

Write a function that takes as input the indexed data and returns two arrays: the input array where the last word from each sentence is missing, and the label array, where every word is moved a step in time.

In [41]:
def prepare_for_training(data_indexed):
    """
    This function creates the input features and their corresponding labels
    
    Arguments
    ---------
    data_indexed - list
            a list of sentences, where each word in the sentence
            is replaced by its index
    
    
    Returns
    -------
    input_data - list
            a list of indexed sentences, where the last element of each sentence is removed
            
    labels_data - list
            a list of indexed sentences, where the first element of each sentence is removed
    """
    
    #input_data = []
    #labels_data = []

     # YOUR CODE HERE
    #raise NotImplementedError()

    input_data  = [data[:-1] for data in data_indexed]
    labels_data = [data[1:] for data in data_indexed]

    return input_data, labels_data

### Preprocess data <a class="anchor" id="task_1_4"></a>
At this point, we have all the necessary functions to prepare the data for training. What is left to do is to run them one by one and get the data in the desired format.

Write a function that takes the data and prepares it for training. You need to do the following steps:

    1. Add sentence boundaries
    2. Create index dictionaries (word2idx and idx2word)
    3. Index the data in a way that each word is replaced by its index
    4. Convert the indexed data to a list of tensors, where each tensor is a sentence
    5. Split each sentence to input and labels

In [42]:
def preprocess_data(data):
    """
    This function runs the whole preprocessing pipeline and returns the prepared
    input features and labels, along with the word2idx and idx2word dictionaries
    
    Arguments
    ---------
    data - list
            a list of sentences that need to be prepared for training
    
    
    Returns
    -------
    input_data - list
            a list of tensors, where each tensor is an indexed sentence used as input feature
            
    labels_data - list
            a list of tensors, where each tensor is an indexed sentence used as a true label
    
    word2idx - dictionary
                a dictionary, where the keys are the words and the values are the indices
                
    idx2word - dictionary
                a dictionary, where the keys are the indices and the values are the words
    """
    
    # YOUR CODE HERE
    #raise NotImplementedError()
    #1. Add sentence boundaries    
    print("Add sentence boundaries")    
    res = add_sentence_boundaries(data)
        
    #2. Create index dictionaries (word2idx and idx2word)
    print("Create index dictionaries (word2idx and idx2word)")
    word2idx, idx2word = create_indices(res)    
    
    #3. Index the data in a way that each word is replaced by its index
    print("Index the data in a way that each word is replaced by its index")
    indexed_data = index_data(res, word2idx)
    
    print("Convert the indexed data to a list of tensors")
    #4. Convert the indexed data to a list of tensors, where each tensor is a sentence
    tensor_array = convert_to_tensor(indexed_data)   

    print("Split each sentence to input and labels")
    #5. Split each sentence to input and labels
    input_data, labels_data = prepare_for_training(tensor_array)

    print("Done preprocessing")
    return input_data, labels_data, word2idx, idx2word

In [43]:
# Load data
# Load result:
with open(filepath + data_file, "rb") as fp:   # Unpickling
#with open(filepath + "clean_data.txt", "rb") as fp:   # Unpickling
    sentences = pickle.load(fp)

print(sentences[22:35])
print(len(sentences))

['you see grand kids, there was a time long long ago, the markets would have two directions, some times there were days where it actually went down, and the color would change from green to red.', 'what will happen is he‚Äôll have to go all cash for 90 days though if he doesn‚Äôt pay it.', "it's been a long and winding journey to here.", 'i want to buy tqqq at $70/share..', '" sadly for some of us their immune system is stronger\'', 'you‚Äôre welcome.', 'so i sold everything and bought puts.', 'whole market is down, just load up more pltr positions for cheap üí∞', 'matt and i just used 53m for total shares outstanding (i later added an adjustment to 73m in my update to account for all of the options and warrants going in the money.', 'pltr $30 üöÄüöÄüöÄü§ëü§ëü§ë', 'investing in yourself is some of the highest possible dividends.', 'i check my robinhood.', 'lmao there‚Äôs really gonna be an only fans ipo']
40000


### Preprocessing data

Preprocessing data and save those to files for future run (very important if we run big model in Google Colab, as it take DAYS and multiple run to get good result). If we run short model like 40k sentences then we might not need this, but large model will take a long time to create word2idx.

Or load the data if we already run it before

**Remember to create the folder to store the data (torch.save) won't auto create new folder**

In [44]:
if is_preprocessing:
    train_input, train_labels, word2idx, idx2word = preprocess_data(sentences) # run the preprocessing pipeline
    print("Saving...")
    torch.save(train_input, filepath + prep_file_name + '/train_input.pt')
    torch.save(train_labels, filepath + prep_file_name + '/train_labels.pt')
    torch.save(word2idx, filepath + prep_file_name + '/word2idx.pt')
    torch.save(idx2word, filepath + prep_file_name + '/idx2word.pt')
    train_data = combine_data(train_input, train_labels)
    train_data = remove_extra(train_data, batch_size)
    torch.save(train_data, filepath + prep_file_name + '/train_data.pt')    
else:
    print("Loading...")
    #train_input = torch.load(filepath + prep_file_name + '/train_input.pt')
    #train_labels = torch.load(filepath + prep_file_name + '/train_labels.pt')    
    #train_data = combine_data(train_input, train_labels)
    #train_data = remove_extra(train_data, batch_size)
    # Fastest load is only load 3 variable below:
    word2idx = torch.load(filepath + prep_file_name + '/word2idx.pt')
    idx2word = torch.load(filepath + prep_file_name + '/idx2word.pt')
    train_data = torch.load(filepath + prep_file_name + '/train_data.pt') 

Loading...


In [45]:
pairs_batch_train = DataLoader(dataset=train_data,
                    batch_size=batch_size,
                    shuffle=True,
                    collate_fn=collate,
                    pin_memory=True)

### RNN model

In [46]:
class RNN(nn.Module):
    def __init__(self, word2idx, embed_dim, context_dim, num_layers):
        """
        This function initializes the layers of the model
        
        Arguments
        ---------
        word2idx - dictionary
                    a dictionary where the keys are the unique words in the data
                    and the values are the unique indices corresponding to the words
        
        embed_dim - integer
                        the size of the word embeddings

        context_dim - integer
                        the dimension of the hidden size
                        
        num_layers - integer
                        the number of layers in the GRU cell
        """
        super(RNN, self).__init__()
        self.word2idx = word2idx
        self.embed_dim = embed_dim
        self.context_dim = context_dim
        self.num_layers = num_layers
        
        # here we initialise weighs of a model
        self.word_embed = nn.Embedding(len(self.word2idx)+1, self.embed_dim) # embedding layer
        
        #GRU
        if model_type == MODEL_CONSTANT[0]:            
            self.rnn = nn.GRU(self.embed_dim,
                              self.context_dim,
                              num_layers=self.num_layers)
        
        #LSTM
        elif model_type == MODEL_CONSTANT[1]:            
            self.rnn = nn.LSTM(input_size = self.embed_dim, 
                               hidden_size = self.context_dim,
                               num_layers = self.num_layers)
        #BI-LSTM
        else:
            self.rnn = nn.LSTM(input_size = self.embed_dim, 
                               hidden_size = self.context_dim,
                               num_layers = self.num_layers,
                               bidirectional=True)
        
        self.dropout = nn.Dropout(0.1) # Dropout        
        
        if model_type == MODEL_CONSTANT[2]:
            self.out = nn.Linear(self.context_dim*2, len(self.word2idx)+1) # output layer
        else:
            self.out = nn.Linear(self.context_dim, len(self.word2idx)+1) # output layer

    
    def forward(self, word, hidden):
        """
        This function implements the forward pass of the model
        
        Arguments
        ---------
        word - tensor
                a tensor containing indices of the words in a batch
                
        hidden - tensor
                    the previous hidden state of the GRU model
        
        Returns
        -------
        output - tensor
                    a tensor of logits from the linear transformation
        
        hidden - tensor
                    the current hidden state of the GRU model
        """ 
        
        # YOUR CODE HERE
        #raise NotImplementedError()
        #1. Replace the indexed word with its embedding vector. 
        #In other words, pass it through the embedding layer
        embeds = self.word_embed(word)
        
        batch_size = word.shape[0]
        #print(batch_size)
        #2. Reshape the embedding vector to a shape of (1, batch_size)
        embeds = embeds.reshape(1, batch_size, self.embed_dim)
        
        #3. Pass the embedding through the GRU cell to get the output 
        #and the hidden tensors. The GRU function takes as input the 
        #work embedding and the previous hidden state.
        output, hidden = self.rnn(embeds, hidden)
        
        #4. Addpy a dropout to the output of the RNN
        output = self.dropout(output)
        
        #5. Apply the linear transformation to the output of the dropout layer.
        output = self.out(output)
        
        #6. Reshape the output to have a shape (batch_size, vocab_length+1)
        output = output.reshape(batch_size, len(self.word2idx) + 1)
        
        #7. Return the output of the linear transformation and the hidden tensor
        #only GRU need to move to device
        if model_type == MODEL_CONSTANT[0]:
            hidden = hidden.to(device)
        return output, hidden
    
    def init_state(self, sequence_length):
        if model_type == MODEL_CONSTANT[1]:
            return (torch.zeros(self.num_layers, sequence_length, self.context_dim).to(device),
                    torch.zeros(self.num_layers, sequence_length, self.context_dim).to(device))
        else:
            return (torch.zeros(self.num_layers*2, sequence_length, self.context_dim).to(device),
                    torch.zeros(self.num_layers*2, sequence_length, self.context_dim).to(device))

In [47]:
rnn_model = RNN(word2idx, embed_dim, hidden_size, num_layers).to(device) # initialize the RNN model
loss_function = nn.CrossEntropyLoss(ignore_index=0) # define the loss function
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate) # define the optimizer

### Training RNN network

In [48]:
def train_rnn(pairs_batch_train, rnn_model, hidden_size, num_layers, loss_function, rnn_optimizer, n_epochs):
    """
    This function implements the training of the model

    Arguments
    ---------
    pairs_batch_train - object
                            a DataLoader object that contains the batched data

    rnn_model - object
                an RNN object that contains the initialized model
                
    hidden_size - integer
                    the size of the hidden layer (the context size)
    
    num_layers - integer
                        the number of layers in the GRU cell

    loss_function - object
                        the CrossEntropy loss function

    rnn_optimizer - object
                        an Adam object of the optimizer class

    n_epochs - integer
                the number of epochs to train
    """ 
    loss_list = []
    word_loss_list = []    

    if load_model == True:
        if not torch.cuda.is_available():
            checkpoint = torch.load(filepath+load_file,map_location=torch.device('cpu'))
        else:
            checkpoint = torch.load(filepath+load_file)
        rnn_model.load_state_dict(checkpoint['model_state_dict'])
        rnn_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        last_epoch = checkpoint['epoch']
    else:
        last_epoch = 0

    for epoch in tqdm(range(last_epoch, n_epochs)): # iterate over the epochs
        epoch_loss = 0
        epoch_word_loss = 0
        rnn_model.train() # put the model in training mode
        
        for iteration, batch in enumerate(pairs_batch_train): # at each step take a batch of sentences
            sent_loss = 0
            rnn_optimizer.zero_grad() # clear gradients
            
            train_input, train_input_lengths, train_labels = batch # extract the data from the batch
            train_input = train_input.to(device)
            #train_input_lengths = train_input_lengths.to(device) #this is a list
            train_labels = train_labels.to(device)
            
            # initialize the hidden state for GRU
            if model_type == MODEL_CONSTANT[0]: 
                hidden = torch.zeros((num_layers, train_input.size(1), hidden_size)) 
                hidden = hidden.to(device)
            else:
                hidden = rnn_model.init_state(train_input.size(1))      
            
            
            for i in range(train_input.size(0)): # iterate over the word in the sentence
                output, hidden = rnn_model(train_input[i], hidden) # forward pass               
                    
                labels = torch.LongTensor(train_labels.size(1)) # define a random tensor with batch_size as number of elements
                labels = labels.to(device)
                labels[:] = train_labels[i][:] # put the correct label values in the tensor
                
                sent_loss += loss_function(output, labels) # compute the loss, compare the predictions and the labels

            print(train_labels.size(1))
            word_loss = sent_loss.item()/train_labels.size(1)
            #if (iteration%500) == 0:
            #  print("sentence_loss, pair_batch_train, train_inputsize",
            #        sent_loss.item(), len(pairs_batch_train), train_input.shape)
            sent_loss.backward() # compute the backward pass
            rnn_optimizer.step() # update the parameters

            epoch_loss += word_loss        

        # print the loss at each epoch
        print('Epoch: {}   Loss: {}'.format(epoch+1, 
                                            epoch_loss / len(pairs_batch_train)))
        
        # Save model every 4 epoch
        if(epoch+1)%4 == 0:
            filename = filepath + 'models/' + model_type + '_' + prep_file_name + '_' + str(learning_rate) + '_' + str(epoch+1) + '.pt'
            torch.save(rnn_model.state_dict(), filename)
        
        loss_list.append(epoch_loss/ len(pairs_batch_train))
        #print(loss_list)
        # this is needed for work in Colab because once the time limit is up,
        # it will automatically delete all files that are not saved in Google Drive      
        print("saving model and loss data")
        list_df = {'loss':loss_list}
        df = pd.DataFrame(list_df)
        df.to_csv(filepath + 'models/' + model_type + '_' + prep_file_name + '_' + str(learning_rate) + '_' + str(epoch+1) + '.csv')
        torch.save({
              'epoch': epoch+1,
              'model_state_dict': rnn_model.state_dict(),
              'optimizer_state_dict': rnn_optimizer.state_dict(),
              }, filepath + load_file)
    
    return loss_list

#### Training step

In [49]:
lost_list = train_rnn(pairs_batch_train, rnn_model,hidden_size, num_layers, loss_function, rnn_optimizer, n_epochs)

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch: 28   Loss: 0.6502838764667511
saving model and loss data


  8%|‚ñä         | 1/13 [07:02<1:24:34, 422.86s/it]

Epoch: 29   Loss: 0.5731421462297439
saving model and loss data


 15%|‚ñà‚ñå        | 2/13 [14:05<1:17:30, 422.79s/it]

Epoch: 30   Loss: 0.5369274916172028
saving model and loss data


 23%|‚ñà‚ñà‚ñé       | 3/13 [21:07<1:10:24, 422.49s/it]

Epoch: 31   Loss: 0.5289100328683853
saving model and loss data


 31%|‚ñà‚ñà‚ñà       | 4/13 [28:08<1:03:20, 422.24s/it]

Epoch: 32   Loss: 0.5196785936832428
saving model and loss data


 38%|‚ñà‚ñà‚ñà‚ñä      | 5/13 [35:09<56:14, 421.84s/it]  

Epoch: 33   Loss: 0.5101091520309449
saving model and loss data


 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 6/13 [42:10<49:10, 421.48s/it]

Epoch: 34   Loss: 0.5165486315965653
saving model and loss data


 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 7/13 [49:12<42:10, 421.79s/it]

Epoch: 35   Loss: 0.5193125486135483
saving model and loss data


 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 8/13 [56:12<35:05, 421.14s/it]

Epoch: 36   Loss: 0.5098922646284103
saving model and loss data


 69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 9/13 [1:03:14<28:05, 421.27s/it]

Epoch: 37   Loss: 0.5045015212535858
saving model and loss data


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 10/13 [1:10:14<21:03, 421.05s/it]

Epoch: 38   Loss: 0.49602408640384676
saving model and loss data


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 11/13 [1:17:18<14:03, 421.99s/it]

Epoch: 39   Loss: 0.49522546541690826
saving model and loss data


 92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 12/13 [1:24:19<07:01, 421.57s/it]

Epoch: 40   Loss: 0.4922507870674133
saving model and loss data


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [1:31:22<00:00, 421.71s/it]


In [50]:
def predict_rnn(rnn_model, hidden_size, num_layers, word2idx, idx2word, context, max_len):
    """
    This function predicts the next word, based on the history of the previous words.
    We start with the 'context' and then feed the prediction as the next input.
    
    Arguments
    ---------
    rnn_model - object
                an RNN object that contains the trained model
                
    hidden_size - integer
                    the size of the hidden layer (the context size)
                    
    num_layers - integer
                    the number of layers in the GRU cell
                
    word2idx - dictionary
                    a dictionary where the keys are the unique words in the data
                    and the values are the unique indices corresponding to the words
                    
    idx2word - dictionary
                a dictionary, where the keys are the indices and the values are the words
                    
    context - string
                the context sentence
    
    max_len - integer
                integer value representing up to how many words to generate
                            
    Returns
    -------
    
    predictions - string
                    a string containing the generated sentence
    """
    
    # index the context
    context_indexed = []
    for word in context.split():
        word_indexed = torch.LongTensor(1)
        word_indexed[:] = word2idx[word]
        context_indexed.append(word_indexed)
    
    with torch.no_grad():
        predictions = []
        # first build the hidden state from the context
        if model_type == MODEL_CONSTANT[0]:
            hidden = torch.zeros((num_layers, 1, hidden_size), device=device)
        else:
            hidden = rnn_model.init_state(1)   
            
        for word in context_indexed:
            predictions.append(idx2word[word.item()])
            word = word.to(device)
            output, hidden = rnn_model(word, hidden)
            
        next_input = context_indexed[-1]
        while((len(predictions) < max_len) and (predictions[-1] != '</s>')):
            
            # YOUR CODE HERE
            #raise NotImplementedError()
            #1. Run the forward pass to get the output. Don't forget to include the `hidden` state
            #print("INPUT", next_input)
            next_input = next_input.to(device)
            out, hidden = rnn_model.forward(next_input, hidden)
            
            #2. Run the output through a softmax to convert it to a probability distribution (`F.softmax`)
            out = F.softmax(out)
            
            #3. Get the word with the highest probability using the `topk(1)` function
            value, index = out.topk(1)
            
            #4. Convert the index of the predicted word to the actual word using the idx2word dictionary
            word = idx2word[index.item()]
            
            #5. Append the predicted word to the `predictions` array
            predictions.append(word)
            next_input = index
            
    predictions = ' '.join(predictions)
    
    return predictions

In [51]:
checkpoint = torch.load(filepath+load_file)
rnn_model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [52]:
contexts = ['<s> can someone', '<s> bought the dip', '<s> wait', '<s> this is the', '<s> to the']

max_len = 50

for context in contexts:
    for i in range(5):
      predictions = predict_rnn(rnn_model, hidden_size, num_layers, word2idx, idx2word, context, max_len)
      print(predictions)
    print('\n')



<s> can someone please explain to me why not just the worst of the american people hate so to look at the scoreboard and make its - great way to keep yoloing he's from options trading it is a certain way, but there is no need for it </s>
<s> can someone with profit is the smart money in this sub put options trading for a 10 percent in the last quarter </s>
<s> can someone with please do my life can keep between the single stock over his name on the day. </s>
<s> can someone say is a wsb moron it‚Äôs out. </s>
<s> can someone please have their house prices they don‚Äôt use it to buy more shares at a strike price </s>


<s> bought the dip on dkng </s>
<s> bought the dip at $59.30, $60c 8/7 mean edit: like that. </s>
<s> bought the dip at $59.30, $60c 8/7 yolo whole account edit: down 20% so far, still won‚Äôt sell, because i believe in it, not just cause i‚Äôm restricted from day trading ü§° </s>
<s> bought the dip for the 25th </s>
<s> bought the dip yesterday </s>


<s> wait a second.

In [53]:
#rnn_model.load_state_dict(torch.load(filepath + "models/LSTM_00005-6.pt"))