# Script Generation
This notebook uses the features and model created in the "Project.ipynb" file to generate TV scripts. 
To modify feature descriptions or to re-train the model, please use the "Project.ipynb" notebook. 

In [1]:
#Import dependencies
import util
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Check for a GPU: True when PyTorch reports an available CUDA device.
train_on_gpu = torch.cuda.is_available()

## Load Features

In [2]:
# Load the preprocessed features (per the intro above, produced by "Project.ipynb").
# NOTE(review): util.load_preprocess is not shown here. Judging by how the names are
# used later in this notebook, vocab_to_int presumably maps word -> id, int_to_vocab
# maps id -> word, and token_dict maps punctuation symbol -> token — confirm.
int_text, vocab_to_int, int_to_vocab, token_dict = util.load_preprocess()

## Load Class Definition

In [3]:
class RNN(nn.Module):
    """
    Next-word prediction network: embedding -> stacked LSTM -> fully-connected layer.

    The forward pass returns scores for the word that follows the *last* time step
    of each input sequence, together with the updated LSTM state.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=0.5, lr=0.001):
        """
        Initialize the PyTorch RNN Module
        :param vocab_size: The number of input dimensions of the neural network (the size of the vocabulary)
        :param output_size: The number of output dimensions of the neural network
        :param embedding_dim: The size of the word embeddings
        :param hidden_dim: The size of the hidden layer outputs
        :param n_layers: The number of stacked LSTM layers
        :param dropout: Dropout applied between LSTM layers (only has an effect when n_layers > 1)
        :param lr: Accepted for backward compatibility with existing callers; not used by the module
        """
        super(RNN, self).__init__()

        # Layers are created in the same order as before (embedding, lstm, fc) and keep
        # their attribute names, so saved state dicts / pickled models remain loadable.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)

        # Sizes needed later by forward() and init_hidden()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Final fully-connected layer mapping LSTM features to output scores
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, nn_input, hidden):
        """
        Forward propagation of the neural network
        :param nn_input: Batch of word-id sequences, indexed as (batch, sequence)
        :param hidden: The (hidden state, cell state) tuple for the LSTM
        :return: Two values: the scores for the last time step of every sequence,
                 of shape (batch_size, output_size), and the latest hidden state
        """
        batch_size = nn_input.size(0)

        # Embed the word ids and run them through the LSTM
        embeds = self.embedding(nn_input)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Flatten to (batch_size * seq_length, hidden_dim) for the linear layer
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        # Fully-connected layer (no dropout is applied here)
        out = self.fc(lstm_out)

        # Reshape into (batch_size, seq_length, output_size)
        out = out.view(batch_size, -1, self.output_size)
        # Keep only the prediction made at the last time step of each sequence
        out = out[:, -1]

        return out, hidden

    def init_hidden(self, batch_size):
        """
        Initialize the hidden state of the LSTM
        :param batch_size: The batch_size of the hidden state
        :return: Tuple (hidden state, cell state), each a zero tensor of dims
                 (n_layers, batch_size, hidden_dim)
        """
        # Allocate the state from an existing weight so it always shares the model's
        # device and dtype. The previous hard-coded `.cuda(3)` failed on machines with
        # fewer than four GPUs and could disagree with where the model actually lives.
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        return (weight.new_zeros(state_shape), weight.new_zeros(state_shape))

## Load Trained Model

In [4]:
# Load the trained network stored under the name 'trained_rnn_8'.
# NOTE(review): util.load_model is not shown here; if it unpickles a whole model
# object, the RNN class cell above has to be run first — confirm.
trained_rnn = util.load_model('trained_rnn_8') 

## Function to generate scripts

In [5]:
def generate(rnn, prime_id, int_to_vocab, token_dict, pad_value, predict_len=100, sequence_length=10, top_k=5):
    """
    Generate text using the neural network
    :param rnn: The PyTorch Module that holds the trained neural network
    :param prime_id: The word id to start the first prediction
    :param int_to_vocab: Dict of word id keys to word values
    :param token_dict: Dict of punctuation keys to punctuation token values
    :param pad_value: The value used to pad a sequence
    :param predict_len: The number of words to generate after the prime word
    :param sequence_length: The length of the word-id window fed to the network
    :param top_k: The number of most likely candidates to sample the next word from
    :return: The generated text
    """
    rnn.eval()

    # Run on whatever device the model lives on instead of a hard-coded GPU index.
    first_param = next(rnn.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # create a sequence (batch_size=1) with the prime_id
    # It stays a NumPy array for the whole loop; a fresh tensor is built from it on
    # every step, so np.roll below never receives a (possibly CUDA) tensor.
    current_seq = np.full((1, sequence_length), pad_value)
    current_seq[-1][-1] = prime_id
    predicted = [int_to_vocab[prime_id]]

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for _ in range(predict_len):
            seq_tensor = torch.as_tensor(current_seq, dtype=torch.long, device=device)

            # initialize the hidden state
            hidden = rnn.init_hidden(seq_tensor.size(0))

            # get the output of the rnn
            output, hidden = rnn(seq_tensor, hidden)

            # get the next word probabilities
            probs = F.softmax(output, dim=1).cpu()

            # use top_k sampling to get the index of the next word;
            # never ask for more candidates than there are output classes
            k = min(top_k, probs.size(1))
            top_p, top_i = probs.topk(k)
            # reshape(-1) keeps these 1-d even when k == 1 (squeeze would yield 0-d)
            top_i = top_i.numpy().reshape(-1)
            top_p = top_p.numpy().reshape(-1)

            # select the likely next word index with some element of randomness
            word_i = int(np.random.choice(top_i, p=top_p / top_p.sum()))

            # retrieve that word from the dictionary
            predicted.append(int_to_vocab[word_i])

            # the generated word becomes the end of the next "current sequence"
            current_seq = np.roll(current_seq, -1, 1)
            current_seq[-1][-1] = word_i

    gen_sentences = ' '.join(predicted)

    # Replace punctuation tokens
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        gen_sentences = gen_sentences.replace(' ' + token.lower(), key + ending)
    gen_sentences = gen_sentences.replace('\n ', '\n')
    gen_sentences = gen_sentences.replace('( ', '(')

    # return all the sentences
    return gen_sentences

## Generate Scripts

In [6]:
# Sampling is random, so re-running this cell produces a different script each time.
gen_length = 400  # number of words to generate; adjust to taste
prime_word = 'phoebe'  # character name that opens the script

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
pad_word = util.SPECIAL_WORDS['PADDING']
generated_script = generate(
    trained_rnn,
    vocab_to_int[prime_word + ':'],  # the priming token is the name followed by a colon
    int_to_vocab,
    token_dict,
    vocab_to_int[pad_word],
    gen_length,
)
print(generated_script)



phoebe:( to students) well,
 realized i realized, maybe she'll call.
 joey: oh...( happily nervously, and smile
 and changes.)
 joey: hey! estelle gave me the wedding?
 joey: yeah, yeah.
 joey: uhuh. scared!
( monica rolls onto bed and ross closes door.)
 rachel:( laughing) oh, honey, c'mon,
 please, please stay everywhere. plus,
 chance he'll honest with you before. i
 promise we'd say.
 chandler: okay. hold on second.
 ross: oh, well, agree interview?
 joey: yeah, yeah...
 chandler: really smart. besides, obviously kept quiet
 sisters, i mean...
( loud touches a pair scissors)
 mike:( speaking angrily) nope,
 i'm sorry. i couldn't admit... long parents embarrassed,
 she sang" "  crap" . raise sound butt.)
 [scene: restaurant. phoebe putting bat. phoebe
 pacing walking aside. ]
 joey: hey rach, what's matter?
 joey: nothing! forced to get a hospital
 and get to steal coin, 'cause she several ways
 and falls onto floor.) nope...
 joey: i bet.
 rachel: oh my god! shouldn't let grab!
 ph

## Save Script

In [35]:
# Write the generated script to disk. The context manager closes the file even if the
# write raises, and the explicit UTF-8 encoding keeps the output independent of the
# platform's default encoding.
with open("generated_script_model8_1_bad.txt", "w", encoding="utf-8") as f:
    f.write(generated_script)

#1 - 0, 2 - 1, 3 - 2, 4 - 0