<a href="https://colab.research.google.com/github/PseudoPythonista/nlp/blob/master/unsupervised_LSTM_president_simulator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Are you a president in need of a nonsensical speech?**

* inspired by: https://www.youtube.com/watch?v=EFHyzuqjaok
* data scraped with BeautifulSoup from: https://www.rev.com/blog/transcript-category/donald-trump-transcripts
* code based on a Udemy course: https://www.udemy.com/course/pytorch-for-deep-learning-with-python-bootcamp/
* works best with a GPU

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Read the whole speech corpus into memory as one UTF-8 string.
# The absolute /content/ path is hard-coded, so the file must exist at exactly that location.
with open('/content/clean_super_ultimate_speech.txt','r',encoding='utf8') as f:
    text = f.read()

In [None]:
# Preview the first 500 characters of the loaded corpus.
text[:500]

'President :  … number that you can have, and that’s all we’re allowed. So you’re a very special group, and thank you all for coming out here. Thank you, thank you. So I’m thrilled to be back in Ohio, and proud of the hardworking patriots of the Buckeye State. Four years ago, I came here, right in here. That was an amazing four years ago, wasn’t it, when you think how time flies? Right here in Cleveland, to accept the Republican nomination for President of the United States. And we’ve worked hard'

In [None]:
# Corpus size in characters.
len(text)

2277900

In [None]:
# Build the character vocabulary and the integer encoding of the corpus.
# sorted() pins the index assigned to each character: iterating a bare set of
# strings can produce a different order in every Python process (string hash
# randomisation), which would silently change the char <-> index mapping after
# a kernel restart and make previously trained weights unusable.
all_characters = sorted(set(text))
decoder = dict(enumerate(all_characters))  # index -> character
encoder = {char: ind for ind, char in decoder.items()}  # character -> index
encoded_text = np.array([encoder[char] for char in text])  # corpus as an integer array

In [None]:
def one_hot_encoder(encoded_text, num_uni_chars):
    """One-hot encode an integer array of character indices.

    Args:
        encoded_text: ndarray of integer character indices (any shape).
        num_uni_chars: size of the one-hot axis, i.e. the number of unique
            characters (len(set(text))).

    Returns:
        float32 ndarray of shape ``encoded_text.shape + (num_uni_chars,)``
        holding 1.0 at each character's index and 0.0 elsewhere.
    """
    flat_indices = encoded_text.flatten()
    row_ids = np.arange(encoded_text.size)

    # Work on a 2-D (n_chars, vocab) float32 matrix first ...
    one_hot = np.zeros((encoded_text.size, num_uni_chars), dtype=np.float32)
    # ... set one cell per row through integer-array indexing ...
    one_hot[row_ids, flat_indices] = 1.0

    # ... then restore the leading dimensions of the input.
    return one_hot.reshape((*encoded_text.shape, num_uni_chars))

In [None]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    """Yield (x, y) batches cut from an integer-encoded text.

    The text is truncated to a whole number of batches, reshaped into
    ``samp_per_batch`` rows, and served ``seq_len`` columns at a time.

    Args:
        encoded_text: 1-D ndarray with the complete encoded text.
        samp_per_batch: number of samples (rows) per batch.
        seq_len: length of each character sequence (columns per batch).

    Yields:
        x: array of shape (samp_per_batch, seq_len) with the input characters.
        y: array of the same shape holding x shifted left by one character,
           i.e. the next-character targets.
    """
    char_per_batch = samp_per_batch * seq_len  # characters consumed by one batch
    num_batches_avail = len(encoded_text) // char_per_batch  # full batches available

    # Drop the tail that does not fill a whole batch, then lay the text out as
    # one long row per sample.
    encoded_text = encoded_text[:num_batches_avail * char_per_batch]
    encoded_text = encoded_text.reshape((samp_per_batch, -1))
    row_len = encoded_text.shape[1]

    for n in range(0, row_len, seq_len):
        x = encoded_text[:, n:n + seq_len]
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target of each row is the character right after the window.
        # An explicit bounds check replaces the former bare ``except:``, which
        # would also have hidden unrelated errors (including KeyboardInterrupt).
        next_col = n + seq_len
        if next_col < row_len:
            y[:, -1] = encoded_text[:, next_col]
        else:
            # Final window: no following column exists, so wrap around to the
            # first column of the same rows (behaviour kept from the original).
            y[:, -1] = encoded_text[:, 0]

        yield x, y

In [None]:
# Take the first 20 encoded characters as a toy input for the batch generator.
sample = encoded_text[:20]
sample

array([24, 82, 25, 38,  6, 62, 25, 55, 68, 17, 39, 17, 17,  9, 17, 55, 80,
       54,  7, 25])

In [None]:
# 20 characters with 2 samples per batch -> 2 rows of 10, served 5 columns at a time.
batch_generator = generate_batches(sample,samp_per_batch=2,seq_len=5)
x,y = next(batch_generator)  # pull the first (x, y) batch from the generator

In [None]:
# Input batch: one row per sample, seq_len encoded characters per row.
x

array([[24, 82, 25, 38,  6],
       [39, 17, 17,  9, 17]])

In [None]:
y  # the same sequences as x, shifted left by one character (next-character targets)

array([[82, 25, 38,  6, 62],
       [17, 17,  9, 17, 55]])

In [None]:
# Check that a CUDA device is visible; the model below is created with use_gpu=True.
torch.cuda.is_available()

True

In [None]:
class CharModel(nn.Module):
    """Character-level LSTM language model.

    A stacked LSTM reads one-hot encoded characters; dropout and a linear
    layer turn every LSTM output step into one score per vocabulary character.

    Args:
        all_chars: iterable of the unique characters of the corpus. Its
            iteration order defines the model's own ``decoder`` / ``encoder``
            mappings, so it must enumerate characters in the same order as the
            mapping used to encode the training data.
        num_hidden: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        drop_prob: dropout probability, used both between LSTM layers and on
            the LSTM output.
        use_gpu: when True, ``hidden_state`` allocates its tensors on CUDA.
    """

    def __init__(self, all_chars, num_hidden=256, num_layers=4,drop_prob=0.5,use_gpu=False):
        super().__init__()

        # SET UP ATTRIBUTES
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))  # index -> character
        # Invert the model's OWN decoder. The previous version read a global
        # named ``decoder`` and therefore only worked when the notebook
        # happened to define a matching one.
        self.encoder = {char: ind for ind, char in self.decoder.items()}  # character -> index

        vocab_size = len(self.all_chars)
        # batch_first=True: inputs/outputs are (batch, seq, feature), the same
        # layout the batch generator produces.
        self.lstm = nn.LSTM(vocab_size, num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc_linear = nn.Linear(num_hidden, vocab_size)

    def forward(self, x, hidden):
        """Run the network on a batch of one-hot sequences.

        Args:
            x: tensor fed to the LSTM (batch-first layout).
            hidden: (h, c) tuple as returned by ``hidden_state`` or by a
                previous call.

        Returns:
            (scores, hidden): ``scores`` has one row per (sample, time step)
            pair and one column per character; ``hidden`` is the updated
            LSTM state tuple.
        """
        lstm_output, hidden = self.lstm(x, hidden)
        drop_output = self.dropout(lstm_output)
        # Collapse batch and time into one axis so the linear layer scores
        # every time step independently.
        drop_output = drop_output.contiguous().view(-1, self.num_hidden)
        final_out = self.fc_linear(drop_output)

        return final_out, hidden

    def hidden_state(self, batch_size):
        """Return a zero-initialised (h, c) tuple for ``batch_size`` sequences.

        Each tensor has shape (num_layers, batch_size, num_hidden) and is
        created on CUDA when ``use_gpu`` is set, otherwise on the default device.
        """
        device = "cuda" if self.use_gpu else None
        shape = (self.num_layers, batch_size, self.num_hidden)
        return (torch.zeros(shape, device=device),
                torch.zeros(shape, device=device))
        

In [None]:
# Instantiate the network over the corpus vocabulary.
# use_gpu=True makes hidden_state() allocate on CUDA, so a GPU runtime is required.
model = CharModel(
    all_chars=all_characters,
    num_hidden=332,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=True,
)

In [None]:
# Element count of every parameter tensor; the rule of thumb followed here is
# to keep the total close to len(text).
total_param = [int(p.numel()) for p in model.parameters()]

In [None]:
# Total number of parameters in the model.
sum(total_param)

2365260

In [None]:
# Corpus length in characters, for comparison with the parameter count.
len(text)

2277900

In [None]:
# Adam over all model parameters; cross-entropy compares the per-character
# scores from the final linear layer with the integer-encoded targets.
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
# Chronological split: the leading part of the encoded corpus is used for
# training, the remaining tail is held out for validation.
train_percent = 0.9
train_ind = int(train_percent * len(encoded_text))  # first index of the validation tail

train_data, val_data = encoded_text[:train_ind], encoded_text[train_ind:]

In [None]:
epochs = 50
batch_size = 128  # sequences per batch
seq_len = 100  # characters per sequence

# Global step counter across all epochs; the training loop validates every 25 steps.
tracker = 0

# Size of the one-hot axis: largest character index + 1.
# ndarray.max() is vectorised; the built-in max() walked the ~2.3M-element
# array one Python object at a time.
num_char = int(encoded_text.max()) + 1

In [None]:
#training section

model.train()

if model.use_gpu:
    model.cuda()

for i in range(epochs):

    # Start every epoch from a zeroed hidden state.
    hidden = model.hidden_state(batch_size)

    for x, y in generate_batches(train_data, batch_size, seq_len):

        tracker += 1

        x = one_hot_encoder(x, num_char)  # one-hot encoded inputs

        inputs = torch.from_numpy(x)  # arrays converted to tensors
        targets = torch.from_numpy(y)

        if model.use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        # Keep the hidden values flowing from batch to batch, but detach them
        # so backpropagation stops at the batch boundary.
        hidden = tuple(state.detach() for state in hidden)

        model.zero_grad()

        # Call the module itself (not .forward) so registered hooks also run.
        lstm_output, hidden = model(inputs, hidden)
        loss = criterion(lstm_output, targets.reshape(-1).long())

        loss.backward()

        # Clip the gradient norm so the gradients don't explode.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

        optimizer.step()

        if tracker % 25 == 0:

            #validation section

            val_hidden = model.hidden_state(batch_size)
            val_losses = []

            model.eval()

            # No gradients are needed for evaluation; skipping the autograd
            # graph saves memory and time.
            with torch.no_grad():
                for val_x, val_y in generate_batches(val_data, batch_size, seq_len):

                    val_x = one_hot_encoder(val_x, num_char)  # one-hot encoded inputs

                    val_inputs = torch.from_numpy(val_x)
                    val_targets = torch.from_numpy(val_y)

                    if model.use_gpu:
                        val_inputs = val_inputs.cuda()
                        val_targets = val_targets.cuda()

                    lstm_output, val_hidden = model(val_inputs, val_hidden)
                    val_loss = criterion(lstm_output, val_targets.reshape(-1).long())

                    val_losses.append(val_loss.item())

            model.train()  # back to training mode (re-enables dropout)

            # Report the mean over ALL validation batches. Previously only the
            # loss of the last batch was printed and val_losses went unused;
            # an empty validation set also raised NameError on val_loss.
            mean_val_loss = float(np.mean(val_losses)) if val_losses else float("nan")
            print(f"Epoch: {i} Step: {tracker} Val Loss: {mean_val_loss}")

Epoch: 0 Step: 25 Val Loss: 3.112819194793701
Epoch: 0 Step: 50 Val Loss: 3.10469126701355
Epoch: 0 Step: 75 Val Loss: 3.1051485538482666
Epoch: 0 Step: 100 Val Loss: 3.1035537719726562
Epoch: 0 Step: 125 Val Loss: 3.1016550064086914
Epoch: 0 Step: 150 Val Loss: 3.103311777114868
Epoch: 1 Step: 175 Val Loss: 3.0972135066986084
Epoch: 1 Step: 200 Val Loss: 3.0428037643432617
Epoch: 1 Step: 225 Val Loss: 2.831845760345459
Epoch: 1 Step: 250 Val Loss: 2.6896631717681885
Epoch: 1 Step: 275 Val Loss: 2.578984260559082
Epoch: 1 Step: 300 Val Loss: 2.4870636463165283
Epoch: 2 Step: 325 Val Loss: 2.4149580001831055
Epoch: 2 Step: 350 Val Loss: 2.3701016902923584
Epoch: 2 Step: 375 Val Loss: 2.3276145458221436
Epoch: 2 Step: 400 Val Loss: 2.297665596008301
Epoch: 2 Step: 425 Val Loss: 2.257132053375244
Epoch: 2 Step: 450 Val Loss: 2.2191321849823
Epoch: 2 Step: 475 Val Loss: 2.183788537979126
Epoch: 3 Step: 500 Val Loss: 2.150916814804077
Epoch: 3 Step: 525 Val Loss: 2.1153910160064697
Epoch: 3

In [None]:
def predict_next_char(model, char, hidden=None, k=1):
    """Sample the character that follows ``char`` from the model's top-k predictions.

    Args:
        model: trained model exposing ``encoder``, ``decoder``, ``all_chars``,
            ``use_gpu`` and ``hidden_state``.
        char: single input character; must be a key of ``model.encoder``.
        hidden: (h, c) state tuple from a previous step, or None to start from
            a fresh zero state for a batch of one.
        k: number of most likely characters to sample from (k=1 is greedy).

    Returns:
        (next_char, hidden): the sampled character (decoded, not its index)
        and the updated hidden state.
    """
    if hidden is None:
        # The default used to crash when iterated; start from a zero state.
        hidden = model.hidden_state(1)

    # Encode the character as a (1, 1, vocab) one-hot batch.
    encoded_char = np.array([[model.encoder[char]]])
    one_hot = one_hot_encoder(encoded_char, len(model.all_chars))
    inputs = torch.from_numpy(one_hot)

    if model.use_gpu:
        inputs = inputs.cuda()

    # Detach the incoming state from any earlier computation graph.
    hidden = tuple(state.detach() for state in hidden)

    # Pure inference: no autograd graph needed.
    with torch.no_grad():
        lstm_out, hidden = model(inputs, hidden)
        probs = F.softmax(lstm_out, dim=1)  # scores -> probabilities

    if model.use_gpu:
        probs = probs.cpu()  # move back to CPU to use with numpy

    top_probs, top_indices = probs.topk(k)

    # Flatten to 1-D. squeeze() turned the k=1 case into a 0-d array, which
    # np.random.choice interprets as a population SIZE rather than a candidate.
    top_indices = top_indices.numpy().reshape(-1)
    top_probs = top_probs.numpy().reshape(-1).astype(np.float64)
    top_probs = top_probs / top_probs.sum()  # renormalise over the k candidates

    chosen = np.random.choice(top_indices, p=top_probs)

    return model.decoder[int(chosen)], hidden

In [None]:
def generate_text(model, size, seed='Welcome', k=1):
    """Generate text by repeatedly sampling the next character from the model.

    The model is moved to GPU or CPU according to ``model.use_gpu`` and is
    left in eval mode.

    Args:
        model: trained model (see ``predict_next_char`` for what it must expose).
        size: number of sampling steps performed after the seed is consumed.
        seed: non-empty string used to prime the hidden state; every character
            must be known to the model's encoder.
        k: sample each character from the k most likely candidates.

    Returns:
        The seed followed by ``size + 1`` generated characters (one predicted
        right after the seed, then ``size`` more).

    Raises:
        ValueError: if ``seed`` is empty (there would be nothing to predict from).
    """
    if not seed:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError("seed must contain at least one character")

    if model.use_gpu:
        model.cuda()
    else:
        model.cpu()

    model.eval()

    output_chars = list(seed)  # the output starts with the seed itself

    # Initial hidden state for a batch of one sequence.
    hidden = model.hidden_state(1)

    # Sampling needs no gradients.
    with torch.no_grad():
        # Feed the seed one character at a time to warm up the hidden state;
        # only the prediction made after the LAST seed character is kept.
        for seed_char in seed:
            char, hidden = predict_next_char(model, seed_char, hidden, k=k)

        output_chars.append(char)

        # Each step predicts from the most recently generated character.
        for _ in range(size):
            char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
            output_chars.append(char)

    return ''.join(output_chars)

In [None]:
# Generate a speech, save it, and read the saved file back for display.
# - One path variable is used for both write and read (the read used to
#   hard-code /content/speech.txt while the write used a relative path).
# - The file is written as UTF-8 explicitly: the output contains curly quotes,
#   and the platform default encoding may not match the utf8 used to read it.
# - The result is kept in its own variable instead of overwriting `text`,
#   which holds the training corpus that earlier cells depend on.
speech_path = "speech.txt"
generated_speech = generate_text(model, 1000, seed='Ohio', k=3)

with open(speech_path, "w", encoding="utf8") as speech_file:
    speech_file.write(generated_speech)

with open(speech_path, "r", encoding="utf8") as speech_file:
    saved_speech = speech_file.read()

saved_speech

'Ohio and I won our stuff, this was a great start. Thank you. Thank you. I said, “Washington. We are going to be the program. They wouldn’t have taken a great president. I said, “What’s too building a lot of things are standing up to tell me that. That’s a big person. It was truck to send our care of our children with the party of their country. We are trying to be the strongest than anybody that they’re dealing that we’re doing and we have the best is a sacrifice that we have to say, “What a group and any president. I don’t know, they don’t have to be saying. It’s the way, they hope you’re going to be a big deal. That’s what the party of this power of the second children. What are you had to do it about a good, and thank you, Mr. President. I said, “I want to thank this state of the women.:  It’s a lot of could go together where I was a long time, because I didn’t have to be a lot of people who stand in our country.:  They want to say, “That made the state of America with the world an