In [64]:
from collections import Counter
import torch
import torch.nn as nn
from IPython.core.debugger import set_trace
from argparse import Namespace
import csv
import re
import numpy as np

In [65]:
with open('trumpTweets.csv', encoding='utf-8') as openFile:
    entireFileContent = list()
    file = csv.reader(openFile)
    for i,line in enumerate(file):
        if i == 0:
            continue
        
        #set_trace()       
        tweet = line[-1]
        #set_trace()
        tweet = re.sub(" +"," ", tweet)
        tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet, flags=re.MULTILINE)
        entireFileContent.append(tweet)

with open("Train.txt", "w") as trainer:
    for tweet in entireFileContent:
        trainer.write(tweet)
        trainer.write(".")
    

In [66]:
flags = Namespace(train_file='Train.txt',
                  seq_size=10,
                  batch_size=16,
                  embedding_size=64,
                  lstm_size=64,
                  gradients_norm=5,
                  initial_words=['I', 'am'],
                  predict_top_k=5,
                  checkpoint_path='checkpoint'
                 )

In [67]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r') as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text

In [68]:
def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [69]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size, lstm_size,batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)
    
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)
        return logits, state
    
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),torch.zeros(1, batch_size, self.lstm_size))

In [70]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    return criterion, optimizer

In [71]:
def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file, flags.batch_size, flags.seq_size)
    net = RNNModule(n_vocab, flags.seq_size, flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)
    iteration = 0
    for e in range(50):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)
        
        # Transfer data to GPU
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1
            #set_trace()
            # Tell it we are in training mode
            net.train()
            # Reset all gradients
            optimizer.zero_grad()
            # Transfer data to GPU
            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)
            
            logits, (state_h, state_c) = net.forward(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()

            # Perform back-propagation
            loss.backward()
            #gradient clipping
            _ = torch.nn.utils.clip_grad_norm_(net.parameters(), flags.gradients_norm)

            # Update the network's parameters
            optimizer.step()
            
            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, 200),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                predict(device, net, flags.initial_words, n_vocab, vocab_to_int, int_to_vocab, top_k=5)
                torch.save(net.state_dict(),'model-{}.pth'.format(iteration))

In [72]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k):
    net.eval()
    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))
    
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])
    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words))
    

In [73]:
main()

Vocabulary size 13301
Epoch: 0/200 Iteration: 100 Loss: 7.802257537841797
Epoch: 1/200 Iteration: 200 Loss: 6.63480806350708
Epoch: 2/200 Iteration: 300 Loss: 5.946083068847656
Epoch: 3/200 Iteration: 400 Loss: 4.878835678100586
Epoch: 4/200 Iteration: 500 Loss: 4.193507194519043
Epoch: 5/200 Iteration: 600 Loss: 3.5952210426330566
Epoch: 6/200 Iteration: 700 Loss: 3.365565061569214
Epoch: 7/200 Iteration: 800 Loss: 3.1843464374542236
Epoch: 8/200 Iteration: 900 Loss: 2.9766852855682373
Epoch: 9/200 Iteration: 1000 Loss: 2.621324062347412
I am getting stronger .RT @RNCResearch: Sen. Cruz On .Great The CBS are in New areas being… .The Dow does is doing really by one $bill… happen and their “neighborhood.” not. Thank you @LouDobbs talking for his “red line” Co… unstable WHISTLEBLOWER REPORT? .“Comey and thousands, to the @WhiteHouse with our farmers across the most disgraceful legal counsel of Fentanyl kills the Fed problem. can beat him, and more prosperity. &amp; lead the whistleblower

Epoch: 40/200 Iteration: 4100 Loss: 0.45679596066474915
Epoch: 41/200 Iteration: 4200 Loss: 0.37538471817970276
Epoch: 42/200 Iteration: 4300 Loss: 0.3943474292755127
Epoch: 43/200 Iteration: 4400 Loss: 0.4358651041984558
Epoch: 44/200 Iteration: 4500 Loss: 0.3703819215297699
Epoch: 45/200 Iteration: 4600 Loss: 0.4442828297615051
Epoch: 46/200 Iteration: 4700 Loss: 0.38362377882003784
Epoch: 47/200 Iteration: 4800 Loss: 0.4935443103313446
Epoch: 48/200 Iteration: 4900 Loss: 0.4776483178138733
Epoch: 49/200 Iteration: 5000 Loss: 0.38417869806289673
I am getting stronger .RT @RNCResearch: Sen. Cruz On .Great The CBS are in New areas being… .The Dow does is doing really by one $bill… happen and their “neighborhood.” not. Thank you @LouDobbs talking for his “red line” Co… unstable WHISTLEBLOWER REPORT? .“Comey and thousands, to the @WhiteHouse with our farmers across the most disgraceful legal counsel of Fentanyl kills the Fed problem. can beat him, and more prosperity. &amp; lead the whis

In [100]:
words=["CIA"]
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(flags.train_file, flags.batch_size, flags.seq_size)
model=RNNModule(n_vocab, 10, flags.embedding_size, flags.lstm_size)
model.load_state_dict(torch.load('model-5000.pth'))
predict(device, model, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5)

Vocabulary size 13301
CIA acknowledge! for Eddie else, I Biden trust Mr. making by possible the “create” this is an anonymous Don’t for @realDonaldTrump &amp; themselves! entire Democrats, @RepMattGaetz: I’m is a Wack big inquisition The New Mexico tonight, for Republicans have an absolute Market gains in his BIG results! 🇺🇸 @senatemajldr: for the Obama FBI/DOJ, in other Media is working as they needed at their request, .Congratulations I hope incredible job,” was Farmers in lowering prescription drug pr… If it did not do this at 50%. .....the The #1 has no quid p….RT week. @POTUS from my new and the… .A lot It’s almost
