In [1]:
from __future__ import print_function
import pdb
import math
from math import ceil
import numpy as np
import sys
import pandas as pd
import re

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd

import helpers

In [2]:
VOCAB_SIZE = 5000
MAX_SEQ_LEN = 20
START_LETTER = 0
BATCH_SIZE = 32
MLE_TRAIN_EPOCHS = 100
ADV_TRAIN_EPOCHS = 50
POS_NEG_SAMPLES = 5000

GEN_EMBEDDING_DIM = 32
GEN_HIDDEN_DIM = 32
DIS_EMBEDDING_DIM = 64
DIS_HIDDEN_DIM = 64

data_path = './donaldtrump.csv'

In [3]:
df = pd.DataFrame.from_csv(data_path)
df

Unnamed: 0_level_0,Time,Tweet_Text,Type,Media_Type,Hashtags,Tweet_Id,Tweet_Url,twt_favourites_IS_THIS_LIKE_QUESTION_MARK,Retweets,Unnamed: 10,Unnamed: 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-11-16,15:26:37,Today we express our deepest gratitude to all ...,text,photo,ThankAVet,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,127213,41112,,
2011-11-16,13:33:35,Busy day planned in New York. Will soon be mak...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,141527,28654,,
2011-11-16,11:14:20,Love the fact that the small groups of protest...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/797...,183729,50039,,
2011-11-16,2:19:44,Just had a very open and successful presidenti...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/796...,214001,67010,,
2011-11-16,2:10:46,A fantastic day in D.C. Met with President Oba...,text,,,7.970000e+17,https://twitter.com/realDonaldTrump/status/796...,178499,36688,,
2010-11-16,19:31:27,Happy 241st birthday to the U.S. Marine Corps!...,text,photo,,7.970000e+17,https://twitter.com/realDonaldTrump/status/796...,159176,44655,,
2009-11-16,11:36:58,Such a beautiful and important evening! The fo...,text,,,7.960000e+17,https://twitter.com/realDonaldTrump/status/796...,627615,225164,,
2009-11-16,2:48:27,Watching the returns at 9:45pm.\n#ElectionNigh...,text,photo,ElectionNight;MAGA,7.960000e+17,https://twitter.com/realDonaldTrump/status/796...,185160,45492,,
2009-11-16,1:35:15,RT @IvankaTrump: Such a surreal moment to vote...,text,,,7.960000e+17,https://twitter.com/realDonaldTrump/status/796...,99809,17169,,
2008-11-16,23:20:39,RT @EricTrump: Join my family in this incredib...,text,,MakeAmericaGreatAgain;VOTE,7.960000e+17,https://twitter.com/realDonaldTrump/status/796...,63868,19710,,


In [4]:
tweet_tokens = []

SOS_token = 0
EOS_token = 1
UNK_token = 2
index2word = {0: '<SOS>', 1: '<EOS>', 2: '<UNK>'}
word2index = {'<UNK>': 2}
word2count = {'<UNK>': 0}
n_words = 3

for s in df.Tweet_Text:  
    s = s.lower()
    tokens = helpers.preprocess(s)
    
    tweet_tokens.append(SOS_token)
    
    for word in tokens:
        word = re.sub(r'<[^>]+>', '<HTML>', word)
        word = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '<URL>', word)
        word = re.sub(r'(?:@[\w_]+)', '<@-mentions>', word)
        if word not in word2index:
            if (n_words<VOCAB_SIZE):
                word2count[word] = 1
                word2index[word] = n_words
                index2word[n_words] = word
                n_words += 1
            else:
                word = '<UNK>'
                word2count[word] += 1
        else:
            word2count[word] += 1
            
        tweet_tokens.append(word2index[word])
        
    tweet_tokens.append(EOS_token)
test_tokens = tweet_tokens[MAX_SEQ_LEN*POS_NEG_SAMPLES+1:]
tweet_tokens = tweet_tokens[:MAX_SEQ_LEN*POS_NEG_SAMPLES]
oracle_samples = torch.LongTensor(tweet_tokens).view(POS_NEG_SAMPLES,MAX_SEQ_LEN).type(torch.LongTensor)

In [5]:
class Generator(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, oracle_init=False):
        super(Generator, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim)
        self.gru2out = nn.Linear(hidden_dim, vocab_size)

        if oracle_init:
            for p in self.parameters():
                init.normal(p, 0, 1)

    def init_hidden(self, batch_size=1):
        h = autograd.Variable(torch.zeros(1, batch_size, self.hidden_dim))
        return h

    def forward(self, inp, hidden):
        # input dim                                             # batch_size
        emb = self.embeddings(inp)                              # batch_size x embedding_dim
        emb = emb.view(1, -1, self.embedding_dim)               # 1 x batch_size x embedding_dim
        out, hidden = self.gru(emb, hidden)                     # 1 x batch_size x hidden_dim (out)
        out = self.gru2out(out.view(-1, self.hidden_dim))       # batch_size x vocab_size
        out = F.log_softmax(out, dim=0)
        return out, hidden

    def sample(self, num_samples, start_letter=0):

        samples = torch.zeros(num_samples, self.max_seq_len).type(torch.LongTensor)

        h = self.init_hidden(num_samples)
        inp = autograd.Variable(torch.LongTensor([start_letter]*num_samples))

        for i in range(self.max_seq_len):
            out, h = self.forward(inp, h)               # out: num_samples x vocab_size
            out = torch.multinomial(torch.exp(out), 1)  # num_samples x 1 (sampling from each row)
            samples[:, i] = out.data

            inp = out.view(-1)

        return samples

    def batchNLLLoss(self, inp, target):
        loss_fn = nn.NLLLoss()
        batch_size, seq_len = inp.size()
        inp = inp.permute(1, 0)           # seq_len x batch_size
        target = target.permute(1, 0)     # seq_len x batch_size
        h = self.init_hidden(batch_size)

        loss = 0
        for i in range(seq_len):
            out, h = self.forward(inp[i], h)
            loss += loss_fn(out, target[i])

        return loss     # per batch

    def batchPGLoss(self, inp, target, reward):
        batch_size, seq_len = inp.size()
        inp = inp.permute(1, 0)          # seq_len x batch_size
        target = target.permute(1, 0)    # seq_len x batch_size
        h = self.init_hidden(batch_size)

        loss = 0
        for i in range(seq_len):
            out, h = self.forward(inp[i], h)
            for j in range(batch_size):
                loss += -out[j][target.data[i][j]]*reward[j]     # log(P(y_t|Y_1:Y_{t-1})) * Q

        return loss/batch_size

In [6]:
class Discriminator(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, max_seq_len, dropout=0.2):
        super(Discriminator, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.max_seq_len = max_seq_len

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, bidirectional=True, dropout=dropout)
        self.gru2hidden = nn.Linear(2*2*hidden_dim, hidden_dim)
        self.dropout_linear = nn.Dropout(p=dropout)
        self.hidden2out = nn.Linear(hidden_dim, 1)

    def init_hidden(self, batch_size):
        h = autograd.Variable(torch.zeros(2*2*1, batch_size, self.hidden_dim))
        return h

    def forward(self, input, hidden):
        # input dim                                                # batch_size x seq_len
        emb = self.embeddings(input)                               # batch_size x seq_len x embedding_dim
        emb = emb.permute(1, 0, 2)                                 # seq_len x batch_size x embedding_dim
        _, hidden = self.gru(emb, hidden)                          # 4 x batch_size x hidden_dim
        hidden = hidden.permute(1, 0, 2).contiguous()              # batch_size x 4 x hidden_dim
        out = self.gru2hidden(hidden.view(-1, 4*self.hidden_dim))  # batch_size x 4*hidden_dim
        out = F.tanh(out)
        out = self.dropout_linear(out)
        out = self.hidden2out(out)                                 # batch_size x 1
        out = F.sigmoid(out)
        return out

    def batchClassify(self, inp):
        h = self.init_hidden(inp.size()[0])
        out = self.forward(inp, h)
        return out.view(-1)

    def batchBCELoss(self, inp, target):
        loss_fn = nn.BCELoss()
        h = self.init_hidden(inp.size()[0])
        out = self.forward(inp, h)
        return loss_fn(out, target)

In [7]:
def train_generator_MLE(gen, gen_opt, oracle, real_data_samples, epochs):
    for epoch in range(epochs):
        print('epoch %d : ' % (epoch + 1), end='')
        sys.stdout.flush()
        total_loss = 0

        for i in range(0, POS_NEG_SAMPLES, BATCH_SIZE):
            inp, target = helpers.prepare_generator_batch(real_data_samples[i:i + BATCH_SIZE], start_letter=START_LETTER)
            gen_opt.zero_grad()
            loss = gen.batchNLLLoss(inp, target)
            loss.backward()
            gen_opt.step()

            total_loss += loss.data[0]

            if (i / BATCH_SIZE) % ceil(
                            ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / 10.) == 0:  # roughly every 10% of an epoch
                print('.', end='')
                sys.stdout.flush()
                
        total_loss = total_loss / ceil(POS_NEG_SAMPLES / float(BATCH_SIZE)) / MAX_SEQ_LEN
        oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, start_letter=START_LETTER)

        print(' average_train_NLL = %.4f, oracle_sample_NLL = %.4f' % (total_loss, oracle_loss))

In [8]:
def train_generator_PG(gen, gen_opt, oracle, dis, num_batches):
    for batch in range(num_batches):
        s = gen.sample(BATCH_SIZE*2)
        inp, target = helpers.prepare_generator_batch(s, start_letter=START_LETTER)
        rewards = dis.batchClassify(target)

        gen_opt.zero_grad()
        pg_loss = gen.batchPGLoss(inp, target, rewards)
        pg_loss.backward()
        gen_opt.step()
        
    oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, start_letter=START_LETTER)

    print(' oracle_sample_NLL = %.4f' % oracle_loss)

In [9]:
def train_discriminator(discriminator, dis_opt, real_data_samples, generator, oracle, d_steps, epochs):
    pos_val = oracle.sample(100)
    neg_val = generator.sample(100)
    val_inp, val_target = helpers.prepare_discriminator_data(pos_val, neg_val)

    for d_step in range(d_steps):
        s = helpers.batchwise_sample(generator, POS_NEG_SAMPLES, BATCH_SIZE)
        dis_inp, dis_target = helpers.prepare_discriminator_data(real_data_samples, s)
        for epoch in range(epochs):
            print('d-step %d epoch %d : ' % (d_step + 1, epoch + 1), end='')
            sys.stdout.flush()
            total_loss = 0
            total_acc = 0

            for i in range(0, 2 * POS_NEG_SAMPLES, BATCH_SIZE):
                inp, target = dis_inp[i:i + BATCH_SIZE], dis_target[i:i + BATCH_SIZE]
                dis_opt.zero_grad()
                out = discriminator.batchClassify(inp)
                loss_fn = nn.BCELoss()
                loss = loss_fn(out, target)
                loss.backward()
                dis_opt.step()

                total_loss += loss.data[0]
                total_acc += torch.sum((out>0.5)==(target>0.5)).data[0]

                if (i / BATCH_SIZE) % ceil(ceil(2 * POS_NEG_SAMPLES / float(
                        BATCH_SIZE)) / 10.) == 0:
                    print('.', end='')
                    sys.stdout.flush()

            total_loss /= ceil(2 * POS_NEG_SAMPLES / float(BATCH_SIZE))
            total_acc /= float(2 * POS_NEG_SAMPLES)

            val_pred = discriminator.batchClassify(val_inp)
            print(' average_loss = %.4f, train_acc = %.4f, val_acc = %.4f' % (
                total_loss, total_acc, torch.sum((val_pred>0.5)==(val_target>0.5)).data[0]/200.))

In [10]:
oracle = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, oracle_init=True)

gen = Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN)
dis = Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN)

print('Starting Generator MLE Training...')
gen_optimizer = optim.Adam(gen.parameters(), lr=1e-2)
train_generator_MLE(gen, gen_optimizer, oracle, oracle_samples, MLE_TRAIN_EPOCHS)

#print('\nStarting Discriminator Training...')
#dis_optimizer = optim.Adagrad(dis.parameters())
#train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 50, 3)
#
#print('\nStarting Adversarial Training...')
#oracle_loss = helpers.batchwise_oracle_nll(gen, oracle, POS_NEG_SAMPLES, BATCH_SIZE, MAX_SEQ_LEN, start_letter=START_LETTER)
#print('\nInitial Oracle Sample Loss : %.4f' % oracle_loss)
#
#for epoch in range(ADV_TRAIN_EPOCHS):
#    print('\n--------\nEPOCH %d\n--------' % (epoch+1))
#    
#    print('\nAdversarial Training Generator : ', end='')
#    sys.stdout.flush()
#    train_generator_PG(gen, gen_optimizer, oracle, dis, 1)
#
#    print('\nAdversarial Training Discriminator : ')
#    train_discriminator(dis, dis_optimizer, oracle_samples, gen, oracle, 5, 3)

Starting Generator MLE Training...
epoch 1 : .......... average_train_NLL = 2.8151, oracle_sample_NLL = 9.5380
epoch 2 : .......... average_train_NLL = 2.3287, oracle_sample_NLL = 9.5161
epoch 3 : .......... average_train_NLL = 2.0794, oracle_sample_NLL = 9.5248
epoch 4 : .......... average_train_NLL = 1.9295, oracle_sample_NLL = 9.5216
epoch 5 : .......... average_train_NLL = 1.8312, oracle_sample_NLL = 9.5259
epoch 6 : .......... average_train_NLL = 1.7589, oracle_sample_NLL = 9.5179
epoch 7 : .......... average_train_NLL = 1.7033, oracle_sample_NLL = 9.4839
epoch 8 : .......... average_train_NLL = 1.6596, oracle_sample_NLL = 9.5177
epoch 9 : .......... average_train_NLL = 1.6253, oracle_sample_NLL = 9.5333
epoch 10 : .......... average_train_NLL = 1.5966, oracle_sample_NLL = 9.5360
epoch 11 : .......... average_train_NLL = 1.5712, oracle_sample_NLL = 9.5460
epoch 12 : .......... average_train_NLL = 1.5500, oracle_sample_NLL = 9.4838
epoch 13 : .......... average_train_NLL = 1.5285, 

In [21]:
sample = gen.sample(1).view(-1,1)
words = [index2word[int(i)] for i in sample]
text = ""
for word in words:
    text += word
    text += " "
print(text)

disenfranchised dear opposed visit smoothly islamic based blue-collar therefore director est two-man strictly platforms brutal atlanta opo palazzo companies nafta 
