## Load the Dataset

In [15]:
import pandas as pd

# Path for MAC OS
t_data = pd.read_csv(r'data/trump_insult_tweets_2014_to_2021.csv',index_col=0)
t_data.date = pd.to_datetime(t_data.date)
t_data.head()

Unnamed: 0,date,target,insult,tweet
1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ..."
2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ..."
3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...
4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...
5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily..."


In [16]:
t_data.isnull().sum() # checking for null values

date      0
target    2
insult    0
tweet     0
dtype: int64

## Data Cleaning

In [17]:
t_data.dropna(how='any',axis=0,inplace=True) # dropping 2 rows which has missing values

In [18]:
t_data.dtypes # checking data types

date      datetime64[ns]
target            object
insult            object
tweet             object
dtype: object

In [19]:
t_data.shape

(10358, 4)

## Splitting the Dataset

In [20]:
import sklearn
from sklearn.model_selection import train_test_split

# Split the dataset into 60% - training, 20% - validation, and 20% - testing
train_data, test_data = train_test_split(t_data, test_size = 0.2, random_state = 42)

train_data, valid_data = train_test_split(train_data, test_size = 0.25, random_state = 42) 

In [21]:
print(f'Number of total examples: {len(t_data)}')
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of total examples: 10358
Number of training examples: 6214
Number of validation examples: 2072
Number of testing examples: 2072


In [23]:
train_data.to_csv(r'data/train.csv', index = False)
test_data.to_csv(r'data/test.csv', index = False)
valid_data.to_csv(r'data/valid.csv', index = False)

In [24]:
train_data.head()

Unnamed: 0,date,target,insult,tweet
7471,2019-12-12,msnbc,Fake News,"It’s great to have a wonderful subject, Presid..."
1453,2016-05-28,the-media,biased,Don't believe the biased and phony media quoti...
1174,2016-03-18,megyn-kelly,sick,Everybody should boycott the @megynkelly show....
8040,2020-03-02,michael-bloomberg,bad debate performances,“Ever since (Mini Mike) Bloomberg’s bad debate...
1307,2016-05-07,elizabeth-warren,"All talk, no action -- maybe her Native Americ...",Goofy Elizabeth Warren is weak and ineffective...


In [25]:
import torch
from torchtext.legacy import data
import spacy 
from typing import List

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
# LABEL = data.LabelField(dtype = torch.float)

# Load model to return language object, note that en is deprecated
nlp = spacy.load('en_core_web_sm')

# Calling nlp on tweet texts to return a processed doc for each
# train_data['tokenized_tweet'] = [nlp(tweet) for tweet in train_data.tweet]
# train_data.sample(3)

# Create a tokenizer function
def tokenizer_en(text: str) -> List[str]:
    return [tok.text for tok in nlp.tokenizer(text)]

DATE = data.Field()
TARGET = data.Field()
INSULT = data.Field()
TWEET = data.Field(
  tokenize    = tokenizer_en,
  lower       = True,
  batch_first = True,
  init_token  = '<bos>',
  eos_token   = '<eos>'
)

In [25]:
# train_data.drop('doc', axis=1, inplace=True)
# train_data.head(3)

In [26]:
fields = {'date': ('d', DATE), 'target': ('ta', TARGET), 'insult': ('i', INSULT), 'tweet': ('tw', TWEET)}

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'data',
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = False
)

print(train_data[0]. __dict__.keys())
print(train_data[0]. __dict__.values())
print(vars(train_data.examples[0]))

dict_keys(['d', 'ta', 'i', 'tw'])
dict_values([['2019-12-12'], ['msnbc'], ['Fake', 'News'], ['it', '’s', 'great', 'to', 'have', 'a', 'wonderful', 'subject', ',', 'president', 'trump', '.', 'fake', 'news', 'like', 'cnn', '&', 'msnbc', 'are', 'dying', '.', 'if', 'they', 'treated', 'me', 'fairly', ',', 'they', 'would', 'do', 'well', '.', 'have', 'zero', 'credibility', '!', 'https://t.co/yt8n8dgsco']])
{'d': ['2019-12-12'], 'ta': ['msnbc'], 'i': ['Fake', 'News'], 'tw': ['it', '’s', 'great', 'to', 'have', 'a', 'wonderful', 'subject', ',', 'president', 'trump', '.', 'fake', 'news', 'like', 'cnn', '&', 'msnbc', 'are', 'dying', '.', 'if', 'they', 'treated', 'me', 'fairly', ',', 'they', 'would', 'do', 'well', '.', 'have', 'zero', 'credibility', '!', 'https://t.co/yt8n8dgsco']}


## Building a Vocab

In [28]:
MAX_VOCAB_SIZE = 25_000

TWEET.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
TARGET.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

print(f"Unique tokens in TWEET vocabulary: {len(TWEET.vocab)}") # addition include <unk> token and <pad> token
print(f"Unique tokens in TARGET vocabulary: {len(TARGET.vocab)}")

Unique tokens in TWEET vocabulary: 9125
Unique tokens in TARGET vocabulary: 731


In [29]:
print(TWEET.vocab.freqs.most_common(20))

[('the', 11136), (',', 10934), ('.', 9786), ('!', 5892), ('and', 5162), ('to', 4858), ('a', 4274), ('is', 4221), ('of', 3803), ('in', 2901), ('for', 2193), ('that', 2178), ('i', 2078), ('they', 2070), ('are', 1939), ('it', 1865), ('on', 1823), ('he', 1589), ('&', 1567), ('was', 1460)]


In [30]:
print(TWEET.vocab.itos[:10])

['<unk>', '<pad>', '<bos>', '<eos>', 'the', ',', '.', '!', 'and', 'to']


In [31]:
print(TARGET.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x14f5bf850>>, {'<unk>': 0, '<pad>': 1, 'the-media': 2, 'democrats': 3, 'hillary-clinton': 4, 'trump-russia': 5, 'joe-biden': 6, 'the-new-york-times': 7, 'cnn': 8, 'impeachment-inquiry': 9, 'adam-schiff': 10, 'nancy-pelosi': 11, '2020-election': 12, 'michael-bloomberg': 13, 'washington-post': 14, 'james-comey': 15, 'jeb-bush': 16, 'ted-cruz': 17, 'mueller-team': 18, 'marco-rubio': 19, 'elizabeth-warren': 20, 'fox-news': 21, 'chuck-schumer': 22, 'msnbc': 23, 'obamacare': 24, 'the-fed': 25, 'bernie-sanders': 26, 'barack-obama': 27, 'nbc-news': 28, 'mail-in-voting': 29, 'trump-report': 30, 'john-bolton': 31, 'karl-rove': 32, 'megyn-kelly': 33, 'michael-cohen': 34, 'chuck-todd': 35, 'mitt-romney': 36, 'morning-joe': 37, 'richard-blumenthal': 38, 'andrew-cuomo': 39, 'robert-mueller': 40, 'chris-cuomo': 41, 'fbi': 42, 'twitter': 43, 'brian-kemp': 44, 'andrew-mccabe': 45, 'john-mccain': 46, 'jeff-fla

## Constructing the Iterator

In [32]:
BATCH_SIZE = 64

# pytorch boilerplate that determines whether a GPU is present or not,
# this determines whether the dataset or model can to moved to a GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

## Building the Model

### Simple RNN

In [33]:
import torch.nn as nn

# Additional info on word embeddings
# https://monkeylearn.com/blog/word-embeddings-transform-text-numbers/

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):  # three layers are an embedding layer, RNN, and a linear layer 
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):

        #text = [sent len, batch size]
        
        embedded = self.embedding(text)
        
        #embedded = [sent len, batch size, emb dim]
        
        output, hidden = self.rnn(embedded)
        
        #output = [sent len, batch size, hid dim]
        #hidden = [1, batch size, hid dim]
        
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))