In [6]:
import pandas as pd

t_data = pd.read_csv('trump_insult_tweets_2014_to_2021.csv',index_col=0)
t_data.date = pd.to_datetime(t_data.date)
t_data.head()

Unnamed: 0,date,target,insult,tweet
1,2014-10-09,thomas-frieden,fool,"Can you believe this fool, Dr. Thomas Frieden ..."
2,2014-10-09,thomas-frieden,DOPE,"Can you believe this fool, Dr. Thomas Frieden ..."
3,2015-06-16,politicians,all talk and no action,Big time in U.S. today - MAKE AMERICA GREAT AG...
4,2015-06-24,ben-cardin,It's politicians like Cardin that have destroy...,Politician @SenatorCardin didn't like that I s...
5,2015-06-24,neil-young,total hypocrite,"For the nonbeliever, here is a photo of @Neily..."


In [7]:
t_data.isnull().sum() # checking for null values

date      0
target    2
insult    0
tweet     0
dtype: int64

## Data Cleaning

In [8]:
t_data.dropna(how='any',axis=0,inplace=True) # dropping 2 rows which has missing values

In [9]:
t_data.dtypes # checking data types

date      datetime64[ns]
target            object
insult            object
tweet             object
dtype: object

In [10]:
t_data.shape

(10358, 4)

In [16]:
import sklearn
from sklearn.model_selection import train_test_split

# Split the dataset into 60% - training, 20% - validation, and 20% - testing
train_data, test_data = train_test_split(t_data, test_size = 0.2, random_state = 42)

train_data, valid_data = train_test_split(train_data, test_size = 0.25, random_state = 42) 

In [19]:
print(f'Number of total examples: {len(t_data)}')
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of total examples: 10358
Number of training examples: 6214
Number of validation examples: 2072
Number of testing examples: 2072


In [18]:
train_data.to_csv('train.csv', index = False)
test_data.to_csv('test.csv', index = False)
valid_data.to_csv('valid.csv', index = False)

In [30]:
train_data.head()

Unnamed: 0,date,target,insult,tweet
7471,2019-12-12,msnbc,Fake News,"It’s great to have a wonderful subject, Presid..."
1453,2016-05-28,the-media,biased,Don't believe the biased and phony media quoti...
1174,2016-03-18,megyn-kelly,sick,Everybody should boycott the @megynkelly show....
8040,2020-03-02,michael-bloomberg,bad debate performances,“Ever since (Mini Mike) Bloomberg’s bad debate...
1307,2016-05-07,elizabeth-warren,"All talk, no action -- maybe her Native Americ...",Goofy Elizabeth Warren is weak and ineffective...


In [46]:
import torch
from torchtext.legacy import data
import spacy 

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# TEXT = data.Field(tokenize = 'spacy', tokenizer_language = 'en_core_web_sm')
# LABEL = data.LabelField(dtype = torch.float)

# Load model to return language object, note that en is deprecated
nlp = spacy.load('en_core_web_sm')

# Calling nlp on tweet texts to return a processed doc for each
# train_data['tokenized_tweet'] = [nlp(tweet) for tweet in train_data.tweet]
# train_data.sample(3)

# Create a tokenizer function
def tokenizer_en(text: str) -> List[str]:
    return [tok.text for tok in nlp.tokenizer(text)]


DATE = data.Field()
TARGET = data.Field()
INSULT = data.Field()
TWEET = data.Field(
  tokenize    = tokenizer_en,
  lower       = True,
  batch_first = True,
  init_token  = '<bos>',
  eos_token   = '<eos>'
)



NameError: name 'List' is not defined

In [43]:
train_data.drop('doc', axis=1, inplace=True)
train_data.head(3)

Unnamed: 0,date,target,insult,tweet
7471,2019-12-12,msnbc,Fake News,"It’s great to have a wonderful subject, Presid..."
1453,2016-05-28,the-media,biased,Don't believe the biased and phony media quoti...
1174,2016-03-18,megyn-kelly,sick,Everybody should boycott the @megynkelly show....


In [45]:
#fields = {'date': ('d', date), 'target': ('t', target), 'insult': ('i', insult), 'tweet': ('t', tweet)}
fields = {'date': ('d', DATE), 'target': ('t', TARGET), 'insult': ('i', INSULT), 'tweet': ('t', TWEET)}

train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'data',
                                        train = 'train.csv',
                                        validation = 'valid.csv',
                                        test = 'test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = False
)

print(train_data[0]. __dict__.keys())
print(train_data[0]. __dict__.values())

NameError: name 'DATE' is not defined

# Data Preprocessing 

In [18]:
import re

text_dict = dict()
for (tweet,date) in zip(t_data.tweet,t_data.date):
    text_dict[(tweet,date)] = text_dict.get((tweet,date),0) + 1
    
text_dict = list(text_dict.keys())
f_data = pd.DataFrame(text_dict,columns=['text','date'])

# Convert into lowercase
f_data.text =f_data.text.str.lower()

# Remove twitter handlers
f_data.text = f_data.text.apply(lambda x:re.sub('@[^\s]+','',x))

# Remove hashtags
f_data.text = f_data.text.apply(lambda x:re.sub(r'\B#\S+','',x))

# Remove URLs
f_data.text = f_data.text.apply(lambda x:re.sub(r"http\S+", "", x))

# Remove all single characters
f_data.text = f_data.text.apply(lambda x:re.sub(r'\s+[a-zA-Z]\s+', '', x))

# Substituting multiple spaces with single space
f_data.text = f_data.text.apply(lambda x:re.sub(r'\s+', ' ', x, flags=re.I))

# Remove all the special characters
f_data.text = f_data.text.apply(lambda x:' '.join(re.findall(r'\w+', x)))

f_data.head()

def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove punctuation
        text = re.sub('[^a-zA-Z0-9]', ' ', text)
        # remove multiple spaces
        text = re.sub(r' +', ' ', text)
        # remove newline
        text = re.sub(r'\n', ' ', text)
        cleaned_text.append(text)
    return cleaned_text

Unnamed: 0,text,date
0,can you believe this fool dr thomas frieden of...,2014-10-09
1,big time in u s today make america great again...,2015-06-16
2,politician didn t like thatsaid baltimore need...,2015-06-24
3,for the nonbeliever here isphoto of in my offi...,2015-06-24
4,song rockin in the free world was just one of ...,2015-06-24


In [19]:
len(f_data)

5680

# Split