In [1]:
# Source: Davidson et al. (2017)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("./data/labeled_data.csv", index_col=0)
raw_tweets = df.tweet
raw_labels = df["class"].values

In [2]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


### Data Preprocessing 

In [4]:
from preprocess import preprocess

tweets = raw_tweets.map(preprocess)
print("Example of a raw tweet:\n{}".format(raw_tweets[68]))
print("\nIts cleaned version is:\n{}".format(preprocess(raw_tweets[68])))

Example of a raw tweet:
"@Almightywayne__: @JetsAndASwisher @Gook____ bitch fuck u http://t.co/pXmGA68NC1" maybe you'll get better. Just http://t.co/TPreVwfq0S

Its cleaned version is:
 ||Quotation_Mark|| MENTIONHERE : MENTIONHERE MENTIONHERE bitch fuck u URLHERE ||Quotation_Mark|| maybe you'll get better ||Period|| just URLHERE 


### Remove outliers

In [6]:
# Get cleaned tweets
df["clean_tweet"] = tweets

# Get their word count
df["word_count"] = df.clean_tweet.apply(lambda x : len(x.split()))

# See Hemker notebook for a fuller description of why this tweet is being processed this way
old_tweet = df.loc[df.word_count == df.word_count.max(),].tweet.values[0]
new_tweet = old_tweet[:old_tweet.find("\r")]
df.loc[df.word_count == df.word_count.max(), "tweet"] = new_tweet
df.loc[df.word_count == df.word_count.max(), "clean_tweet"] = preprocess(new_tweet)
df.loc[df.word_count == df.word_count.max(), "word_count"] = len(preprocess(new_tweet).split())

### Create lookup tables

In [10]:
from create_lookup_tables import create_lookup_tables

vocab_to_int, int_to_vocab = create_lookup_tables(tweets)

In [11]:
print("The size of the vocabulary is: {} tokens.".format(len(vocab_to_int)))
vocab = list(vocab_to_int.keys())
np.random.shuffle(vocab)
print("These are 10 randomly sample words in the vocabulary:\n{}".format(vocab[:10]))
del vocab

The size of the vocabulary is: 21134 tokens.
These are 10 randomly sample words in the vocabulary:
['mash', 'sloppy', 'race/event', 'mus', 'bash', '🍂', 'valet', 'gota', 'wesley', 'pigfucking']


###  Padding the Data

In [12]:
from padding import create_pad_fn, pad_tweets

MAX_LENGTH = df.word_count.max()
pad_tweets = create_pad_fn(MAX_LENGTH)
df["padded_tweets"] = df.clean_tweet.map(pad_tweets)
print(df.padded_tweets[10])

<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>  ||Quotation_Mark|| keeks is a bitch she curves everyone ||Quotation_Mark|| lol i walked into a conversation like this ||Period|| smh
