## Common Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\scott\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load the data

In [2]:
offensive = pd.read_pickle('pickle_files/offensive_formatted.pickle')
benign = pd.read_pickle('pickle_files/benign_formatted.pickle')

offensive = offensive.reset_index(drop=True)
offensive['Tweet ID'] = offensive['Tweet ID'].astype(str)

benign = benign.reset_index(drop=True)
benign['Tweet ID'] = benign['Tweet ID'].astype(str)

print(len(np.unique(offensive['Tweet ID']))) #for convenience
print(len(np.unique(benign['Tweet ID']))) #for convenience

839
5458


## Load Stopwords

In [3]:
stop = stopwords.words('english') #get stopwords from NLTK
keep = ['not'] #Waseem/Hovy did not use "not" as a stopword
stop = [word for word in stop if word not in keep] #Waseem/Hovy did not use "not" as a stopword

## Lowercase the data

In [4]:
offensive['Tweet_original'] = offensive.Tweet.copy() #keep a copy of the original tweet text
offensive['Tweet'] = offensive['Tweet'].str.lower() #lowercase the text 

benign['Tweet_original'] = benign.Tweet.copy() #keep a copy of the original tweet text
benign['Tweet'] = benign['Tweet'].str.lower() #lowercase the text 

print(len(np.unique(offensive['Tweet ID']))) #for convenience
print(len(np.unique(benign['Tweet ID']))) #for convenience

839
5458


## Remove stopwords

In [5]:
offensive['Tweet'] = offensive['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(offensive['Tweet ID']))) #for convenience

benign['Tweet'] = benign['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)])) #remove stopwords
print(len(np.unique(benign['Tweet ID']))) #for convenience

839
5458


## Remove punctuation, usernames, hashtags, URLs

In [6]:
def remove_punctuation(df):
    df['Tweet'] = df['Tweet'].fillna('')
    p = re.compile(r'[^\w\s]+')
    df['Tweet'] = [p.sub('', x) for x in df['Tweet'].tolist()] #remove the punctuation
    for i in df.index:
        #print(i)
        #all_data.loc[i,'Bio'] =re.sub('[^A-Za-z0-9]+',"",all_data.loc[i,'Bio'])
        df.loc[i,'Tweet'] =re.sub("@[A-Za-z0-9_/:().]+",  "", df.loc[i,'Tweet'])
        df.loc[i,'Tweet'] =re.sub("http[A-Za-z0-9_/:().]+",  "", df.loc[i,'Tweet'])
        df.loc[i,'Tweet'] =re.sub("#[A-Za-z0-9_/:().]+",  "", df.loc[i,'Tweet'])
    return(df)

offensive = remove_punctuation(offensive)
benign = remove_punctuation(benign)

## Get frequency of unigrams and bigrams per tweet

In [7]:
def term_count(df):
    unigrams = nltk.word_tokenize(' '.join(df['Tweet']))
    bigrams = ngrams(unigrams,2)
    unigrams_c = Counter(unigrams)
    bigrams_c = Counter(bigrams)

    term = []
    count = []
    for i in bigrams_c:
        term = np.append(term,' '.join(i))
        count = np.append(count,bigrams_c[i]/df.shape[0])
    for i in unigrams_c:
        term = np.append(term,i)
        count = np.append(count,unigrams_c[i]/df.shape[0])
    terms = pd.DataFrame({'term':term,'count':count})
    return(terms)
offensive_terms = term_count(offensive)
benign_terms = term_count(benign)

## Create table of top 20 unigrams/bigrams per dataset

In [8]:
offensive_terms = offensive_terms.sort_values('count', ascending=False).reset_index(drop=True)
benign_terms = benign_terms.sort_values('count', ascending=False).reset_index(drop=True)
terms = pd.DataFrame({'offensive_term':offensive_terms.term[0:20],
                      'offensive_value':offensive_terms['count'][0:20],
                      'benign_term':benign_terms.term[0:20],
                      'benign_value':benign_terms['count'][0:20]})
print(terms)

              benign_term  benign_value   offensive_term  offensive_value
0                sturgeon      0.852510         sturgeon         0.927294
1                  nicola      0.674240           nicola         0.574493
2         nicola sturgeon      0.633565  nicola sturgeon         0.538737
3                scotland      0.214181          fucking         0.103695
4                   boris      0.185599             face         0.098927
5                 johnson      0.154819             fuck         0.098927
6                scottish      0.140344             cunt         0.094160
7            independence      0.129901            would         0.089392
8                  brexit      0.122023            punch         0.077473
9           boris johnson      0.116709             like         0.076281
10                    snp      0.109930            needs         0.076281
11                    not      0.101502             want         0.072706
12                     uk      0.10131

In [9]:
terms.to_csv('csv_excel/terms.csv')

## Get ids for boris johnson related benign tweets

In [10]:
bj_ids = []
for t in ['boris','johnson','boris johnson']: #loop through each hatebase term
    try:
        bj_benign = benign[benign.Tweet.str.contains(t)]
        bj_ids = np.append(bj_ids,bj_benign['Tweet ID']) #add the index if the tweet contains a hatebase term
    except:
        pass
bj_ids = np.unique(bj_ids) #remove the duplicates
len(bj_ids)

1193

## remove boris johnson related tweets from benign dataset

In [11]:
benign = pd.read_pickle('pickle_files/benign_formatted.pickle')
benign['Tweet ID'] = benign['Tweet ID'].astype(str)
print(benign.shape)
benign = benign[~benign['Tweet ID'].isin(bj_ids)]
print(benign.shape)
benign.to_pickle('pickle_files/benign_minusbj_formatted.pickle')

(5458, 14)
(4265, 14)
