In [1]:
# Import required packages
import numpy as np
import pandas as pd
import logging
import json
import string
import re
from pprint import pprint
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from hatesonar import Sonar

import warnings
warnings.filterwarnings('ignore')

## Part 1: Data Cleaning

In [2]:
# Define stop words: start from NLTK's English list, then add corpus-specific
# tokens that should be ignored in the cleaned tweets.
stop_words = set(stopwords.words('english'))
# Extend the list here. "re" is included because clean_text replaces punctuation
# with spaces, so contractions such as "we're" leave a stray "re" token behind.
stop_words.update([
    "governer", "re", "governor", "watch", "live", "facebook",
    "beshear", "hutchinson",
    # "lousiana" was the original (misspelled) entry; it is kept so tweets with the
    # same typo are still filtered, and the correct spelling is added so the word
    # is actually removed from correctly spelled tweets.
    "lousiana", "louisiana",
    "youtube", "mercer", "middlesex", "monmouth", "morris", "ocean", "passaic",
    "salem", "somerset", "sussex", "union", "warren",
    "update", "briefing", "press", "conference", "updates", "et", "news",
])

def common_member(a, b):
    '''
    Tell whether two iterables share at least one element.

    Both arguments are converted to sets, so their elements must be hashable.
    Returns True when the intersection is non-empty, False otherwise.
    '''
    shared = set(a).intersection(set(b))
    return len(shared) > 0

def clean_text(text):
    '''
    Normalise a raw tweet before tokenisation.

    The steps are applied in this order (the order matters, e.g. links and
    mentions must go before punctuation is stripped):
      1. make the text lowercase,
      2. remove a leading retweet marker ("rt" as a whole word at the very start),
      3. remove text in square brackets,
      4. remove links (http, https, ftp),
      5. remove HTML-like tags such as <u> and </u>,
      6. remove user mentions (@name),
      7. remove punctuation,
      8. replace newlines,
      9. remove numbers and words containing numbers.

    Every removed span is replaced by a single space, so the returned string can
    contain runs of whitespace; split and re-join it if compact spacing is needed.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned, lowercased text.
    '''

    # Raw strings throughout: the previous non-raw patterns relied on invalid
    # escape sequences such as '\[' and '\w'.
    re_url = r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
    text = text.lower()
    # \b keeps the match to the standalone marker, so a tweet that merely starts
    # with the letters "rt" (e.g. "RTs ...") no longer loses its first two letters.
    text = re.sub(r'^rt\b', ' ', text)
    text = re.sub(r'\[.*?\]', ' ', text)
    text = re.sub(re_url, ' ', text)
    text = re.sub(r'<.*?>+', ' ', text)  # strips <u> </u> and any similar HTML tag
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', ' ', text)

    return text

In [3]:
# Load the raw tweets workbook (path is relative to the notebook's working directory)
df = pd.read_excel(io="Raw Tweets.xlsx")
# Preview the first rows
df.head(n=5)

Unnamed: 0,State,Role,Name,Tweets,num_date,Party,Status
0,Minnesota,Senator,Amy Klobuchar,Seriously? Maybe the President should focus o...,2020-03-14,Democrat,4
1,Minnesota,Senator,Amy Klobuchar,There are so many heroes on the front lines as...,2020-03-14,Democrat,4
2,Minnesota,Senator,Amy Klobuchar,RT @Yamiche : This is why @JudyWoodruff is ama...,2020-03-14,Democrat,4
3,Minnesota,Senator,Amy Klobuchar,With the House passing the #FamiliesFirstBill ...,2020-03-14,Democrat,4
4,Minnesota,Senator,Amy Klobuchar,Seniors are more at risk to the coronavirus pa...,2020-03-14,Democrat,4


In [4]:
# Flag retweets on the raw text, since clean_text lowercases it and strips the marker
df['Retweet'] = df['Tweets'].map(lambda tweet: 'Yes' if "RT @" in tweet else 'No')

# Clean each tweet, then drop stop words and re-join the remaining tokens
df['Tweets_cleaned'] = (
    df['Tweets']
    .apply(clean_text)
    .apply(lambda cleaned: ' '.join(token for token in cleaned.split() if token not in stop_words))
)

df.head()

Unnamed: 0,State,Role,Name,Tweets,num_date,Party,Status,Retweet,Tweets_cleaned
0,Minnesota,Senator,Amy Klobuchar,Seriously? Maybe the President should focus o...,2020-03-14,Democrat,4,No,seriously maybe president focus getting testin...
1,Minnesota,Senator,Amy Klobuchar,There are so many heroes on the front lines as...,2020-03-14,Democrat,4,No,many heroes front lines face pandemic nurses d...
2,Minnesota,Senator,Amy Klobuchar,RT @Yamiche : This is why @JudyWoodruff is ama...,2020-03-14,Democrat,4,Yes,amazing reminded viewers tonight think others ...
3,Minnesota,Senator,Amy Klobuchar,With the House passing the #FamiliesFirstBill ...,2020-03-14,Democrat,4,No,house passing familiesfirstbill last night sen...
4,Minnesota,Senator,Amy Klobuchar,Seniors are more at risk to the coronavirus pa...,2020-03-14,Democrat,4,No,seniors risk coronavirus pandemic cities like ...


## Part 2: Sentiment Analysis

In [5]:
# Import NLTK's VADER sentiment analyzer.
# NOTE(review): this rebinds SentimentIntensityAnalyzer, which the first cell
# already imported from the vaderSentiment package, so any analyzer created after
# this cell is the NLTK one. Confirm which implementation is intended and keep
# a single import in the top imports cell.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# Download the "vader_lexicon" resource through NLTK's downloader
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\wajih\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [6]:
# Score every cleaned tweet and label it with its largest sentiment proportion.
# NOTE(review): the compound score is discarded and the label is simply the
# largest of the neg/neu/pos proportions — confirm this is the intended rule.
analyzer = SentimentIntensityAnalyzer()
labels = {'neg': "negative", 'neu': "neutral", 'pos': "positive"}

sentiments, neg, pos, neu = [], [], [], []

for tw in df['Tweets_cleaned']:
    scores = analyzer.polarity_scores(str(tw))
    scores.pop('compound')
    # Ties resolve to the first key in the dict's own iteration order
    dominant = max(scores, key=scores.get)
    sentiments.append(labels[dominant])
    # Store the proportions as percentages
    neg.append(scores['neg']*100)
    neu.append(scores['neu']*100)
    pos.append(scores['pos']*100)

df['Sentiments'] = sentiments
df['Negative'] = neg
df['Positive'] = pos
df['Neutral'] = neu
df.head()

Unnamed: 0,State,Role,Name,Tweets,num_date,Party,Status,Retweet,Tweets_cleaned,Sentiments,Negative,Positive,Neutral
0,Minnesota,Senator,Amy Klobuchar,Seriously? Maybe the President should focus o...,2020-03-14,Democrat,4,No,seriously maybe president focus getting testin...,neutral,18.2,0.0,81.8
1,Minnesota,Senator,Amy Klobuchar,There are so many heroes on the front lines as...,2020-03-14,Democrat,4,No,many heroes front lines face pandemic nurses d...,neutral,0.0,34.4,65.6
2,Minnesota,Senator,Amy Klobuchar,RT @Yamiche : This is why @JudyWoodruff is ama...,2020-03-14,Democrat,4,Yes,amazing reminded viewers tonight think others ...,neutral,0.0,18.3,81.7
3,Minnesota,Senator,Amy Klobuchar,With the House passing the #FamiliesFirstBill ...,2020-03-14,Democrat,4,No,house passing familiesfirstbill last night sen...,neutral,13.9,28.7,57.4
4,Minnesota,Senator,Amy Klobuchar,Seniors are more at risk to the coronavirus pa...,2020-03-14,Democrat,4,No,seniors risk coronavirus pandemic cities like ...,positive,9.0,52.4,38.6


## Part 3: Hate Speech Detection

In [7]:
# Create the Sonar hate-speech detector once at module level;
# hate_speech_classifier reads this global `sonar` object for every tweet.
sonar = Sonar()

In [8]:
def hate_speech_classifier(df, Class, hate, offensive, neither):
    '''
    Run the module-level `sonar` detector on every cleaned tweet.

    One entry per row of df['Tweets_cleaned'] is appended, in row order, to each
    of the four output lists, which are mutated in place.

    Parameters
    ----------
    df : mapping with a 'Tweets_cleaned' entry (e.g. a pandas DataFrame)
        Source of the texts; each value is converted with str() before scoring.
    Class : list
        Receives the predicted top class of each tweet.
    hate : list
        Receives the confidence of the 'hate_speech' class.
    offensive : list
        Receives the confidence of the 'offensive_language' class.
    neither : list
        Receives the confidence of the 'neither' class.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a result lacks 'top_class', 'classes' or one of the three class names.
    '''
    for tweet in df['Tweets_cleaned']:
        result = sonar.ping(text=str(tweet))
        # Look results up by name instead of by position in dict.values(), so the
        # scores cannot be mixed up if the key or class order ever differs.
        confidence = {
            entry['class_name']: entry['confidence']
            for entry in result['classes']
        }
        Class.append(result['top_class'])
        hate.append(confidence['hate_speech'])
        offensive.append(confidence['offensive_language'])
        neither.append(confidence['neither'])

In [9]:
# Output lists, filled in place by hate_speech_classifier (one entry per tweet)
Class, hate, offensive, neither = [], [], [], []

hate_speech_classifier(df, Class, hate, offensive, neither)

# Predicted label for each tweet
df["Class"] = Class

# Store each class confidence as a percentage rounded to two decimals
for column, confidences in (("hate", hate), ("offensive", offensive), ("neither", neither)):
    df[column] = confidences
    df[column] = df[column].apply(lambda confidence: round(confidence*100, 2))

In [10]:
from collections import Counter 

# Tally how many tweets fall into each predicted class
class_counts = Counter(df["Class"].tolist())
print(class_counts)

Counter({'neither': 61138, 'offensive_language': 309, 'hate_speech': 19})


## Save the labeled dataset

In [11]:
# Write the labeled tweets to an Excel workbook, keeping the header row and omitting the index
df.to_excel(
    excel_writer=r'Labeled Tweets By Sentiment & HateSpeech.xlsx',
    header=True,
    index=False,
)