## Dataset And Lexicon

In [None]:
import csv
import pandas as pd

### Loading data, old Lexicon, new Lexicon

In [None]:
# Read the labelled tweets and both lexicon versions (paths are relative to the working directory).
df, new_lexicon, old_lexicon = (
    pd.read_csv(csv_path)
    for csv_path in ('data.csv', 'New_Lexicon.csv', 'Old_lexicon.csv')
)

Extracting Hateful Tweets

In [None]:
# Text of every tweet labelled 'hateful'. A boolean mask selects the rows directly,
# so the result no longer depends on df having the default 0..n-1 index (the previous
# version used the enumerate() position as an index label).
hate_tweets = df.loc[df['sentiment'] == 'hateful', 'clean'].tolist()

Extracting the other Tweets

In [None]:
# Text of every tweet whose label is anything other than 'hateful'. A boolean mask
# selects the rows directly instead of using the enumerate() position as an index label,
# which only worked while df kept the default 0..n-1 index.
other_tweets = df.loc[df['sentiment'] != 'hateful', 'clean'].tolist()

## Classification using only the lexicon

### A tweet is classified as a hate tweet if it contains any lexicon entry as a substring

Evaluation Function

In [None]:
def evaluation(predictedHateTweets, hate_tweets, other_tweets):
    """Score predicted hateful tweets against the labelled tweets.

    Tweets are compared by value, so a text that appears in ``hate_tweets``
    counts as hateful wherever it occurs. Tweets must be hashable.

    Parameters
    ----------
    predictedHateTweets : list
        Tweets the classifier flagged as hateful (duplicates are counted).
    hate_tweets : list
        Tweets whose true label is hateful.
    other_tweets : list
        Tweets whose true label is not hateful; only its length is used.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1_score)``. Any ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    # Set lookups are O(1); membership tests on the raw lists made this quadratic.
    actual_hate = set(hate_tweets)
    predicted = set(predictedHateTweets)

    FalsePositive = sum(1 for tweet in predictedHateTweets if tweet not in actual_hate)
    TruePositive = len(predictedHateTweets) - FalsePositive
    FalseNegative = sum(1 for tweet in hate_tweets if tweet not in predicted)
    # Every false positive is assumed to come from other_tweets.
    TrueNegative = len(other_tweets) - FalsePositive

    def _ratio(numerator, denominator):
        # Guard against empty predictions / empty classes.
        return float(numerator) / float(denominator) if denominator else 0.0

    total = TruePositive + FalsePositive + TrueNegative + FalseNegative
    accuracy = _ratio(TruePositive + TrueNegative, total)
    precision = _ratio(TruePositive, TruePositive + FalsePositive)
    recall = _ratio(TruePositive, TruePositive + FalseNegative)
    f1_score = 2 * _ratio(precision * recall, precision + recall)

    return accuracy, precision, recall, f1_score

Classification using old Lexicon

In [None]:
# Materialise the lexicon once instead of re-iterating the Series for every tweet.
old_lexicon_words = list(old_lexicon['clean'])

# A tweet is predicted hateful as soon as any old-lexicon entry occurs in its text
# as a substring; str() lets non-string cells go through the same check.
old_predictedHateTweets = []
for tweet in df['clean']:
    text = str(tweet)
    if any(word in text for word in old_lexicon_words):
        old_predictedHateTweets.append(tweet)

In [None]:
# Score the old-lexicon predictions and report the four metrics to four decimals.
old_scores = evaluation(old_predictedHateTweets, hate_tweets, other_tweets)
accuracy, precision, recall, f1_score = old_scores
print("accuracy : %.4f, precision : %.4f, recall : %.4f, f1_score : %.4f" % old_scores)

Classification using new Lexicon

In [None]:
# Materialise the lexicon once instead of re-iterating the Series for every tweet.
new_lexicon_words = list(new_lexicon['clean'])

# A tweet is predicted hateful as soon as any new-lexicon entry occurs in its text
# as a substring; str() lets non-string cells go through the same check.
predictedHateTweets = []
for tweet in df['clean']:
    text = str(tweet)
    if any(word in text for word in new_lexicon_words):
        predictedHateTweets.append(tweet)

In [None]:
# Score the new-lexicon predictions and report the four metrics to four decimals.
new_scores = evaluation(predictedHateTweets, hate_tweets, other_tweets)
accuracy, precision, recall, f1_score = new_scores
print("accuracy : %.4f, precision : %.4f, recall : %.4f, f1_score : %.4f" % new_scores)

## Calculating the occurrences of lexicon's words in the corpus

In [None]:
import nltk

Appending Corpus' words into one list

In [None]:
# Flatten every tweet into its single-space-separated tokens.
# str() is applied first so non-string cells (presumably missing values) can be split too.
dataset_words = [
    token
    for tweet in df['clean']
    for token in str(tweet).split(" ")
]

In [None]:
# Total number of tokens collected from all tweets.
len(dataset_words)

Appending Corpus' words related to hate speech into one list

In [None]:
# Same flattening as for the whole corpus, restricted to the tweets labelled hateful.
hate_speech_words = [
    token
    for tweet in hate_tweets
    for token in str(tweet).split(" ")
]

FreqDist for the previous two lists

In [None]:
# Token counts over the whole corpus and over the hateful subset.
all_fdist = nltk.FreqDist(dataset_words)
hate_fdist = nltk.FreqDist(hate_speech_words)

In [None]:
import matplotlib.pyplot as plt

The 30 most frequent words in the dataset

In [None]:
# Frequency curve of the 30 most common tokens over all tweets; the title lets the
# figure stand on its own, and the trailing ';' hides the returned object's repr.
all_fdist.plot(30, title="30 most frequent words in the dataset");

The 30 most frequent words in hateful tweets

In [None]:
# Frequency curve of the 30 most common tokens in hateful tweets; the title lets the
# figure stand on its own, and the trailing ';' hides the returned object's repr.
hate_fdist.plot(30, title="30 most frequent words in hateful tweets");

old lexicon's words frequencies

In [None]:
# Map each distinct old-lexicon entry to its count among the hateful-tweet tokens.
# NOTE(review): hate_fdist is keyed by single space-separated tokens, so multi-word
# entries would presumably always count 0 — confirm against the lexicon file.
old_lex_dist = {
    entry: hate_fdist[entry]
    for entry in set(old_lexicon['clean'])
}

Sort the old lexicon's word frequencies

In [None]:
# (count, entry) pairs, highest count first; ties keep the dict's iteration order.
old_lex_tuple = sorted(
    ((count, entry) for entry, count in old_lex_dist.items()),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Display the old-lexicon (count, word) pairs, highest count in hateful tweets first.
old_lex_tuple

new lexicon's words frequencies

In [None]:
# Map each distinct new-lexicon entry to its count among the hateful-tweet tokens.
# NOTE(review): hate_fdist is keyed by single space-separated tokens, so multi-word
# entries would presumably always count 0 — confirm against the lexicon file.
new_lex_dist = {
    entry: hate_fdist[entry]
    for entry in set(new_lexicon['clean'])
}

sort new lexicon's words frequencies

In [None]:
# (count, entry) pairs, highest count first; ties keep the dict's iteration order.
new_lex_tuple = sorted(
    ((count, entry) for entry, count in new_lex_dist.items()),
    key=lambda pair: pair[0],
    reverse=True,
)

In [None]:
# Display the new-lexicon (count, word) pairs, highest count in hateful tweets first.
new_lex_tuple

In [None]:
# old_lex_dist was filled while iterating a set, so its value order changes between
# kernel restarts; sorting the counts makes the curve reproducible and readable.
plt.plot(sorted(old_lex_dist.values(), reverse=True))
plt.title("Old lexicon: occurrences of each entry in hateful tweets")
plt.xlabel("Lexicon entry (ranked by count)")
plt.ylabel("Occurrences in hateful tweets")
plt.show()

In [None]:
# new_lex_dist was filled while iterating a set, so its value order changes between
# kernel restarts; sorting the counts makes the curve reproducible and readable.
plt.plot(sorted(new_lex_dist.values(), reverse=True))
plt.title("New lexicon: occurrences of each entry in hateful tweets")
plt.xlabel("Lexicon entry (ranked by count)")
plt.ylabel("Occurrences in hateful tweets")
plt.show()

old Lexicon's words' categories frequency

In [None]:
# Start every old-lexicon category at zero, then add each entry's hateful-tweet
# count to the category listed on the same row.
old_word_cat = dict.fromkeys(set(old_lexicon['category']), 0)
for entry, category in zip(old_lexicon['clean'], old_lexicon['category']):
    old_word_cat[category] += hate_fdist[entry]

In [None]:
# (category, total count) pairs, largest total first.
old_catCount = sorted(old_word_cat.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Display the old-lexicon (category, count) pairs, largest count first.
old_catCount

new Lexicon's words' categories frequency

In [None]:
# Start every new-lexicon category at zero, then add each entry's hateful-tweet
# count to the category listed on the same row.
new_word_cat = dict.fromkeys(set(new_lexicon['category']), 0)
for entry, category in zip(new_lexicon['clean'], new_lexicon['category']):
    new_word_cat[category] += hate_fdist[entry]

In [None]:
# (category, total count) pairs, largest total first.
new_catCount = sorted(new_word_cat.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Display the new-lexicon (category, count) pairs, largest count first.
new_catCount

Extracting new words and expressions by looking at hateful tweets that contain no entry of the new lexicon (hateful but not offensive)

In [None]:
def hatful_offinsive(hate_tweets, lexicon=None):
    """Return the distinct hateful tweets that contain at least one lexicon entry.

    Parameters
    ----------
    hate_tweets : iterable
        Tweets labelled hateful. Duplicates are collapsed; non-string items are
        converted with ``str()`` before matching, as in the classification cells.
    lexicon : iterable of str, optional
        Entries to search for as substrings. Defaults to the notebook-level
        ``new_lexicon['clean']`` column.

    Returns
    -------
    list
        The matching tweets (original objects), in order of first occurrence.
    """
    if lexicon is None:
        lexicon = new_lexicon['clean']
    # Materialise once so the lexicon is not re-iterated for every tweet.
    lexicon_words = list(lexicon)

    hatful_offinsive_tweets = []
    # dict.fromkeys de-duplicates like set() but keeps a reproducible order.
    for tweet in dict.fromkeys(hate_tweets):
        text = str(tweet)  # a bare `word in tweet` raises TypeError on non-string tweets
        if any(word in text for word in lexicon_words):
            hatful_offinsive_tweets.append(tweet)
    return hatful_offinsive_tweets

In [None]:
# Distinct hateful tweets in which at least one new-lexicon entry occurs.
hatful_offinsive_tweets = hatful_offinsive(hate_tweets)

In [None]:
# Hateful tweets that none of the new-lexicon entries matched.
hatful_not_offinsive_tweets = set(hate_tweets).difference(hatful_offinsive_tweets)

In [None]:
# Display the hateful tweets without any new-lexicon match, to read through for new terms.
hatful_not_offinsive_tweets