In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Loading Dataset 1

##### https://github.com/sjtuprog/fox-news-comments

In [3]:
# Each line of dataset-1.txt is parsed as "<integer label>:<comment text>".
with open('dataset-1.txt', encoding='utf8') as fp:
    lst = []
    for line in fp:
        # Skip blank lines (e.g. a stray empty line at the end of the file),
        # which would otherwise crash the int() conversion below.
        if not line.strip():
            continue
        # Split on the FIRST colon only: the comment text may itself contain
        # colons, and an unbounded split would silently truncate the comment
        # at its first colon.
        label_text, tweet = line.split(':', 1)
        label = int(label_text)
        lst.append([tweet, label])
    dataset_1_df = pd.DataFrame(lst, columns=['tweet', 'label'])

In [4]:
# Preview the first rows (pandas' head() defaults to five) to sanity-check
# that the label/comment split produced the expected 'tweet' and 'label' columns.
dataset_1_df.head()

Unnamed: 0,tweet,label
0,barryswallows Merkel would never say NO\n,1
1,PostApocalypticHero Expect more and more women...,1
2,californiamojo Groping people in public wasn't...,0
3,"MikeSte Merkel, possible the only person in ch...",1
4,"scientist They know very well, no means NO! Th...",1


# Loading Dataset 2

#### https://github.com/UCSM-DUE/IWG_hatespeech_public https://www.kaggle.com/kazanova/sentiment140

In [5]:
# Read dataset 2 from disk, then relabel its columns to the ('tweet', 'label')
# schema shared by every dataset in this notebook. The column assignment
# raises if the CSV does not contain exactly two columns.
dataset_2_df = pd.read_csv(filepath_or_buffer='dataset-2.csv')
dataset_2_df.columns = pd.Index(['tweet', 'label'])

In [6]:
# Preview the first rows (pandas' head() defaults to five) to confirm the
# renamed 'tweet' and 'label' columns line up with the file's contents.
dataset_2_df.head()

Unnamed: 0,tweet,label
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


# Loading Dataset 3

#### https://www.kaggle.com/mrmorj/hate-speech-and-offensive-language-dataset

In [7]:
# Read dataset 3 from disk, then relabel its columns to the ('tweet', 'label')
# schema shared by every dataset in this notebook. The column assignment
# raises if the CSV does not contain exactly two columns.
dataset_3_df = pd.read_csv(filepath_or_buffer='dataset-3.csv')
dataset_3_df.columns = pd.Index(['tweet', 'label'])

In [8]:
# Preview the first rows (pandas' head() defaults to five) to confirm the
# renamed 'tweet' and 'label' columns line up with the file's contents.
dataset_3_df.head()

Unnamed: 0,tweet,label
0,@infidelpamelaLC I'm going to blame the black ...,1
1,I hate fat bitches,1
2,RT @Isa__Lopez: @D_Lo520 but you're still a fa...,1
3,@kcSnowWhite7 @SamSaunders42 don't forget napp...,1
4,RT @JihadistJoe: We Muslims have no military h...,1


### Recalling Cleaning Function

In [9]:
def clean_tweet(df):
    """Add a ``clean_tweet`` column of lemmatized, stopword-free tokens to ``df``.

    The frame is modified in place and nothing is returned:

    * rows whose ``tweet`` is missing are removed;
    * every remaining tweet is lower-cased, tokenized and POS-tagged;
    * tokens that are English stopwords or not purely alphabetic are dropped;
    * surviving tokens are lemmatized using the WordNet POS derived from the
      first letter of their tag (J -> adjective, V -> verb, R -> adverb,
      anything else -> noun);
    * the resulting token list is stored as its ``str()`` representation
      (e.g. ``"['good', 'day']"``) in ``df['clean_tweet']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings.
    """
    # Drop the ROWS with a missing tweet. Calling dropna(inplace=True) on
    # df['tweet'] only touches a temporary Series and leaves the NaN rows in
    # the frame, where they later crash on `.lower()`.
    df.dropna(subset=['tweet'], inplace=True)

    # Map the first letter of a Penn Treebank tag to a WordNet POS constant;
    # unknown tags fall back to noun.
    tag_map = defaultdict(lambda: wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV

    # Loop-invariant work, hoisted: load the stopword list once and keep it in
    # a set for O(1) membership tests, and reuse a single lemmatizer.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for tweet in df['tweet']:
        tokens = word_tokenize(tweet.lower())
        final_words = [
            lemmatizer.lemmatize(word, tag_map[tag[0]])
            for word, tag in pos_tag(tokens)
            if word not in stop_words and word.isalpha()
        ]
        cleaned.append(str(final_words))

    # Assign the whole column at once. This is positional, so it stays correct
    # when the index is not 0..N-1 (e.g. after the dropna above), and it
    # creates the column even when the frame is empty.
    df['clean_tweet'] = cleaned

# Choosing n (the number of top terms kept per class) for Validation Testing

In [10]:
# How many of the highest-scoring terms to keep per class from each dataset.
n = 200

# Module-level accumulators that perform() adds terms to, across all datasets.
all_positive_list, all_negative_list = [], []

# Extraction Function

In [11]:
def _top_tfidf_terms(documents, top_k):
    """Return the ``top_k`` terms of ``documents`` ranked by summed TF-IDF weight."""
    # A fresh vectorizer per call keeps the vocabulary and the score matrix
    # consistent with each other.
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf = vectorizer.fit_transform(documents)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    # Column sums give one aggregate TF-IDF weight per vocabulary term.
    scores = np.asarray(tfidf.sum(axis=0)).ravel()
    ranked = sorted(zip(terms, scores), key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:top_k]]


def perform(df):
    """Append the top-``n`` TF-IDF terms of each class in ``df`` to the lexicon lists.

    Rows with ``label == 0`` are treated as the positive class and rows with
    ``label == 1`` as the negative class. Each subset is cleaned with
    ``clean_tweet`` and vectorized independently; the ``n`` highest-scoring
    terms are appended to the module-level ``all_positive_list`` and
    ``all_negative_list`` respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``tweet`` and ``label`` columns. It is not modified; the
        per-class subsets are new frames.
    """
    # drop=True discards the old index directly instead of materialising it
    # as an 'index' column that then has to be dropped again.
    df_positive = df[df['label'] == 0].reset_index(drop=True)
    df_negative = df[df['label'] == 1].reset_index(drop=True)

    clean_tweet(df_positive)
    clean_tweet(df_negative)

    # Each class gets its own fitted vectorizer. Re-fitting one shared
    # vectorizer on the negative tweets would overwrite its vocabulary, so the
    # positive scores would be paired with the negative class's words.
    all_positive_list.extend(_top_tfidf_terms(df_positive['clean_tweet'], n))
    all_negative_list.extend(_top_tfidf_terms(df_negative['clean_tweet'], n))

## Applying extraction on datasets

In [12]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every term to the lexicon lists.
all_positive_list.clear()
all_negative_list.clear()

for dataset_df in (dataset_1_df, dataset_2_df, dataset_3_df):
    perform(dataset_df)

### Printing to lexicon file

In [13]:
# Wrap each list in a Series so pandas pads the shorter column with NaN.
# Building the frame from plain lists raises ValueError as soon as the two
# lists differ in length (e.g. when one class yields fewer than n terms).
df = pd.DataFrame({
    'Positive': pd.Series(all_positive_list, dtype=object),
    'Negative': pd.Series(all_negative_list, dtype=object),
})
df.to_csv('lexicon.csv')