In [1]:
import torch
import pandas as pd
import numpy as np
import sklearn
from collections import Counter

In [2]:
from sklearn.utils import Bunch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
from itertools import combinations

In [4]:
import re
import os

In [5]:
import torch.nn as nn

In [6]:
import matplotlib.pyplot as plt

# Data Loading

In [7]:
path = r"E:\github\movie_hatespeech_detection\data\fox_news\fox_news.csv"
df = pd.read_csv(path, index_col=0)
df = df.rename(columns={'class': 'label'})
df['label'] = df['label'].replace({2:1})
df = df.append({'comment': 'I love you', 'label': 0}, ignore_index=True)
df = df.append({'comment': 'I hate you', 'label': 1}, ignore_index=True)
df.tail()

Unnamed: 0,comment,label
1510,"True. Most leftists ,esp female leftists have ...",1
1511,"First, lets get this straight",1
1512,White privilege ...work all your life to take ...,1
1513,I love you,0
1514,I hate you,1


In [8]:
path = r'E:\github\movie_hatespeech_detection\data\movies_for_training\all_movies.csv'
movie_data = pd.read_csv(path, index_col=0)

In [9]:
movie_data.head()

Unnamed: 0,movie_id,batch_id,majority_answer,text,movie_name
0,AmericanHistoryX(1998)_1,1566624979,0,Derek.,AmerricanHistoryX
1,AmericanHistoryX(1998)_2,1566624979,1,What the fuck are you thinking?,AmerricanHistoryX
2,AmericanHistoryX(1998)_3,1566624979,0,There's a black guy outside breaking into your...,AmerricanHistoryX
3,AmericanHistoryX(1998)_4,1566624979,0,How long has he been there?,AmerricanHistoryX
4,AmericanHistoryX(1998)_5,1566624979,0,I don't know.,AmerricanHistoryX


In [10]:
print(df.label.value_counts())
df.label.value_counts().plot(kind='pie', subplots=True, autopct='%1.0f%%', title='Hate Speech Distribution')

0    1084
1     431
Name: label, dtype: int64


array([<AxesSubplot:ylabel='label'>], dtype=object)

## Data Splitting

In [11]:
def split_dataset(df, seed):
    df = df.copy()
    test = df.loc[1513:1514]
    df.drop(df.tail(1).index, inplace=True)
    train = df.sample(frac=1, random_state=seed)
    return train.comment.values, train.label.values, test.comment, test.label

In [12]:
categories = [0,1]
seed = 11

In [13]:
train, train_targets, test, test_targets = split_dataset(df, seed=seed)

In [14]:
train_size = len(train)
test_size = len(test)
print(train_size)
print(test_size)

1514
2


In [15]:
def calculate_dataset_class_distribution(targets, categories):
    df = pd.DataFrame({'category':targets})
    s = df.category.value_counts(normalize=True)
    s = s.reindex(categories)
    return [s.index[0], s[0]], [s.index[1], s[1]]

In [16]:
train_class_distribution = calculate_dataset_class_distribution(train_targets, categories)
test_class_distribution = calculate_dataset_class_distribution(test_targets, categories)
print(train_class_distribution)
print(test_class_distribution)

([0, 0.7159841479524438], [1, 0.28401585204755614])
([0, 0.5], [1, 0.5])


In [17]:
train_ds = Bunch(data=train, target=train_targets)
test_ds = Bunch(data=test, target=test_targets)

## Buidling the Model

In [18]:
# Getting all the vocabularies and indexing to a unique position
vocab = Counter()
#Indexing words from the training data
for text in train_ds.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

#Indexing words from the training data
for text in test_ds.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

for text in movie_data.text.values:
    for word in text.split(' '):
        vocab[word.lower()]+=1
        
total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i

    return word2index

word2index = get_word_2_index(vocab)

In [19]:
print(len(word2index))
print(word2index["the"]) # Showing the index of 'the'
print (total_words)

17732
13
17732


In [20]:
# define the network
class News_20_Net(nn.Module):
     def __init__(self, input_size, hidden_size, num_classes):
        super(News_20_Net, self).__init__()
        self.layer_1 = nn.Linear(input_size,hidden_size, bias=True).cuda()
        self.relu = nn.ReLU().cuda()
        self.layer_2 = nn.Linear(hidden_size, hidden_size, bias=True).cuda()
        self.output_layer = nn.Linear(hidden_size, num_classes, bias=True).cuda()
    # accept input and return an output
     def forward(self, x):
        out = self.layer_1(x)
        out = self.relu(out)
        out = self.layer_2(out)
        out = self.relu(out)
        out = self.output_layer(out)
        return out

In [21]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    # Split into different batchs, get the next batch 
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    # get the targets 
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    #print(categories)
    for text in texts:
        # Dimension, 196609
        layer = np.zeros(total_words,dtype=float)

        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1
        batches.append(layer)

    # We have 5 categories
    for category in categories:
        #print(category)
        index_y = -1
        if category == 0:
            index_y = 0
        elif category == 1:
            index_y = 1
        elif category == 2:
            index_y = 2
        results.append(index_y)

    # the training and the targets
    return np.array(batches),np.array(results)

In [22]:
# Parameters
learning_rate = 0.001
num_epochs = 8
batch_size = 32
display_step = 1 # ADDED will multiplied by 10

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words # Words in vocab
num_classes = len(categories)         # Categories: "graphics","space","baseball","guns", "christian"

## Training

In [23]:
results = []

In [24]:
news_net = News_20_Net(input_size, hidden_size, num_classes)
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # This includes the Softmax loss function
optimizer = torch.optim.Adam(news_net.parameters(), lr=learning_rate)  

# Train the Model
for epoch in range(num_epochs):
    # determine the number of min-batches based on the batch size and size of training data
    total_batch = int(len(train_ds.data)/batch_size)
    # Loop over all batches
    for i in range(total_batch):
        batch_x,batch_y = get_batch(train_ds,i,batch_size)
        
        articles = torch.cuda.FloatTensor(batch_x, device='cuda')
        labels = torch.cuda.LongTensor(batch_y, device='cuda')

        # Forward + Backward + Optimize
        optimizer.zero_grad()  # zero the gradient buffer
        outputs = news_net(articles)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % display_step == 0:
            result = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f'%(epoch+1, num_epochs, i+1, len(train_ds.data)/batch_size, loss.data)
            results.append({'Epoch': epoch+1, 'Step': i+1, 'Loss': loss.data.item()})
            if (i+1) % (display_step*10) == 0:
                print({'Epoch': epoch+1, 'Step': i+1, 'Loss': loss.data.item()})

{'Epoch': 1, 'Step': 10, 'Loss': 0.6423895359039307}
{'Epoch': 1, 'Step': 20, 'Loss': 0.5471791625022888}
{'Epoch': 1, 'Step': 30, 'Loss': 0.6262851357460022}
{'Epoch': 1, 'Step': 40, 'Loss': 0.5675703287124634}
{'Epoch': 2, 'Step': 10, 'Loss': 0.44184085726737976}
{'Epoch': 2, 'Step': 20, 'Loss': 0.30666372179985046}
{'Epoch': 2, 'Step': 30, 'Loss': 0.40755540132522583}
{'Epoch': 2, 'Step': 40, 'Loss': 0.2627961039543152}
{'Epoch': 3, 'Step': 10, 'Loss': 0.14209066331386566}
{'Epoch': 3, 'Step': 20, 'Loss': 0.09308931976556778}
{'Epoch': 3, 'Step': 30, 'Loss': 0.21916751563549042}
{'Epoch': 3, 'Step': 40, 'Loss': 0.018723975867033005}
{'Epoch': 4, 'Step': 10, 'Loss': 0.026511583477258682}
{'Epoch': 4, 'Step': 20, 'Loss': 0.0300114918500185}
{'Epoch': 4, 'Step': 30, 'Loss': 0.04757861793041229}
{'Epoch': 4, 'Step': 40, 'Loss': 0.0020073177292943}
{'Epoch': 5, 'Step': 10, 'Loss': 0.009803763590753078}
{'Epoch': 5, 'Step': 20, 'Loss': 0.010308003053069115}
{'Epoch': 5, 'Step': 30, 'Loss'

## Validation

In [25]:
# Test the Model
correct = 0
total = 0
total_test_data = len(test_ds.target)

In [26]:
iterates = total_test_data/batch_size # ignore last (<batch_size) batch

In [27]:
all_total = []
all_correct = []
labels_all = []
predicted_all = []

In [28]:
for i in range(int(iterates)):
    batch_x_test,batch_y_test = get_batch(test_ds,i,batch_size)
    
    articles = torch.FloatTensor(batch_x_test).to('cuda')
    
    labels = torch.LongTensor(batch_y_test).to('cuda')
    outputs = news_net(articles)
    _, predicted = torch.max(outputs.data, 1)
    
    labels_all.extend([x.item() for x in labels])
    predicted_all.extend([x.item() for x in predicted])

In [29]:
report = classification_report(labels_all, predicted_all, output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [30]:
df_report = pd.DataFrame(report).transpose()

In [31]:
df_report.round(2)

Unnamed: 0,precision,recall,f1-score,support
accuracy,0.0,0.0,0.0,0.0
macro avg,,,,0.0
weighted avg,0.0,0.0,0.0,0.0


----

## Classication of Movies

In [32]:
def annotate_df(movie_df):
    utterances = movie_df.text.values
    predictions = []
    batch = []
    
    for text in utterances:
        # Dimension, 196609
        layer = np.zeros(total_words,dtype=float)

        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1

        batch.append(layer)
        
    texts = torch.FloatTensor(batch).to('cuda')
    outputs = news_net(texts)
    _, predicted = torch.max(outputs.data, 1)
    predictions.extend([x.item() for x in predicted])

    result = []
    
    for i, pred in enumerate(predictions):
        result.append({'index': i, 'label_bow_fox_news': pred})
    
    result_df = pd.DataFrame(result)
    movie_df = movie_df.merge(result_df, right_index=True, left_index=True)
    
    return movie_df

In [33]:
result_df = annotate_df(movie_data)

In [34]:
result_for_sana = result_df[['text', 'label_bow_fox_news']]

In [35]:
result_df

Unnamed: 0,movie_id,batch_id,majority_answer,text,movie_name,index,label_bow_fox_news
0,AmericanHistoryX(1998)_1,1566624979,0,Derek.,AmerricanHistoryX,0,0
1,AmericanHistoryX(1998)_2,1566624979,1,What the fuck are you thinking?,AmerricanHistoryX,1,0
2,AmericanHistoryX(1998)_3,1566624979,0,There's a black guy outside breaking into your...,AmerricanHistoryX,2,0
3,AmericanHistoryX(1998)_4,1566624979,0,How long has he been there?,AmerricanHistoryX,3,0
4,AmericanHistoryX(1998)_5,1566624979,0,I don't know.,AmerricanHistoryX,4,0
...,...,...,...,...,...,...,...
10683,TheWolfofWallStreet2013BluRay_3724,3859903933,0,Sell me this pen.,TheWolfofWallStreet,10683,0
10684,TheWolfofWallStreet2013BluRay_3725,3859903933,0,"Well, it's a nice pen.",TheWolfofWallStreet,10684,0
10685,TheWolfofWallStreet2013BluRay_3726_3727,3859903933,0,You can use the pen to write down thoughts fro...,TheWolfofWallStreet,10685,0
10686,TheWolfofWallStreet2013BluRay_3728,3859903933,0,Sell me this pen.,TheWolfofWallStreet,10686,0


In [36]:
result_df.label_bow_fox_news.value_counts()

0    9334
1    1354
Name: label_bow_fox_news, dtype: int64

In [37]:
result_df.majority_answer.value_counts()

0    9014
1    1380
2     294
Name: majority_answer, dtype: int64

In [38]:
def get_classifications_results(df):
    
    df['majority_answer'] = df['majority_answer'].replace({2:1})
    
    labels_all = df.majority_answer.values
    predicted_all = df.label_bow_fox_news.values

    results_classification = classification_report(labels_all, predicted_all, output_dict=True)
    
    df_report = pd.DataFrame(results_classification).transpose()
    
    return df_report

In [39]:
get_classifications_results(result_df).round(2)

Unnamed: 0,precision,recall,f1-score,support
0,0.84,0.87,0.86,9014.0
1,0.16,0.13,0.14,1674.0
accuracy,0.76,0.76,0.76,0.76
macro avg,0.5,0.5,0.5,10688.0
weighted avg,0.74,0.76,0.75,10688.0
