## Data Preparation
First we need to prepare the data so that we can input it into the model.

In [31]:
# Libraries used throughout the notebook
import numpy as np
import pandas as pd

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session
pd.options.mode.chained_assignment = None

In [2]:
# Read the annotation spreadsheet (the Excel export downloaded from Google Sheets)
sheet = pd.read_excel("data/Implicit hate speech.xlsx")

# Preview the first five rows
sheet.head(5)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
0,,,,,,,,,
1,"As part of #myrefugeestory, Mohamed Madi and B...",,,,,,,,
2,https://www.facebook.com/228735667216_10153273...,,,,,,,,
3,###,,,,,,,,
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,


In [70]:
# Summary statistics for every column: count/unique/top/freq for the text
# columns and mean/std/quantiles for the numeric annotation columns
sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5868,5782,5791,5791,571.0,329.0,329.0,329.0,241.0
unique,5838,5722,27,31,,,,,
top,###,-*-*-*- Mary mother of Jesus is depicted weari...,Acceptable speech,No target,,,,,
freq,31,11,2873,3015,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [71]:
# Only comments are annotated (posts are not), so keep just the rows that
# have a value in the "Type" column and drop those where it is NaN.
has_type = sheet["Type"].notna()
sheet = sheet[has_type]

# Check the effect on the summary statistics
sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5791,5782,5791,5791,571.0,329.0,329.0,329.0,241.0
unique,5791,5722,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,-*-*-*- Mary mother of Jesus is depicted weari...,Acceptable speech,No target,,,,,
freq,1,11,2873,3015,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [72]:
# Preview the first 3 rows of the filtered sheet
sheet.iloc[:3]

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,
5,https://www.facebook.com/228735667216_10153273...,-*-*-*- Why should we? It's the biggest humani...,Acceptable speech,No target,,,,,
6,https://www.facebook.com/228735667216_10153273...,-*-*-*- these refugees adult males are cowards...,Background offensive,Migrants,0.0,,,,


In [73]:
# Following Nikola's instruction, remove the "-*-*-*-" marker from the comments
# (first the variant followed by a space, then any marker left without one).
#
# The marker is matched as a literal string (regex=False), so "*" needs no
# escaping. Relying on the default of `regex` is not portable: older pandas
# versions interpreted the pattern as a regular expression (and warned that the
# default would change), while newer versions match literally, in which case a
# backslash-escaped pattern would no longer match anything.
sheet["Comment"] = sheet["Comment"].str.replace("-*-*-*- ", "", regex=False)
sheet["Comment"] = sheet["Comment"].str.replace("-*-*-*-", "", regex=False)

# Let's look at the sheet
sheet.head(5)

  sheet["Comment"] = sheet["Comment"].str.replace("-\*-\*-\*- ", "")
  sheet["Comment"] = sheet["Comment"].str.replace("-\*-\*-\*-", "")


Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,
5,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,
7,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,
8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,


In [74]:
# Rows without any comment text (NaN) carry no information, so remove them.
# Afterwards the "###" and "Comment" columns have the same count.
comment_missing = sheet["Comment"].isna()
sheet = sheet[~comment_missing]

sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5782,5782,5782,5782,571.0,329.0,329.0,329.0,241.0
unique,5782,5707,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,Mary mother of Jesus is depicted wearing it in...,Acceptable speech,No target,,,,,
freq,1,11,2864,3006,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [75]:
# Some comments are not NaN but still contain no words (empty or
# whitespace-only strings): splitting them on whitespace yields zero tokens.
token_counts = sheet["Comment"].str.split().str.len()
discarded_sheet = sheet[token_counts < 1]
discarded_sheet

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
90,https://www.facebook.com/228735667216_10153273...,,Acceptable speech,No target,,,,,
961,https://www.facebook.com/164305410295882_27747...,,Acceptable speech,No target,,,,,
3556,https://www.facebook.com/164305410295882_27690...,,Acceptable speech,No target,,,,,
3584,https://www.facebook.com/164305410295882_27690...,,Acceptable speech,No target,,,,,
3980,https://www.facebook.com/164305410295882_11105...,,Acceptable speech,No target,,,,,
4604,https://www.facebook.com/164305410295882_24273...,,Acceptable speech,No target,,,,,
5109,https://www.facebook.com/10513336322_101537662...,,Acceptable speech,No target,,,,,
5110,https://www.facebook.com/10513336322_101537662...,,Acceptable speech,No target,,,,,
5111,https://www.facebook.com/10513336322_101537662...,,Acceptable speech,No target,,,,,
5112,https://www.facebook.com/10513336322_101537662...,,Acceptable speech,No target,,,,,


In [76]:
# Keep only the comments that contain at least one whitespace-separated token.
# NOTE(review): a comment for which the .str accessor yields NaN (presumably a
# non-string cell, e.g. a number) also fails this test and is dropped. The
# recorded counts fall by 11 rows here while only 10 empty comments were
# listed before - confirm that dropping the extra row is intended.
has_tokens = sheet["Comment"].str.split().str.len() >= 1
sheet = sheet[has_tokens]

sheet.describe(include="all")


Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5771,5771,5771,5771,571.0,329.0,329.0,329.0,241.0
unique,5771,5706,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,Mary mother of Jesus is depicted wearing it in...,Acceptable speech,No target,,,,,
freq,1,11,2853,2995,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [77]:
# List every row whose comment text occurs more than once (all occurrences
# are kept in the listing), sorted so that identical texts appear together
is_repeated = sheet["Comment"].duplicated(keep=False)
sheet[is_repeated].sort_values(by="Comment")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
5584,https://www.facebook.com/10513336322_101537662...,.. One Month of Islam and Multiculturalism in ...,Acceptable speech,No target,,,,,
5397,https://www.facebook.com/10513336322_101537662...,.. One Month of Islam and Multiculturalism in ...,Acceptable speech,No target,,,,,
1886,https://www.facebook.com/10513336322_101528254...,A difficult concept for most people to underst...,Acceptable speech,No target,,,,,
1821,https://www.facebook.com/10513336322_101528254...,A difficult concept for most people to underst...,Acceptable speech,No target,,,,,
630,https://www.facebook.com/228735667216_10153107...,All of u forgot ab Israel. When they first arr...,Background offensive,Migrants,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
351,https://www.facebook.com/228735667216_10153107...,https://youtu.be/UkIocH91j0w,Acceptable speech,No target,,,,,
3244,https://www.facebook.com/228735667216_10153107...,https://youtu.be/UkIocH91j0w,Acceptable speech,No target,,,,,
384,https://www.facebook.com/228735667216_10153107...,https://youtu.be/UkIocH91j0w,Acceptable speech,No target,,,,,
4112,https://www.facebook.com/164305410295882_11105...,women are to cover in christianity when in pra...,Acceptable speech,No target,,,,,


In [78]:
# Several comment texts occur more than once - keep only the first occurrence of each
sheet = sheet.drop_duplicates(subset="Comment", keep="first")
sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5706,5706,5706,5706,571.0,329.0,329.0,329.0,241.0
unique,5706,5706,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,
freq,1,1,2821,2965,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [79]:
# Distribution of the "Implicit" annotation (the label used in Experiment 1):
# first the absolute counts, then the relative frequencies
for as_share in (False, True):
    print(sheet["Implicit"].value_counts(normalize=as_share))

1.0    329
0.0    242
Name: Implicit, dtype: int64
1.0    0.576182
0.0    0.423818
Name: Implicit, dtype: float64


In [80]:
# Distribution of the "Type" annotation (the basis for Experiment 2)
type_labels = sheet["Type"]
# Which label combinations occur:
print(type_labels.unique())
# How often each of them occurs:
print(type_labels.value_counts())

['Acceptable speech' 'Background offensive'
 'Acceptable speech|Other offensive' 'Other offensive'
 'Acceptable speech|Background offensive'
 "Acceptable speech|Other offensive|Don't know"
 'Other offensive|Background offensive' 'Background violence'
 'Background violence|Background offensive'
 'Other offensive|Acceptable speech'
 "Other offensive|Acceptable speech|Background violence|Don't know"
 "Acceptable speech|Other offensive|Don't know|Background offensive"
 'Inappropriate' 'Acceptable speech|Inappropriate' 'Other threat'
 'Other threat|Other offensive'
 'Acceptable speech|Other offensive|Background offensive'
 'Background offensive|Inappropriate' 'Other offensive|Inappropriate'
 'Other offensive|Background violence' "Acceptable speech|Don't know"
 'Acceptable speech|Other offensive|Inappropriate'
 'Other threat|Background violence' 'Other threat|Acceptable speech'
 'Other threat|Other offensive|Background offensive'
 'Other threat|Acceptable speech|Other offensive'
 'Acceptable

In [81]:
# Experiment 2 is a binary task (Acceptable vs Hate Speech), so it needs a
# column holding only these two classes. Start from a copy of the "Type"
# values in a new column; the following cells rewrite it.
sheet["binary-hate-speech"] = sheet.loc[:, "Type"]

sheet.iloc[:1]

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech


In [82]:
# Experiment 2 uses binary labels (acceptable speech vs hate speech), so the
# values copied from "Type" are transformed accordingly.

# Mark with "discard" every instance annotated with both categories at once,
# i.e. whose label contains "|Acceptable speech" or "Acceptable speech|".
# "|" is the regex alternation operator, so the literal pipes are escaped; the
# pattern is a raw string so that "\|" reaches the regex engine unchanged
# instead of being an invalid escape sequence in a normal string literal.
mixed_pattern = r"\|Acceptable speech|Acceptable speech\|"
is_mixed = sheet["binary-hate-speech"].str.contains(mixed_pattern, regex=True, na=False)
sheet["binary-hate-speech"] = np.where(is_mixed, "discard", sheet["binary-hate-speech"])

# See the remaining labels
sheet["binary-hate-speech"].unique()


array(['Acceptable speech', 'Background offensive', 'discard',
       'Other offensive', 'Other offensive|Background offensive',
       'Background violence', 'Background violence|Background offensive',
       'Inappropriate', 'Other threat', 'Other threat|Other offensive',
       'Background offensive|Inappropriate',
       'Other offensive|Inappropriate',
       'Other offensive|Background violence',
       'Other threat|Background violence',
       'Other threat|Other offensive|Background offensive'], dtype=object)

In [83]:
# Every label other than "Acceptable speech" and "discard" becomes "Hate speech"
already_final = sheet["binary-hate-speech"].isin(["Acceptable speech", "discard"])
sheet["binary-hate-speech"] = np.where(already_final, sheet["binary-hate-speech"], "Hate speech")

sheet.head(5)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech
5,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,Acceptable speech
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
7,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,,discard
8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech


In [84]:
# Number of instances per value of the new "binary-hate-speech" column
sheet["binary-hate-speech"].value_counts()

Acceptable speech    2821
Hate speech          2619
discard               266
Name: binary-hate-speech, dtype: int64

In [85]:
# NOTE(review): this repeats the value counts computed in the previous cell
sheet["binary-hate-speech"].value_counts()

Acceptable speech    2821
Hate speech          2619
discard               266
Name: binary-hate-speech, dtype: int64

In [86]:
# Renumber the rows from 0 before the experiment-specific dataframes are
# created and this one is saved. The old row labels are kept in a new
# "index" column.
sheet = sheet.reset_index()
sheet.head(5)

Unnamed: 0,index,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech
1,5,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,Acceptable speech
2,6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
3,7,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,,discard
4,8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech


In [87]:
# Remove the "index" column (the old row labels) added when the index was reset
sheet = sheet.drop("index", axis="columns")
sheet.iloc[:1]

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech


In [88]:
# Write the prepared dataframe to a tab-separated file for later use
# (the row index is written as the first column)
sheet.to_csv(path_or_buf="data/hate-speech-prepared-spreadsheet.csv", sep="\t")

In [35]:
# The saved dataframe can be loaded again like this; the first column of the
# file is used as the row index
main_df = pd.read_csv(
    "data/hate-speech-prepared-spreadsheet.csv",
    sep="\t",
    index_col=0,
)

main_df.head(5)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech
1,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,Acceptable speech
2,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
3,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,,discard
4,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech


In [33]:
# Text length statistics help to choose the maximum token length when the
# Transformer parameters are set.
# The length of a comment is its number of whitespace-separated words: split()
# turns the "Comment" string into a list of words and len() counts the elements.
word_counts = main_df["Comment"].str.split().str.len()
main_df["text_length"] = word_counts

# See the statistics
main_df["text_length"].describe()

count    5706.000000
mean       34.013495
std        55.640073
min         1.000000
25%         8.000000
50%        18.000000
75%        38.000000
max      1387.000000
Name: text_length, dtype: float64

In [35]:
# Print the same text length statistics as a Markdown table
print(main_df["text_length"].describe().to_markdown())

|       |   text_length |
|:------|--------------:|
| count |     5706      |
| mean  |       34.0135 |
| std   |       55.6401 |
| min   |        1      |
| 25%   |        8      |
| 50%   |       18      |
| 75%   |       38      |
| max   |     1387      |


In [36]:
# Show the comments that are longer than 1000 words
is_very_long = main_df["text_length"] > 1000
main_df.loc[is_very_long]

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech,text_length
66,https://www.facebook.com/228735667216_10153273...,Delfina Jones 1. I don't worship a pedophile. ...,Other offensive,No target|Commenter,1.0,0.0,0.0,1.0,0.0,Hate speech,1387


In [37]:
# Read the full text of the row at position 66 (the very long comment found
# in the previous cell) to check that it is okay
main_df["Comment"].iloc[66]

'Delfina Jones 1. I don\'t worship a pedophile. I respect and adore a prophet. Placing him into the category of a pedophile is beyond pathetic as he lived 1400 YEARS. 1400 years. People change in 1400 years. In the future, people would call us pedophiles for marrying people at the age of 20-28. That\'s the problem that you don\'t seem to understand. It was a norm to marry women at a young age. How can I call someone who lived more than a thousand years ago a rapist if it was a normal thing for EVERY human on the planet at that point to marry young females? Marriage is also a choice in islam. I\'m a young woman yet my parents don\'t force me to marry at a young age. THAT is culture. A culture that has ruined the true image of islam by mixing its own blend to make islam look "nice" for them. But it isn\'t. 2. Slavery is allowed but we rather call it housekeeping as slavery basically means a slave being owned to a master and does everything that the master tells him to do. Islam is agains

We see that 75% of the comments have 38 or fewer words, so we can set a smaller maximum sequence length (in tokens) as a parameter in Transformers.

### Datasets for the experiments

Let's create the datasets for the two experiments. Simple Transformers require the data to be in the following format:

"The train data should be contained in a Pandas Dataframe with at least two columns. One column should contain the text and the other should contain the labels. The text column should be of datatype str, while the labels column should be of datatype int (0 or 1).

If the dataframe has a header row, the text column should have the heading text and the labels column should have the heading labels."


#### Dataset for Experiment 1
In experiment 1, we have two classes: implicit (1) or explicit (0).

In [91]:
# Create a new dataframe from the original sheet, discarding all instances
# that are not annotated for implicitness.
# .copy() makes sheet1 an independent dataframe, so that modifying its columns
# later neither triggers pandas' SettingWithCopyWarning nor depends on whether
# the filtered result shares data with `sheet`.
sheet1 = sheet.dropna(subset=["Implicit"]).copy()

# See the sheet1 size - number of rows and columns:
sheet1.shape

(571, 10)

In [92]:
# Preview the first five rows of sheet1
sheet1.head(5)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
2,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
4,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech
5,https://www.facebook.com/228735667216_10153273...,You only see what you want to see. Pretty much...,Other offensive,Commenter,1.0,1.0,1.0,0.0,0.0,Hate speech
7,https://www.facebook.com/228735667216_10153273...,Delfina Jones are you one of the Donald Trump'...,Other offensive,Commenter,1.0,1.0,1.0,1.0,0.0,Hate speech
8,https://www.facebook.com/228735667216_10153273...,Europeans who immigrated during WW2 weren't br...,Background offensive,Migrants,0.0,,,,,Hate speech


In [93]:
# The categories need to be encoded as integers, not floats.
# astype() on the whole frame returns a new dataframe, so no value is written
# into a frame that was filtered from `sheet` (the column assignment used
# before raised pandas' SettingWithCopyWarning).
sheet1 = sheet1.astype({"Implicit": "int"})

# View the sheet
sheet1.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sheet1["Implicit"] = sheet1["Implicit"].astype("int")


Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
2,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0,,,,,Hate speech


In [94]:
# Build the dataframe in the format required by Simple Transformers: keep only
# the two relevant columns, "Comment" and "Implicit", and name them
# "text" and "labels"
impl_sheet = sheet1[["Comment", "Implicit"]].rename(
    columns={"Comment": "text", "Implicit": "labels"}
)

impl_sheet

Unnamed: 0,text,labels
2,these refugees adult males are cowards for not...,0
4,They are all mentally jerking off to the refug...,0
5,You only see what you want to see. Pretty much...,1
7,Delfina Jones are you one of the Donald Trump'...,1
8,Europeans who immigrated during WW2 weren't br...,0
...,...,...
1089,I'd like military-aged Syrian men to pick up a...,1
1090,It won't happen as oil and water do not -Will ...,1
1094,You basically killed him yourself trying for a...,0
1095,Does he not realize that ISIS terrorists are a...,0


In [95]:
# Write the Experiment 1 dataset to a tab-separated file
impl_sheet.to_csv(path_or_buf="data/implicitness_binary_dataset.csv", sep="\t")

#### Dataset for experiment 2
In this experiment, we are interested in two classes: acceptable speech (0) and hate speech (1).

In [36]:
# Continue from the reloaded dataframe: `sheet` now refers to the same object
# as `main_df` (no copy is made)
sheet = main_df

sheet.iloc[:1]

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech


In [37]:
# Build the Experiment 2 dataframe: every row of the sheet except those
# labelled "discard" in the "binary-hate-speech" column
not_discarded = sheet["binary-hate-speech"].ne("discard")
sheet2 = sheet[not_discarded]

sheet2.shape

(5440, 10)

In [38]:
# Number of instances per class after the "discard" rows were removed
sheet2["binary-hate-speech"].value_counts()

Acceptable speech    2821
Hate speech          2619
Name: binary-hate-speech, dtype: int64

In [39]:
# Encode the labels in binary-hate-speech as integers:
# "Acceptable speech" -> 0, "Hate speech" -> 1.
# Simple Transformers expects the labels column to be of datatype int, so the
# values are cast to int instead of being stored as the strings '0' and '1'.
# The cast fails loudly if any other label is still present, and re-running
# the cell on already encoded labels leaves them unchanged.
# assign() returns a new dataframe, so nothing is written into a frame that
# was filtered from `sheet`.
encoded_labels = (
    sheet2["binary-hate-speech"]
    .replace({"Acceptable speech": "0", "Hate speech": "1"})
    .astype("int")
)
sheet2 = sheet2.assign(**{"binary-hate-speech": encoded_labels})

In [40]:
# Preview the first five rows with the encoded labels
sheet2.head(5)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,0
1,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,0
2,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,1
4,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,1
5,https://www.facebook.com/228735667216_10153273...,You only see what you want to see. Pretty much...,Other offensive,Commenter,1.0,1.0,1.0,0.0,0.0,1


In [41]:
# Build the dataframe in the format required by Simple Transformers: keep only
# the two relevant columns, "Comment" and "binary-hate-speech", and name them
# "text" and "labels"
hs_sheet = sheet2[["Comment", "binary-hate-speech"]].rename(
    columns={"Comment": "text", "binary-hate-speech": "labels"}
)

hs_sheet

Unnamed: 0,text,labels
0,Can we shut up about refugees already?,0
1,Why should we? It's the biggest humanitarian c...,0
2,these refugees adult males are cowards for not...,1
4,They are all mentally jerking off to the refug...,1
5,You only see what you want to see. Pretty much...,1
...,...,...
5701,No multiculturalism = no inter-ethnic violence...,1
5702,"Personally, I agree with those who view multic...",0
5703,And by the way these people represent abt 30% ...,1
5704,This is reality for my country today... Someti...,0


In [42]:
# All data annotated for implicitness (instances up to index 1096) is used as
# test data and everything else as train data.
# The split is derived from a single comparison on the index labels. Label
# slices with .loc include both endpoints, so the two slices used before
# (":1097" and "1097:") overlapped on label 1097 and gave the intended result
# only because the 'train' assignment ran last. assign() also returns a new
# dataframe instead of writing into one derived from another frame.
LAST_TEST_INDEX = 1096
hs_sheet = hs_sheet.assign(
    split=np.where(hs_sheet.index <= LAST_TEST_INDEX, "test", "train")
)

hs_sheet.head()

Unnamed: 0,text,labels,split
0,Can we shut up about refugees already?,0,test
1,Why should we? It's the biggest humanitarian c...,0,test
2,these refugees adult males are cowards for not...,1,test
4,They are all mentally jerking off to the refug...,1,test
5,You only see what you want to see. Pretty much...,1,test


In [44]:
# Preview the last five rows to check the 'train' part of the split
hs_sheet.tail(5)

Unnamed: 0,text,labels,split
5701,No multiculturalism = no inter-ethnic violence...,1,train
5702,"Personally, I agree with those who view multic...",0,train
5703,And by the way these people represent abt 30% ...,1,train
5704,This is reality for my country today... Someti...,0,train
5705,"Hey, lets send someone from Swedish press acro...",1,train


In [45]:
# Number of instances in each part of the split
hs_sheet["split"].value_counts()

train    4391
test     1049
Name: split, dtype: int64

In [46]:
# Write the Experiment 2 dataset to a tab-separated file
hs_sheet.to_csv(path_or_buf="data/hatespeech_binary_dataset.csv", sep="\t")