## Data Preparation
First we need to prepare the data so that we can input it into the model.

In [16]:
# load the necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Open the spreadsheet that you downloaded from Google Sheets
sheet = pd.read_excel("data/Implicit hate speech.xlsx")

# View the first 5 instances of the sheet
sheet.head()

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
0,,,,,,,,,
1,"As part of #myrefugeestory, Mohamed Madi and B...",,,,,,,,
2,https://www.facebook.com/228735667216_10153273...,,,,,,,,
3,###,,,,,,,,
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,


In [3]:
# Let's look at the statistics of the sheet
sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5868,5782,5791,5791,571.0,329.0,329.0,329.0,241.0
unique,5838,5722,27,31,,,,,
top,###,-*-*-*- Mary mother of Jesus is depicted weari...,Acceptable speech,No target,,,,,
freq,31,11,2873,3015,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [4]:
# In our experiments, we are interested only in comments, not posts (posts are not annotated), so let's remove all instances that don't have anything (have NaN values) in the Type column
sheet = sheet.dropna(subset=["Type"])

# Let's see the results
sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5791,5782,5791,5791,571.0,329.0,329.0,329.0,241.0
unique,5791,5722,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,-*-*-*- Mary mother of Jesus is depicted weari...,Acceptable speech,No target,,,,,
freq,1,11,2873,3015,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [5]:
# Let's look at the new sheet (first 3 instances)
sheet.head(3)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,
5,https://www.facebook.com/228735667216_10153273...,-*-*-*- Why should we? It's the biggest humani...,Acceptable speech,No target,,,,,
6,https://www.facebook.com/228735667216_10153273...,-*-*-*- these refugees adult males are cowards...,Background offensive,Migrants,0.0,,,,


In [6]:
# Following Nikola's instruction, we'll also delete the "-*-*-*- " part from the comments.
# We need to add "\" before the "*" because otherwise it is understood as a special regex character.

sheet["Comment"] = sheet["Comment"].str.replace("-\*-\*-\*- ", "")

# Let's look at the sheet
sheet.head(5)

  sheet["Comment"] = sheet["Comment"].str.replace("-\*-\*-\*- ", "")


Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,
5,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,
7,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,
8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,


In [8]:
# Let's delete the empty comments because they do not carry any information.
# After this, the count of ### and Comment will be the same.
sheet = sheet.dropna(subset=["Comment"])

sheet.describe(include="all")

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution
count,5782,5782,5782,5782,571.0,329.0,329.0,329.0,241.0
unique,5782,5707,27,31,,,,,
top,https://www.facebook.com/228735667216_10153273...,Mary mother of Jesus is depicted wearing it in...,Acceptable speech,No target,,,,,
freq,1,11,2864,3006,,,,,
mean,,,,,0.576182,0.316109,0.106383,0.303951,0.33195
std,,,,,0.494595,0.465664,0.308797,0.460663,0.471893
min,,,,,0.0,0.0,0.0,0.0,0.0
25%,,,,,0.0,0.0,0.0,0.0,0.0
50%,,,,,1.0,0.0,0.0,0.0,0.0
75%,,,,,1.0,1.0,0.0,1.0,1.0


In [9]:
# Let's analyze the statistics of the Implicit which interests us in Experiment 1
print(sheet.Implicit.value_counts())
print(sheet.Implicit.value_counts(normalize = True))

1.0    329
0.0    242
Name: Implicit, dtype: int64
1.0    0.576182
0.0    0.423818
Name: Implicit, dtype: float64


In [10]:
# Let's analyze the statistics of the Type which will interest us in Experiment 2
# See which classes we have:
print(sheet.Type.unique())
# See the statistics:
print(sheet.Type.value_counts())

['Acceptable speech' 'Background offensive'
 'Acceptable speech|Other offensive' 'Other offensive'
 'Acceptable speech|Background offensive'
 "Acceptable speech|Other offensive|Don't know"
 'Other offensive|Background offensive' 'Background violence'
 'Background violence|Background offensive'
 'Other offensive|Acceptable speech'
 "Other offensive|Acceptable speech|Background violence|Don't know"
 "Acceptable speech|Other offensive|Don't know|Background offensive"
 'Inappropriate' 'Acceptable speech|Inappropriate' 'Other threat'
 'Other threat|Other offensive'
 'Acceptable speech|Other offensive|Background offensive'
 'Background offensive|Inappropriate' 'Other offensive|Inappropriate'
 'Other offensive|Background violence' "Acceptable speech|Don't know"
 'Acceptable speech|Other offensive|Inappropriate'
 'Other threat|Background violence' 'Other threat|Acceptable speech'
 'Other threat|Other offensive|Background offensive'
 'Other threat|Acceptable speech|Other offensive'
 'Acceptable

In [11]:
# As the Experiment 2 will be a binary task: Acceptable vs Hate Speech, we need to create a column with only these two classes.
# First, let's copy the values in a new column
sheet["binary-hate-speech"] = sheet["Type"]

sheet.head(1)

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech


In [12]:
# As Experiment 2 will be based on binary values: acceptable speech and hate speech, let's transform the labels from the Type accordingly.

# mark with "discard" all instances which are annotated with both categories at the same time (contains "|Acceptable speech" or "Acceptable speech|" - or is marked by "|" and you need to add "\" before the symbol "|" in the string "|Acceptable speech" to find it)
sheet['binary-hate-speech'] = np.where(sheet['binary-hate-speech'].str.contains("\|Acceptable speech|Acceptable speech\|", regex = True, na = False),'discard',sheet['binary-hate-speech'])

# See the remaining labels
sheet["binary-hate-speech"].unique()


array(['Acceptable speech', 'Background offensive', 'discard',
       'Other offensive', 'Other offensive|Background offensive',
       'Background violence', 'Background violence|Background offensive',
       'Inappropriate', 'Other threat', 'Other threat|Other offensive',
       'Background offensive|Inappropriate',
       'Other offensive|Inappropriate',
       'Other offensive|Background violence',
       'Other threat|Background violence',
       'Other threat|Other offensive|Background offensive'], dtype=object)

In [13]:
# mark as "Hate speech" all instances that are not "Acceptable speech" or "discard"
sheet['binary-hate-speech'] = np.where(((sheet['binary-hate-speech'] != "Acceptable speech") & (sheet['binary-hate-speech'] != "discard")),'Hate speech',sheet['binary-hate-speech'])

sheet.head()

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
4,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech
5,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,Acceptable speech
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
7,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,,discard
8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech


In [14]:
# Analyze the statistics of the results
sheet["binary-hate-speech"].value_counts()

Acceptable speech    2864
Hate speech          2652
discard               266
Name: binary-hate-speech, dtype: int64

In [15]:
# Save the analyzed dataframe for further use
sheet.to_csv("data/hate-speech-prepared-spreadsheet.csv", sep="\t", index=False)

In [18]:
# You can reopen the dataframe in such way:
main_df = pd.read_csv("data/hate-speech-prepared-spreadsheet.csv", sep= "\t")

main_df.head()

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
0,https://www.facebook.com/228735667216_10153273...,Can we shut up about refugees already?,Acceptable speech,No target,,,,,,Acceptable speech
1,https://www.facebook.com/228735667216_10153273...,Why should we? It's the biggest humanitarian c...,Acceptable speech,No target,,,,,,Acceptable speech
2,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for not...,Background offensive,Migrants,0.0,,,,,Hate speech
3,https://www.facebook.com/228735667216_10153273...,Does Syria own the BBC?.........,Acceptable speech|Other offensive,No target|Journalist or medium,,,,,,discard
4,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refug...,Background offensive,Migrants,0.0,,,,,Hate speech


In [19]:
# Let's analyze the average text length (this will be useful to know what should the max token length be when setting the Transformer parameters)
# Add a column with this information - we calculate the length by first splitting the string in column "Comment" to a list of words (split()) and then calculating the number of elements in the list (len())
main_df['text_length'] = main_df["Comment"].str.split().str.len()

# See the statistics
main_df.text_length.describe()

count    5782.000000
mean       33.821861
std        55.474635
min         1.000000
25%         8.000000
50%        18.000000
75%        38.000000
max      1387.000000
Name: text_length, dtype: float64

We see that 75% of comments have 38 or less words, so we can use a smaller maximum token size as a parameter in Transformers.

### Datasets for the experiments

Let's create the datasets for the two experiments. Simple Transformers require the data to be in the following format:

"The train data should be contained in a Pandas Dataframe with at least two columns. One column should contain the text and the other should contain the labels. The text column should be of datatype str, while the labels column should be of datatype int (0 or 1).

If the dataframe has a header row, the text column should have the heading text and the labels column should have the heading labels."


#### Dataset for Experiment 1
In experiment 1, we have two classes: implicit (1) or explicit (0).

In [20]:
# Create a new dataframe from the original sheet, discarding all instances that are not annotated for implicitness.
sheet1 = sheet.dropna(subset=["Implicit"])

# See the sheet1 size - number of rows and columns:
sheet1.shape

(571, 10)

In [21]:
# View the sheet1
sheet1.head()

Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for no...,Background offensive,Migrants,0.0,,,,,Hate speech
8,https://www.facebook.com/228735667216_10153273...,They are all mentally jerking off to the refu...,Background offensive,Migrants,0.0,,,,,Hate speech
9,https://www.facebook.com/228735667216_10153273...,You only see what you want to see. Pretty muc...,Other offensive,Commenter,1.0,1.0,1.0,0.0,0.0,Hate speech
11,https://www.facebook.com/228735667216_10153273...,Delfina Jones are you one of the Donald Trump...,Other offensive,Commenter,1.0,1.0,1.0,1.0,0.0,Hate speech
12,https://www.facebook.com/228735667216_10153273...,Europeans who immigrated during WW2 weren't b...,Background offensive,Migrants,0.0,,,,,Hate speech


In [25]:
# Because the categories need to be encoded as integers not floats, we change that:
sheet1["Implicit"] = sheet1["Implicit"].astype("int")

# View the sheet
sheet1.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sheet1["Implicit"] = sheet1["Implicit"].astype("int")


Unnamed: 0,###,Comment,Type,Target,Implicit,Metaphor/metonymy,Sarcasm/humor,Rhetorical question,Circumlocution,binary-hate-speech
6,https://www.facebook.com/228735667216_10153273...,these refugees adult males are cowards for no...,Background offensive,Migrants,0,,,,,Hate speech


In [27]:
# Now we prepare the dataframe as instructed by the SimpleTransformers, taking only the two relevant columns - Comment and Implicit
impl_sheet = sheet1[["Comment", "Implicit"]]

# Rename the columns
impl_sheet.columns=["text", "labels"]

impl_sheet

Unnamed: 0,text,labels
6,these refugees adult males are cowards for no...,0
8,They are all mentally jerking off to the refu...,0
9,You only see what you want to see. Pretty muc...,1
11,Delfina Jones are you one of the Donald Trump...,1
12,Europeans who immigrated during WW2 weren't b...,0
...,...,...
1141,I'd like military-aged Syrian men to pick up a...,1
1142,It won't happen as oil and water do not -Will ...,1
1146,You basically killed him yourself trying for a...,0
1147,Does he not realize that ISIS terrorists are a...,0


In [29]:
# Save the created file in a csv format
impl_sheet.to_csv("data/implicitness_binary_dataset.csv", sep="\t", index=False)

#### Dataset for experiment 2
In this experiment, we are interested in two classes: acceptable speech (0) and hate speech (1).

In [32]:
# Create a new dataframe from the original sheet, discarding all instances that have "discard" in column.
sheet2 = sheet[sheet["binary-hate-speech"] != "discard"]

sheet2.shape

(5516, 10)

In [37]:
# See the statistics
sheet2["binary-hate-speech"].value_counts()

Acceptable speech    2864
Hate speech          2652
Name: binary-hate-speech, dtype: int64

In [40]:
# Change the labels in binary-hate-speech into 0 and 1
sheet2['binary-hate-speech'] = np.where((sheet2['binary-hate-speech'] == "Acceptable speech"),'0',sheet2['binary-hate-speech'])
sheet2['binary-hate-speech'] = np.where((sheet2['binary-hate-speech'] == "Hate speech"),'1',sheet2['binary-hate-speech'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sheet2['binary-hate-speech'] = np.where((sheet2['binary-hate-speech'] == "Acceptable speech"),'0',sheet2['binary-hate-speech'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sheet2['binary-hate-speech'] = np.where((sheet2['binary-hate-speech'] == "Hate speech"),'1',sheet2['binary-hate-speech'])


In [42]:
# Now we prepare the dataframe as instructed by the SimpleTransformers, taking only the two relevant columns - Comment and 'binary-hate-speech'
hs_sheet = sheet2[["Comment", "binary-hate-speech"]]

# Rename the columns
hs_sheet.columns=["text", "labels"]

hs_sheet

Unnamed: 0,text,labels
4,Can we shut up about refugees already?,0
5,Why should we? It's the biggest humanitarian ...,0
6,these refugees adult males are cowards for no...,1
8,They are all mentally jerking off to the refu...,1
9,You only see what you want to see. Pretty muc...,1
...,...,...
5865,No multiculturalism = no inter-ethnic violence...,1
5866,"Personally, I agree with those who view multic...",0
5867,And by the way these people represent abt 30% ...,1
5868,This is reality for my country today... Someti...,0


In [43]:
# Save the created file in a csv format
hs_sheet.to_csv("data/hatespeech_binary_dataset.csv", sep="\t", index=False)