## Importing libraries

In [15]:
import csv
import functools
import os
import re

import emoji

## Data Pre-Processing

As a first step, we remove all the `<MENTION_X>` and `<URL>` placeholder strings from the dataset, so that the training and test sets contain only the actual text.


In [16]:

# Raw input datasets (tab-separated files).
input_train_file = "Datasets/raw_training_data.tsv"
input_test_file = "Datasets/raw_test_data.tsv"

# The cleaned copies keep the same file names but are written to this folder.
output_folder = "OutputDatasets"
output_train_file = "raw_training_data.tsv"
output_test_file = "raw_test_data.tsv"

# Matches the literal tokens <MENTION_n> (n = one or more digits) and <URL>,
# together with any whitespace that immediately follows them.
pattern = r"<(MENTION_\d+|URL)>\s*"

def remove_mention(input_file, output_file, output_folder):
    """Copy a TSV file, deleting every ``pattern`` match from each cell.

    Every row of the input (the first row included) is processed and written
    to the output in the same order.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: File name of the cleaned copy; it is joined onto
            ``output_folder``.
        output_folder: Directory that receives the cleaned copy. It is created
            when missing; an empty string means the current working directory.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test; the guard keeps an empty folder name away from
    # os.makedirs, which rejects "".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, output_file)

    # Compile once per file instead of resolving the pattern for every cell.
    placeholder_re = re.compile(pattern)

    with open(input_file, "r", newline="", encoding="utf-8") as file_in, \
            open(output_path, "w", newline="", encoding="utf-8") as file_out:
        reader = csv.reader(file_in, delimiter="\t")
        writer = csv.writer(file_out, delimiter="\t")
        writer.writerows(
            [placeholder_re.sub("", cell) for cell in row] for row in reader
        )

# Strip the placeholder strings from both splits; the results go to output_folder.
remove_mention(input_train_file, output_train_file, output_folder)
remove_mention(input_test_file, output_test_file, output_folder)

We have noticed the presence of emojis in the dataset.
The emojis are removed because they carry no useful information for this task and only add noise.

In [17]:
# Inputs for this step: files located in the "OutputDatasets" folder.
input_train_file = "OutputDatasets/raw_training_data.tsv"
input_test_file = "OutputDatasets/raw_test_data.tsv"

# The emoji-free copies keep the same file names but go to a separate folder.
output_folder = "OutputDatasetsNoEmoji"
output_train_file = "raw_training_data.tsv"
output_test_file = "raw_test_data.tsv"

@functools.lru_cache(maxsize=None)
def get_emoji_regexp():
    """Return a compiled regex that matches any key of ``emoji.EMOJI_DATA``.

    The keys are sorted longest-first so that, inside the alternation, a longer
    sequence is tried before any shorter key that is a prefix of it.

    The result is cached, so the sort/join/compile work runs once instead of
    on every call (``remove_emoji`` calls this function for each cell).

    Returns:
        A compiled pattern with a single capturing group around the
        alternation of all escaped keys.
    """
    longest_first = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    alternation = "(" + "|".join(re.escape(seq) for seq in longest_first) + ")"
    return re.compile(alternation)

def remove_emoji(string):
    """Delete every emoji match from ``string`` and trim surrounding whitespace.

    Args:
        string: Text to clean.

    Returns:
        The text with all matches of ``get_emoji_regexp()`` removed and with
        leading/trailing whitespace stripped.
    """
    emoji_re = get_emoji_regexp()
    without_emoji = emoji_re.sub("", string)
    return without_emoji.strip()

def remove_emoji_from_tsv(input_file, output_file, output_folder):
    """Copy a TSV file, passing every cell through ``remove_emoji``.

    Every row of the input (the first row included) is processed and written
    to the output in the same order.

    Args:
        input_file: Path of the tab-separated file to read.
        output_file: File name of the cleaned copy; it is joined onto
            ``output_folder``.
        output_folder: Directory that receives the cleaned copy. It is created
            when missing; an empty string means the current working directory.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test; the guard keeps an empty folder name away from
    # os.makedirs, which rejects "".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, output_file)

    with open(input_file, "r", newline="", encoding="utf-8") as file_in, \
            open(output_path, "w", newline="", encoding="utf-8") as file_out:
        reader = csv.reader(file_in, delimiter="\t")
        writer = csv.writer(file_out, delimiter="\t")
        writer.writerows([remove_emoji(cell) for cell in row] for row in reader)

# Remove emojis from both splits; the results go to output_folder.
remove_emoji_from_tsv(input_train_file, output_train_file, output_folder)
remove_emoji_from_tsv(input_test_file, output_test_file, output_folder)

## Reading the train set and the test set

In [18]:


# Read from the "OutputDatasetsNoEmoji" folder, i.e. the emoji-free copies.
input_train_file = "OutputDatasetsNoEmoji/raw_training_data.tsv"
input_test_file = "OutputDatasetsNoEmoji/raw_test_data.tsv"

def read_set(input_set):
    """Load the texts and their label pairs from a tab-separated dataset file.

    The first row is treated as a header and discarded. For every following
    non-empty row, column 1 is taken as the text and columns 2 and 3 are
    parsed as integers.

    Args:
        input_set: Path of the TSV file to read.

    Returns:
        A ``(features, labels)`` tuple: ``features`` is the list of texts and
        ``labels`` is the parallel list of ``(misogynous, aggressiveness)``
        integer tuples. Both lists are empty when the file has no data rows.

    Raises:
        IndexError: A non-empty row has fewer than four columns.
        ValueError: A label column cannot be parsed as an integer.
    """
    features = []
    labels = []
    # newline="" leaves line-ending handling to the csv module, as the csv
    # documentation recommends for files passed to csv.reader.
    with open(input_set, "r", newline="", encoding="utf-8") as tsv_file:
        tsv_reader = csv.reader(tsv_file, delimiter="\t")
        # Skip the header; the default avoids StopIteration on an empty file.
        next(tsv_reader, None)

        # Collect the text and the two labels of each data row.
        for row in tsv_reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            features.append(row[1])
            labels.append((int(row[2]), int(row[3])))
    return features, labels

# read_set returns a (features, labels) tuple, so train_set[0] holds the
# features and train_set[1] holds the labels (likewise for test_set).
train_set = read_set(input_train_file)
test_set = read_set(input_test_file)


Text: Fatti trovare  te lo do volentieri e ti sborro in bocca
Misogynous: 1
Aggressiveness: 1

Text: Tu dovresti ricominciare dai semafori a fare la lavavetri..ma tranquilla tanto il vitalizio ti resterà in gola
Misogynous: 1
Aggressiveness: 1

Text: Amore,sei presentabile? Xchè così via Skype ti faccio conoscere i miei parenti..." e fu così che da un tablet ho conosciuto i parenti terroni della mia tettona,prendendomi un "avete già finito di pranzare? beh si,voi al nord siete diversi..." Dal nord è tutto,a voi..."
Misogynous: 1
Aggressiveness: 1

Text: Salvo poi mandare la culona a Mosca, aummaumm, per farsi dare da Putin commesse multimiliardarie e metterlo in quel posto ai FRATELLI della UE (tra i quali NOI)
Misogynous: 1
Aggressiveness: 0

Text: Vediamo Gentiloni, è ora di finirla di essere servili! Pensiamo al danno che la Francia ci ha fatto con Sarkosy e la culona della Merkel con il bombardamento della Libia
Misogynous: 1
Aggressiveness: 1

Text: Aveva voglia di gridare tutta l