# Hate Speech Detector - Test data preparation

In [1]:
import csv
import os

from tqdm.notebook import tqdm

In [2]:
def get_necessary_data(in_file_names, out_file_name, columns):
    """Copy the selected columns of several CSV files into one CSV file.

    Every input file is parsed with ``csv.DictReader`` (its first row is the
    header).  For each data row, the values of ``columns`` are collected, in
    the order given by ``columns``.  The rows of all input files are
    concatenated in the order of ``in_file_names`` and written to
    ``out_file_name`` below a single header row containing ``columns``.

    Args:
        in_file_names: Iterable of paths of the CSV files to read.
        out_file_name: Path of the CSV file to write (overwritten if present).
        columns: Sequence of header names to keep.

    Returns:
        A list with one tuple per extracted data row; each tuple holds the
        values of ``columns`` for that row.

    Note:
        An input file that lacks any of the requested columns contributes no
        rows at all; a ``UserWarning`` naming the missing columns is issued
        instead of dropping the file silently.
    """
    import warnings  # local import: only needed for the missing-column notice

    selected = list(columns)
    list_data = []
    for in_file_name in in_file_names:
        # newline='' hands newline handling over to the csv module, which is
        # required for line breaks inside quoted fields to be parsed correctly.
        with open(in_file_name, 'r', newline='') as f:
            # First pass: count the data rows so the progress bar has a total.
            num_rows = sum(1 for _ in csv.DictReader(f))

            # Second pass: rewind and actually extract the values.
            f.seek(0)
            reader = csv.DictReader(f)
            header = reader.fieldnames or []
            missing = [col for col in selected if col not in header]
            if missing:
                warnings.warn(
                    'Skipping {!r}: missing column(s) {}'.format(
                        in_file_name, ', '.join(repr(col) for col in missing)),
                    stacklevel=2,
                )
            if missing or not selected:
                # Nothing can be extracted from this file.
                continue

            for row in tqdm(reader, total=num_rows):
                list_data.append(tuple(row[col] for col in selected))

    # newline='' avoids an extra '\r' per line on platforms whose native line
    # ending is '\r\n'.
    with open(out_file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        writer.writerows(tqdm(list_data))

    return list_data

In [3]:
def get_csv_data(file_name):
    """Read a CSV file completely and return its rows.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row, in file order; each entry is the
        list of string fields produced by ``csv.reader``.  The first row of
        the file is returned like any other row.
    """
    # newline='' hands newline handling over to the csv module, which is
    # required for line breaks inside quoted fields to be parsed correctly.
    with open(file_name, 'r', newline='') as f:
        rows = list(csv.reader(f))
    # Iterating through tqdm keeps the progress bar of the notebook cell.
    return list(tqdm(rows))

### Data analyses to perform:
    1) Which hashtags/mentions denote hate speech or no-hate speech?
    2) How popular are the tweets denoted as hate speech (number of likes, retweets, replies)?
    3) Which users (username, name) from test data use hate speech (examples)?
    4) What do the hour/day-of-month histograms for all, hate-speech and non-hate-speech tweets look like?

In [4]:
# Reduce the raw tweet dumps to the columns needed for the analyses.
# Each output file is built only if it does not exist yet.
necessary_cols = [
    'tweet', 'date', 'time', 'username', 'name',
    'hashtags', 'mentions', 'retweets_count', 'likes_count', 'reply_to',
]
necessary_sources = {
    'tests/necessary_en.csv': ['tests/hsd_brexit_en.csv'],
    'tests/necessary_pl.csv': ['tests/hsd_owsiak_pl.csv', 'tests/hsd_wosp_pl.csv'],
}
for necessary_path, raw_paths in necessary_sources.items():
    if os.path.exists(necessary_path):
        continue
    get_necessary_data(raw_paths, necessary_path, necessary_cols)

In [5]:
# Extract only the tweet text from the reduced files.
# Each output file is built only if it does not exist yet.
for necessary_path, tweets_path in (
    ('tests/necessary_en.csv', 'tests/tweets_en.csv'),
    ('tests/necessary_pl.csv', 'tests/tweets_pl.csv'),
):
    if os.path.exists(tweets_path):
        continue
    get_necessary_data([necessary_path], tweets_path, ['tweet'])