In [16]:
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
from contrastive_addition.temp.cleaning_functions import *

In [3]:
# Fetch the NLTK data packages: the stop-word lists and the Punkt tokenizer
# models (Punkt is what nltk.tokenize.word_tokenize, imported above, relies on).
# NOTE(review): presumably both are needed by the cleaning helpers pulled in via
# the star import of cleaning_functions — confirm against that module.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raymond/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/raymond/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
def _print_split_summary(name, split):
    """Print the row count and the share of rows labelled 1 for one split.

    Args:
        name (str): heading printed above the numbers (e.g. 'base_test').
        split (pd.DataFrame): frame with a 'label' column.
    """
    abuse = len(split[split['label']==1])
    # an empty split has no meaningful ratio; report nan instead of raising ZeroDivisionError
    pct_abuse = np.round(abuse/len(split),3) if len(split) else float('nan')
    print(f'{name}:\nlen: {len(split)}, pct_abuse: {pct_abuse}')


def load_wiki(folderpath):
    """Loads raw wiki data (Wulczyn2017) from folder, cleans text and returns train, test sets.
    See https://github.com/ewulczyn/wiki-detox/

    Reads 'attack_annotated_comments.tsv' and 'attack_annotations.tsv' from
    folderpath, derives a binary majority-vote label per comment, strips the
    placeholder tokens from the comment text, runs clean_data and splits the
    rows on the dataset's own 'split' column.

    Args:
        folderpath (str): location of raw dataset.

    Returns:
        pd.DataFrame, pd.DataFrame: train and test sets as pd.Dataframe, each
        holding the columns 'text' (the cleaned text) and 'label'.
    """
    # the first column of the comments file becomes the index (presumably rev_id — verify against the data)
    df = pd.read_csv(f'{folderpath}/attack_annotated_comments.tsv', sep = '\t', index_col = 0)
    annotations = pd.read_csv(f'{folderpath}/attack_annotations.tsv',  sep = '\t')
    # labels a comment as an attack if the majority of annotators did so
    # (strictly more than half: an exact tie is labelled 0)
    labels = annotations.groupby('rev_id')['attack'].mean() > 0.5
    # join binary labels to comments; the assignment aligns on the index, so
    # comments without any annotation end up with a NaN label
    df['label'] = labels * 1
    # remove newline, tab tokens and ==
    # .str.replace passes missing comments through as NaN (a plain str.replace
    # inside apply would raise on them), so clean_data can drop those rows;
    # regex=False keeps the tokens literal on every pandas version
    for token, replacement in (("NEWLINE_TOKEN", " "), ("TAB_TOKEN", " "), ("==", "")):
        df['comment'] = df['comment'].str.replace(token, replacement, regex=False)
    # rename columns
    df = df.rename(columns = {'comment': 'text'})
    # clean data
    df = clean_data(df)
    # create train, test sets
    print('\n--Creating base test and train sets---')
    # every row whose split is not 'test' goes into the training set
    test = df[df['split']=='test']
    train = df[df['split']!='test']
    # keep cols: expose the cleaned text under the name 'text'
    test = test[['clean_text', 'label']].rename(columns = {'clean_text':'text'})
    train = train[['clean_text', 'label']].rename(columns = {'clean_text':'text'})
    _print_split_summary('base_test', test)
    _print_split_summary('base_train', train)
    return train, test

def load_tweets(folderpath):
    """Read the raw tweets dataset (Founta2018), clean it and split it into train and test.

    The file 'hatespeech_text_label_vote.csv' is read from folderpath as a
    header-less, tab-separated table whose first three columns are taken as
    text, label and vote. Labels 'spam' and 'normal' are mapped to 0, every
    other label to 1.

    Args:
        folderpath (str): location of raw dataset.

    Returns:
        pd.DataFrame, pd.DataFrame: train and test sets as pd.Dataframe, each
        holding the columns 'text' (the cleaned text) and 'label' (the binary label).
    """
    benign_labels = ('spam', 'normal')
    column_names = {0:'text', 1:'label', 2:'vote'}
    keep_cols = ['clean_text', 'binary_label']
    final_names = {'clean_text':'text', 'binary_label':'label'}

    raw = pd.read_csv(f'{folderpath}/hatespeech_text_label_vote.csv', sep = '\t',
        encoding='utf-8', header=None)
    # name the positional columns; the tweet column goes straight to 'text'
    raw = raw.rename(columns = column_names)
    # binarize labels: 0 for the benign classes, 1 for anything else
    raw['binary_label'] = raw['label'].map(lambda x: int(x not in benign_labels))
    # clean data
    cleaned = clean_data(raw)
    # hold out test_size=0.101 of the rows (roughly 10%) with a fixed seed
    # NOTE(review): train_test_split is not imported in the visible import cell;
    # presumably it arrives through the star import — confirm
    train, test = train_test_split(cleaned, test_size=0.101, shuffle = True, random_state=42)
    # create train, test sets
    print('\n--Creating base test and train sets---')
    test = test[keep_cols].rename(columns = final_names)
    abuse = int((test['label'] == 1).sum())
    print(f'base_test:\nlen: {len(test)}, pct_abuse: {np.round(abuse/len(test),3)}')
    train = train[keep_cols].rename(columns = final_names)
    abuse = int((train['label'] == 1).sum())
    print(f'base_train:\nlen: {len(train)}, pct_abuse: {np.round(abuse/len(train),3)}')
    return train, test

def clean_data(df):
    """Run the cleaning pipeline built from the helpers in cleaning_functions.py.

    Steps, in order: drop rows with a missing 'text', drop duplicate 'text'
    rows, derive 'clean_text' by applying clean_text to 'text', drop rows
    whose 'clean_text' is empty, drop rows whose 'clean_text' is only
    URL + EMOJI, then add a 'text_length' column with len() of 'clean_text'
    and print its summary statistics. A banner is printed before each step.

    Args:
        df (pd.DataFrame): input dataframe; must contain a 'text' column.

    Returns:
        pd.DataFrame: output cleaned dataframe with the added columns
        'clean_text' and 'text_length'.
    """
    print('\n---Dropping NaNs---')
    without_nans = drop_nans(df, subset_col = 'text', verbose = True)

    print('\n---Dropping duplicates---')
    unique_rows = drop_duplicates(without_nans, subset_col = 'text', verbose = True)

    print('\n---Cleaning text---')
    unique_rows['clean_text'] = unique_rows['text'].apply(clean_text)

    print('\n---Dropping empty text entries---')
    non_empty = drop_empty_text(unique_rows, subset_col = 'clean_text', verbose = True)

    print('\n---Dropping text entries with only URL + EMOJI---')
    cleaned = drop_url_emoji(non_empty, subset_col = 'clean_text', verbose = True)

    print('\n---Checking text length---')
    # length of each cleaned entry, as given by len()
    cleaned['text_length'] = cleaned['clean_text'].map(len)
    print('Summary statistics of text length:')
    length_stats = cleaned['text_length'].describe()
    print(length_stats)
    return cleaned

In [19]:
# Build the cleaned Wikipedia train/test frames; the relative path is resolved
# against the notebook's working directory.
train, test = load_wiki('./data/Wulczyn2017/')


---Dropping NaNs---

Orig len: 115864,
            Num of dropped values: 0,
            New len: 115864

---Dropping duplicates---

Orig len: 115864,
            Num of dropped values: 173,
            New len: 115691

---Cleaning text---

---Dropping empty text entries---

Orig len: 115691,
            Num of dropped values: 4,
            New len: 115687

---Dropping text entries with only URL + EMOJI---

Orig len: 115687,
            Num of dropped values: 5,
            New len: 115682

---Checking text length---
Summary statistics of text length:
count    115682.000000
mean        401.943353
std         733.893019
min           1.000000
25%          89.000000
50%         196.000000
75%         423.000000
max       10000.000000
Name: text_length, dtype: float64

--Creating base test and train sets---
base_test:
len: 23139, pct_abuse: 0.119
base_train:
len: 92543, pct_abuse: 0.117


{0, 1}