In [1]:
import pandas as pd
import re
import json
import numpy as np
import emoji
import os

In [2]:
DATASETS_FOLDER = "/Users/slavkoz/OneDrive - Univerza v Ljubljani/Datasets/Offensive language datasets/"
#DATASETS_FOLDER = "/home/slavkoz/Datasets/Offensive language datasets/"

TEXT_ONLY_DF = pd.DataFrame(columns=["text"])

def concat_text_df(df):
    print(f"\t{len(df)} lines")   
    #print("\n".join(df["text"].tail(5).tolist())) 
    return pd.concat([TEXT_ONLY_DF, df[["text"]]], ignore_index=True, axis=0)

def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def create_tweet_dict(df):
    tweet_id_to_text = dict()
    for index, row in df.iterrows():
        tweet_id_to_text[row["id"]] = row["text"]
    return tweet_id_to_text

In [3]:
# DATASET 01
print("Dataset 01 loading ...")
full_dataset = []

dataset_path = os.path.join(DATASETS_FOLDER, '01_jigsaw-toxic-comment-classification-challenge/train.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'comment_text': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():
    if row["toxic"] == 1:
        full_dataset.append([1, row["text"], "toxic"])
    if row["severe_toxic"] == 1:
        full_dataset.append([1, row["text"], "severe_toxic"])
    if row["obscene"] == 1:
        full_dataset.append([1, row["text"], "obscene"])
    if row["threat"] == 1:
        full_dataset.append([1, row["text"], "threat"])
    if row["insult"] == 1:
        full_dataset.append([1, row["text"], "insult"])
    if row["identity_hate"] == 1:
        full_dataset.append([1, row["text"], "identity_hate"])

dataset_path = os.path.join(DATASETS_FOLDER, '01_jigsaw-toxic-comment-classification-challenge/test.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'comment_text': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

# Only some examples from test set used for evaluation
mapper = {"toxic": set(), "severe_toxic": set(), "obscene": set(), "threat": set(), "insult": set(), "identity_hate": set()}
dataset_path = os.path.join(DATASETS_FOLDER, '01_jigsaw-toxic-comment-classification-challenge/test_labels.csv')
df_test_labels = pd.read_csv(dataset_path)
for index, row in df_test_labels.iterrows():
    if row["toxic"] == 1:
        mapper["toxic"].add(row["id"])
    if row["severe_toxic"] == 1:
        mapper["severe_toxic"].add(row["id"])
    if row["obscene"] == 1:
        mapper["obscene"].add(row["id"])
    if row["threat"] == 1:
        mapper["threat"].add(row["id"])
    if row["insult"] == 1:
        mapper["insult"].add(row["id"])
    if row["identity_hate"] == 1:
        mapper["identity_hate"].add(row["id"])
        
for index, row in df.iterrows():
    if row["id"] in mapper["toxic"]:
        full_dataset.append([1, row["text"], "toxic"])
    if row["id"] in mapper["severe_toxic"]:
        full_dataset.append([1, row["text"], "severe_toxic"])
    if row["id"] in mapper["obscene"]:
        full_dataset.append([1, row["text"], "obscene"])
    if row["id"] in mapper["threat"]:
        full_dataset.append([1, row["text"], "threat"])
    if row["id"] in mapper["insult"]:
        full_dataset.append([1, row["text"], "insult"])
    if row["id"] in mapper["identity_hate"]:
        full_dataset.append([1, row["text"], "identity_hate"])


df1 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df1["label"].value_counts()

Dataset 01 loading ...
	159571 lines
	153164 lines


toxic            21384
obscene          12140
insult           11304
identity_hate     2117
severe_toxic      1962
threat             689
Name: label, dtype: int64

In [4]:
# DATASET 02
print("Dataset 02 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '02_davidsons_dataset.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
mapper = {0: "hate_speech", 1: "offensive_language", 2: "neither"}
for index, row in df.iterrows():
    
    full_dataset.append([2, row["text"], mapper[row["class"]]])

df2 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df2["label"].value_counts()

Dataset 02 loading ...
	24783 lines


offensive_language    19190
neither                4163
hate_speech            1430
Name: label, dtype: int64

In [5]:
# DATASET 03
print("Dataset 03 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '03_HASOC_2019_english_dataset/english_dataset.tsv')
df = pd.read_csv(dataset_path, sep='\t')
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
mapper = {"NONE": "none", "HATE": "hate", "OFFN": "offensive", "PRFN": "profane"}
for index, row in df.iterrows():
    full_dataset.append([3, row["text"], mapper[row["task_2"]]])

df3 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df3["label"].value_counts()

Dataset 03 loading ...
	5852 lines


none         3591
hate         1143
profane       667
offensive     451
Name: label, dtype: int64

In [6]:
# DATASET 04
print("Dataset 04 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '04_Waseems_dataset_detect_hate_speech_data.csv')
df = pd.read_csv(dataset_path, sep='|')
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():
    full_dataset.append([4, row["text"], row["label"]])

df4 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df4["label"].value_counts()

Dataset 04 loading ...
	20894 lines


none          16844
sexism         3963
homophobia       87
Name: label, dtype: int64

In [7]:
# DATASET 06
print("Dataset 06 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '06_Reynolds formspring/formspring_data.csv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'post': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():
    labels = [row["ans1"], row["ans2"], row["ans3"]]
    if labels.count("Yes") > 1:
        full_dataset.append([6, row["text"], "cyberbullying"])
    else:
        full_dataset.append([6, row["text"], "none"])

df6 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df6["label"].value_counts()

Dataset 06 loading ...
	12773 lines


none             11997
cyberbullying      776
Name: label, dtype: int64

In [8]:
# DATASET 07
print("Dataset 07 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '07_founta/hatespeech_text_label_vote.csv')
df = pd.read_csv(dataset_path, sep='\t', header=None)
df = df.rename(columns={0: 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():
    full_dataset.append([7, row["text"], row[1]])

df7 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df7["label"].value_counts()

Dataset 07 loading ...
	99996 lines


normal     53851
abusive    27150
spam       14030
hateful     4965
Name: label, dtype: int64

In [9]:
# DATASET 08

#TODO: files are password protected
#print("Dataset 08 loading ...")
#dataset_path = os.path.join(DATASETS_FOLDER, '08_AMI_IBEREVAL2018')
#df = pd.read_csv(dataset_path, sep='\t')
#df = df.rename(columns={'post': 'text'})
#TEXT_ONLY_DF = concat_text_df(df)

#dfTrain = pd.read_csv("../data/iberEval/en_AMI_TrainingSet.csv", sep=";")
#    mysog_tweets = dfTrain[dfTrain["misogyny_category"] != "0"]
#    return list(mysog_tweets["tweet"]), list(mysog_tweets["misogyny_category"])

In [10]:
# DATASET 9
print("Dataset 9 loading ...")

pd.DataFrame(columns=["text"])
allPosts = list()
allTypes = list()
with open(os.path.join(DATASETS_FOLDER, '09_MMHS150K/MMHS150K_GT.json'), 'rb') as file:
    df = json.load(file)
mapping = ['none', 'racist', 'sexist', 'homophobic', 'religious', 'other']
for val in df.values():
    labels = np.unique(val['labels']) 
    allPosts.extend([val['tweet_text']])
    allTypes.extend([mapping[label] for label in labels])

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([9, post, typ])

df9 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df9["label"].value_counts()

Dataset 9 loading ...
	149823 lines


none          84573
racist        30290
other         14942
sexist        11322
homophobic     7397
religious      1299
Name: label, dtype: int64

In [11]:
# DATASET 10
print("Dataset 10 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '10_jig-quian gab.csv')
df = pd.read_csv(dataset_path)

allPosts = list()
allTypes = list()
for text, idx in zip(df['text'], df['hate_speech_idx']):
    posts = re.split('[0-9]+. \t+', text)
    posts[0] = posts[0][3:]
    idx_list = re.split(r'\[|\]', str(idx))
    types = ['hateful' if str(i) in idx_list else 'none'
                for i in range(1, len(posts) + 1)]
    allPosts.extend(posts)
    allTypes.extend(types)

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)


full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([10, post, typ])

df10 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df10["label"].value_counts()

Dataset 10 loading ...
	33776 lines


none       24840
hateful     8936
Name: label, dtype: int64

In [12]:
# DATASET 11
print("Dataset 11 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '11_jig-quian reddit.csv')
df = pd.read_csv(dataset_path)

allPosts = list()
allTypes = list()
for text, idx in zip(df['text'], df['hate_speech_idx']):
    posts = re.split('[0-9]+. \t+', text)
    posts[0] = posts[0][3:]
    idx_list = re.split(r'\[|\]', str(idx))
    types = ['hateful' if str(i) in idx_list else 'none'
                for i in range(1, len(posts) + 1)]
    allPosts.extend(posts)
    allTypes.extend(types)

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([11, post, typ])

df11 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df11["label"].value_counts()

Dataset 11 loading ...
	22309 lines


none       19330
hateful     2979
Name: label, dtype: int64

In [13]:
# DATASET 12
print("Dataset 12 loading ...")

allPosts = list()
allTypes = list()

annotations = pd.read_csv(os.path.join(DATASETS_FOLDER, '12_white supremacist forum/annotations_metadata.csv'))
for index, row in annotations.iterrows():
    if row['label'] in ['hate', 'noHate']:
        text = open(os.path.join(DATASETS_FOLDER, '12_white supremacist forum/all_files/' + str(row['file_id']) + '.txt'), 'r', encoding='utf-8').read()
        label = row['label']
        allPosts.append(text)
        allTypes.append(label)

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([12, post, typ])

df12 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df12["label"].value_counts()

Dataset 12 loading ...


FileNotFoundError: [Errno 2] No such file or directory: '/home/slavkoz/Datasets/Offensive language datasets/12_white supremacist forum/all_files/12834493_1.txt'

In [None]:
# DATASET 13
print("Dataset 13 loading ...")

allPosts = list()
allTypes = list()
with open(os.path.join(DATASETS_FOLDER, '13_CONAN.json'), encoding="utf-8") as json_file:
    data = json.load(json_file)
for record in data['conan']:
    allPosts.append(record['hateSpeech'])
    allTypes.append(record['hsType'])
    
df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([13, post, typ])

df13 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df13["label"].value_counts()

In [None]:
# DATASET 14
print("Dataset 14 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '14_ousidhoum - en_dataset_with_stop_words.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():    
    full_dataset.append([14, row["text"], row["sentiment"]])    

df14 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df14["label"].value_counts()

In [None]:
# DATASET 15
print("Dataset 15 loading ...")

dataset_path = os.path.join(DATASETS_FOLDER, '15_OLID/OLID-labels-levela.csv')
df = pd.read_csv(dataset_path, names=['id', 'label'])
id_to_label = dict()
for index, row in df.iterrows(): 
    id_to_label[row["id"]] = row["label"]


dataset_path = os.path.join(DATASETS_FOLDER, '15_OLID/OLID-testset-levela.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
mapper = {"NOT": "non-offensive", "OFF": "offensive"}
for index, row in df.iterrows():  
    full_dataset.append([15, row["text"], mapper[id_to_label[row["id"]]]]) 

dataset_path = os.path.join(DATASETS_FOLDER, '15_OLID/olid-training-v1.0.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():    
    full_dataset.append([15, row["text"], mapper[row["subtask_a"]]])  

df15 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df15["label"].value_counts()

In [None]:
# DATASET 16
print("Dataset 16 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '16_fox-news-all-comments.txt')
df = pd.read_csv(dataset_path, names=['label', 'text'], sep=':')
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():    
    if row["label"] == 1:
        full_dataset.append([16, row["text"], "hateful"])  
    else:
        full_dataset.append([16, row["text"], "non-hateful"])  

df16 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df16["label"].value_counts()


In [None]:
# DATASET 17
print("Dataset 17 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '17_trac2/eng/trac2_eng_train.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'Text': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
mapper = {"NAG": "non-aggressive", "OAG": "overtly-aggressive", "CAG": "covertly-aggressive"}
for index, row in df.iterrows():    
    full_dataset.append([17, row["text"], mapper[row["Sub-task A"]]])  

dataset_path = os.path.join(DATASETS_FOLDER, '17_trac2/eng/trac2_eng_dev.csv')
df = pd.read_csv(dataset_path)
df = df.rename(columns={'Text': 'text'})

for index, row in df.iterrows():    
    full_dataset.append([17, row["text"], mapper[row["Sub-task A"]]])  

df17 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df17["label"].value_counts()



In [None]:
# DATASET 18
print("Dataset 18 loading ...")

dataset_path = os.path.join(DATASETS_FOLDER, '18/18_retrieved_tweets.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)

tweet_id_to_text = create_tweet_dict(df)
tweet_id_to_text


full_dataset = []
dataset_path = os.path.join(DATASETS_FOLDER, '18/18_hatespeechtwitter.csv')
df = pd.read_csv(dataset_path)
for index, row in df.iterrows():
    tweet_id = row['tweet_id']
    if tweet_id in tweet_id_to_text:
        full_dataset.append([18, tweet_id_to_text[tweet_id], row['maj_label']])

df18 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df18["label"].value_counts()

In [None]:
# DATASET 19
print("Dataset 19 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '19_Online Harassment Dataset/onlineHarassmentDataset.tdf')
df = pd.read_csv(dataset_path, sep='\t', encoding="ISO-8859-1")
df = df.rename(columns={'Tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
mapper = {"N": "non-harrasment", "H": "harrasment"}
for index, row in df.iterrows():    
    full_dataset.append([19, row["text"], mapper[row['Code']]])

df19 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df19["label"].value_counts()

In [None]:
# DATASET 20
print("Dataset 20 loading ...")

dataset_path = os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/20_retrieved_benevolent_tweets.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)
tweet_id_to_text_b = create_tweet_dict(df)

dataset_path = os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/20_retrieved_hostile_tweets.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)
tweet_id_to_text_h = create_tweet_dict(df)

full_dataset = []
benevolents = open(os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/benevolent_sexist.tsv'), 'r').readlines()
benevolents = list(map(lambda x: re.sub(r'\n$','', x), benevolents))
for benevolent in benevolents:
    if int(benevolent) in tweet_id_to_text_b:
        full_dataset.append([20, tweet_id_to_text_b[int(benevolent)], 'benevolent_sexist'])

hostiles = open(os.path.join(DATASETS_FOLDER, '20_NLP_CSS_2017-master/hostile_sexist.tsv'), 'r').readlines()
hostiles = list(map(lambda x: re.sub(r'\n$','', x), hostiles))
for hostile in hostiles:
    if int(hostile) in tweet_id_to_text_h:
        full_dataset.append([20, tweet_id_to_text_h[int(hostile)], 'hostile_sexist'])



df20 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df20["label"].value_counts()

In [None]:
# DATASET 21
print("Dataset 21 loading ...")

allPosts = list()
allTypes = list()
with open(os.path.join(DATASETS_FOLDER, '21_HateXplain-dataset.json'), encoding="utf-8") as json_file:
    data = json.load(json_file)

for item in data:    
    allPosts.append(" ".join(data[item]['post_tokens']))
    labels = list(map(lambda x: x["label"], data[item]['annotators']))
    if labels.count('hatespeech') >= len(labels)/2:
        allTypes.append('hatespeech')
    elif labels.count('offensive') >= len(labels)/2:
        allTypes.append('offensive')
    else: 
        allTypes.append('normal')

  

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([21, post, typ])

df21 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df21["label"].value_counts()

In [None]:
# DATASET 25
print("Dataset 25 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '25_2020-12-31-DynamicallyGeneratedHateDataset-entries-v0.1.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():    
    full_dataset.append([25, row["text"], row['label']])

df25 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df25["label"].value_counts()

In [None]:
# DATASET 26

print("Dataset 26 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '26_Reddit_norm_violations/macro-norm-violations-n10-t0-misogynistic-slurs.csv')
df = pd.read_csv(dataset_path, names=["text"])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():    
    full_dataset.append([26, row["text"], "misogyny-slur"])

dataset_path = os.path.join(DATASETS_FOLDER, '26_Reddit_norm_violations/macro-norm-violations-n15-t2-hatespeech-racist-homophobic.csv')
df = pd.read_csv(dataset_path, names=["text"])
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():    
    full_dataset.append([26, row["text"], "racist-homophobic"])

dataset_path = os.path.join(DATASETS_FOLDER, '26_Reddit_norm_violations/macro-norm-violations-n15-t3-abusing-and-criticisizing-mods.csv')
df = pd.read_csv(dataset_path, names=["text"])
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():    
    full_dataset.append([26, row["text"], "abuse"])

df26 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df26["label"].value_counts()

In [None]:
# DATASET 27

print("Dataset 27 loading ...")
dataset_path = os.path.join(DATASETS_FOLDER, '27_VulgarTwitter/cleaned_data_train.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'Tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for index, row in df.iterrows():    
    if row["Majority"] >= 3:
        full_dataset.append([27, row["text"], "non-vulgar"])
    else:
        full_dataset.append([27, row["text"], "vulgar"])

dataset_path = os.path.join(DATASETS_FOLDER, '27_VulgarTwitter/cleaned_data_test.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'Tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():    
    if row["Majority"] >= 3:
        full_dataset.append([27, row["text"], "non-vulgar"])
    else:
        full_dataset.append([27, row["text"], "vulgar"])

dataset_path = os.path.join(DATASETS_FOLDER, '27_VulgarTwitter/cleaned_data_val.tsv')
df = pd.read_csv(dataset_path, sep='\t')
df = df.rename(columns={'Tweet': 'text'})
TEXT_ONLY_DF = concat_text_df(df)

for index, row in df.iterrows():    
    if row["Majority"] >= 3:
        full_dataset.append([27, row["text"], "non-vulgar"])
    else:
        full_dataset.append([27, row["text"], "vulgar"])

df27 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df27["label"].value_counts()

In [None]:
# DATASET 28
print("Dataset 28 loading ...")

allPosts = list()
allTypes = list()
with open(os.path.join(DATASETS_FOLDER, '28_LoL_dataset/lol_anonymized_ann.txt'), 'r', encoding="utf-8") as file:
    ann = file.read()[1:-1]
    ann = ann.split('),(')
    ann = [(entry.split(',')[0], entry.split(',')[1]) for entry in ann]
with open(os.path.join(DATASETS_FOLDER, '28_LoL_dataset/lol_anonymized_posts.txt'), 'r', encoding="utf-8") as file:
    posts = ','.join(file.readlines())[1:-1]
    posts = posts.split('),(')
for post in posts:
    split = post.split(',')
    topic_id = split[0]
    post_number = split[1]
    html_message = ''.join(split[3:-1])
    allPosts.extend([remove_html_tags(html_message)])
    allTypes.extend(['cyberbullying' if (topic_id, post_number) in ann else 'none'])

df = pd.DataFrame(allPosts, columns =['text'])
TEXT_ONLY_DF = concat_text_df(df)

full_dataset = []
for post, typ in zip(allPosts, allTypes):
    full_dataset.append([28, post, typ])

df28 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df28["label"].value_counts()

In [None]:
# DATASET 29
print("Dataset 29 loading ...")

dataset_path = os.path.join(DATASETS_FOLDER, '29/29_retrieved_tweets.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)

tweet_id_to_text = create_tweet_dict(df)
tweet_id_to_text


full_dataset = []
dataset_path = os.path.join(DATASETS_FOLDER, '29/29_NAACL_SRW_2016.csv')
df = pd.read_csv(dataset_path)
for index, row in df.iterrows():
    tweet_id = row['tweet_id']
    if tweet_id in tweet_id_to_text:
        full_dataset.append([29, tweet_id_to_text[tweet_id], row['class']])

df29 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df29["label"].value_counts()

In [None]:
# DATASET 30
print("Dataset 30 loading ...")

dataset_path = os.path.join(DATASETS_FOLDER, '30/30_retrieved_tweets.csv')
df = pd.read_csv(dataset_path)
TEXT_ONLY_DF = concat_text_df(df)

tweet_id_to_text = create_tweet_dict(df)
tweet_id_to_text


full_dataset = []
data = open(os.path.join(DATASETS_FOLDER, '30/30_NLP_CSS_2016.csv'), 'r').readlines()
data = list(map(lambda x: re.sub(r'\n$','', x), data))
data = list(map(lambda x: x.split('\t'), data))
data = list(map(lambda x: [x[0], x[1]], data))[1:] # tweet_id, expert - columns
df = pd.DataFrame(data, columns=["tweet_id", "class"])
for index, row in df.iterrows():
    tweet_id = row['tweet_id']
    if int(tweet_id) in tweet_id_to_text:
        full_dataset.append([30, tweet_id_to_text[int(tweet_id)], row['class']])

df30 = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
df30["label"].value_counts()

In [None]:
print(f"FINAL DATASET:\n\t{len(TEXT_ONLY_DF)} lines")

print("Doing basic preprocessing ...")

def basic_preprocessing(row):
    text = row["text"]
    text = text.replace('"', '')
    text = re.sub(r'^\'','', text)
    text = re.sub(r'\'$','', text)
    text = re.sub(r'^\s+','', text)
    text = re.sub(r'\s+$','', text)
    text = re.sub(r'\s+',' ', text)
    
    text = emoji.demojize(text)
    text = text.replace('::', ' ')
    row["text"] = text  
    return row

TEXT_ONLY_DF = TEXT_ONLY_DF.apply(lambda x : basic_preprocessing(x), axis=1)
print("... basic preprocessing done.")

print("Saving full datasets ...")
TEXT_ONLY_DF.to_csv('full_textOnly_dataset.csv', index = False)
print("... datasets saved.")

In [None]:
print("Doing cleaning ...")
CLEANED_DF = pd.DataFrame(TEXT_ONLY_DF["text"],columns=["text"])

def cleaning(row):
    text = row["text"]
    text = text.replace('RT', '')
    text = text.replace('…', '')
    text = text.replace('Q:', '')
    text = text.replace('A:', '')
    text = text.replace('“', '')
    text = text.replace('”', '')
    text = text.replace('``', '')
    text = text.replace('\'\'', '')
    text = text.replace('\\n', ' ')
    text = text.replace('\\\'', '\'')
    text = re.sub(r'@[A-Za-z0-9\-_]+:?', '', text)
    text = re.sub(r'https?://\S+', '', text)    
    text = re.sub(r'<.*?>', '', text) # remove html tags        
    text = re.sub(r'&.*?;', '', text) # remove &sometihngttags        
    text = re.sub(r'&lt;|&gt;|&amp;', '', text)   #remove tags
    text = re.sub(r'\s+',' ', text)    
    row["text"] = text
    return row

CLEANED_DF = CLEANED_DF.apply(lambda x : cleaning(x), axis=1)
print("... cleaning done.")

In [None]:
print("Saving cleaned dataset ...")
CLEANED_DF.to_csv('full_textOnly_cleaned_dataset.csv', index = False, header=False)
print("... dataset saved.")

In [None]:
print(f"FULL CLASSIFICATION DATASET:")

print("Combining dataset ...")
FULL_CLASS_DF = pd.DataFrame(full_dataset, columns=["corpus_id", "text", "label"])
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df1], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df2], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df3], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df4], ignore_index=True, axis=0)
#FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df5], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df6], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df7], ignore_index=True, axis=0)
#FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df8], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df9], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df10], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df11], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df12], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df13], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df14], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df15], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df16], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df17], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df18], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df19], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df20], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df21], ignore_index=True, axis=0)
#FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df22], ignore_index=True, axis=0)
#FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df23], ignore_index=True, axis=0)
#FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df24], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df25], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df26], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df27], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df28], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df29], ignore_index=True, axis=0)
FULL_CLASS_DF = pd.concat([FULL_CLASS_DF, df30], ignore_index=True, axis=0)

print(f"\nCounts by corpus id: \n{FULL_CLASS_DF['corpus_id'].value_counts()}")
print(f"\nCounts by label type: \n{FULL_CLASS_DF['label'].value_counts()}\n")
print("... combining dataset done.")

print("Doing basic preprocessing ...")
FULL_CLASS_DF = FULL_CLASS_DF.apply(lambda x : basic_preprocessing(x), axis=1)
print("... basic preprocessing done.")

In [None]:
print("Saving full classification dataset ...")
FULL_CLASS_DF.to_csv('full_classification_dataset.csv', index = False)
print("... dataset saved.")