In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

In [2]:
prefix = 'data/binary_classification/'


def _to_binary_task_frame(raw_df):
    """Turn a headerless (label, text) frame into the text-to-text task layout.

    Column 0 of ``raw_df`` holds the label and column 1 the text. Rows whose
    label equals 2 get target "1"; every other row gets target "0". Newlines
    in the text are replaced by single spaces.

    Returns a new DataFrame with the string columns ``prefix``,
    ``input_text`` and ``target_text``, indexed like ``raw_df``.
    """
    return pd.DataFrame({
        'prefix': ["binary classification"] * len(raw_df),
        'input_text': raw_df[1].str.replace('\n', ' '),
        # bool -> 0/1 -> "0"/"1"
        'target_text': raw_df[0].eq(2).astype(int).astype(str),
    })


# The CSV files have no header row, so columns are addressed as 0 and 1.
binary_train_df = _to_binary_task_frame(pd.read_csv(prefix + 'train.csv', header=None))
binary_eval_df = _to_binary_task_frame(pd.read_csv(prefix + 'test.csv', header=None))

print(binary_train_df.head())
print(binary_eval_df.head())

prefix                                         input_text  \
0  binary classification  Unfortunately, the frustration of being Dr. Go...   
1  binary classification  Been going to Dr. Goldberg for over 10 years. ...   
2  binary classification  I don't know what Dr. Goldberg was like before...   
3  binary classification  I'm writing this review to give you a heads up...   
4  binary classification  All the food is great here. But the best thing...   

  target_text  
0           0  
1           1  
2           0  
3           0  
4           1  
                  prefix                                         input_text  \
0  binary classification  Contrary to other reviews, I have zero complai...   
1  binary classification  Last summer I had an appointment to get new ti...   
2  binary classification  Friendly staff, same starbucks fair you get an...   
3  binary classification  The food is good. Unfortunately the service is...   
4  binary classification  Even when we didn't have a

In [3]:
prefix = "data/multilabel_classification/"

# Columns that are not label flags; every other column is treated as a label.
NON_LABEL_COLS = ["id", "comment_text"]
# Fixed seed so the train/eval split is the same on every run.
SPLIT_SEED = 42

multi_train_df = pd.read_csv(prefix + 'train.csv')

label_cols = [col for col in multi_train_df.columns if col not in NON_LABEL_COLS]

# Replace each truthy flag with the name of its label and each falsy flag
# with "", so the label columns can be concatenated into one string.
for col in label_cols:
    multi_train_df[col] = multi_train_df[col].apply(lambda flag, name=col: name if flag else "")

# Join the label columns with commas (in column order), drop the empty
# pieces left by unset labels, and fall back to "clean" when none is set.
multi_train_df["target_text"] = (
    multi_train_df[label_cols[0]]
    .str.cat(multi_train_df[label_cols[1:]], sep=',')
    .apply(lambda joined: ",".join(part for part in joined.split(",") if part) or "clean")
)

# Replace newlines AND tabs: the combined data set is written out as TSV.
# (The original computed this clean-up once but discarded the result, and
# only newlines were removed from the column that was actually kept.)
multi_train_df["input_text"] = (
    multi_train_df["comment_text"].str.replace('\n', ' ').str.replace('\t', ' ')
)
multi_train_df["prefix"] = "multilabel classification"
multi_train_df = multi_train_df[["prefix", "input_text", "target_text"]]

# Hold out 10% of the rows for evaluation.
multi_train_df, multi_eval_df = train_test_split(
    multi_train_df, test_size=0.1, random_state=SPLIT_SEED
)

multi_train_df.head()

Unnamed: 0,prefix,input_text,target_text
140162,multilabel classification,ban you got me banned on irc -,clean
135151,multilabel classification,"This is a public computer Hi, I have a sligh...",clean
4901,multilabel classification,Why does nobody post anything on 'my talk' tha...,clean
58298,multilabel classification,Okay sorry I didn't read the article for a while.,clean
56472,multilabel classification,If you really feel that strongly about protect...,clean


In [4]:
prefix = 'data/regression/'

# on_bad_lines='warn' skips rows with too many fields and reports each one,
# which is what error_bad_lines=False did. That older keyword was deprecated
# in pandas 1.3 and removed in pandas 2.0.
# Rows containing any missing value are dropped.
sts_train_df = pd.read_csv(prefix + 'train.tsv', sep='\t', on_bad_lines='warn').dropna()
sts_eval_df = pd.read_csv(prefix + 'dev.tsv', sep='\t', on_bad_lines='warn').dropna()

# Replace newlines and tabs with spaces in both sentence columns of both splits.
for sts_df in (sts_train_df, sts_eval_df):
    for sentence_col in ("sentence1", "sentence2"):
        sts_df[sentence_col] = (
            sts_df[sentence_col].str.replace('\n', ' ').str.replace('\t', ' ')
        )

b'Skipping line 2509: expected 10 fields, saw 11\nSkipping line 2650: expected 10 fields, saw 11\nSkipping line 2727: expected 10 fields, saw 11\nSkipping line 3071: expected 10 fields, saw 11\nSkipping line 3393: expected 10 fields, saw 11\n'
b'Skipping line 1042: expected 10 fields, saw 11\nSkipping line 1066: expected 10 fields, saw 11\nSkipping line 1083: expected 10 fields, saw 11\nSkipping line 1137: expected 10 fields, saw 11\nSkipping line 1150: expected 10 fields, saw 11\n'


In [10]:
# The row labelled 2001 is badly formatted, so it is removed.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second run no longer raises KeyError
# once the row is gone.
sts_train_df = sts_train_df.drop(index=2001, errors="ignore")

In [13]:
def _join_sentence_pair(row):
    """Build the model input string from one row's two sentences."""
    return "sentence1: " + row["sentence1"] + " sentence2: " + row["sentence2"]


def _snap_score(score):
    """Round a score to the nearest multiple of 0.2."""
    return round(score * 5) / 5


def _format_similarity_task(pairs_df):
    """Add the task columns to ``pairs_df`` and return only those columns.

    ``pairs_df`` is modified in place (three columns are added); the returned
    frame is the ``prefix`` / ``input_text`` / ``target_text`` selection.
    """
    pairs_df["input_text"] = pairs_df.apply(_join_sentence_pair, axis=1)
    pairs_df["target_text"] = pairs_df["score"].apply(_snap_score).astype(str)
    pairs_df["prefix"] = "similarity"
    return pairs_df[["prefix", "input_text", "target_text"]]


sts_train_df = _format_similarity_task(sts_train_df)
sts_eval_df = _format_similarity_task(sts_eval_df)

In [15]:
# Stack the per-task frames (binary, multilabel, similarity) into a single
# train frame and a single eval frame, casting every column to str.
train_parts = [binary_train_df, multi_train_df, sts_train_df]
eval_parts = [binary_eval_df, multi_eval_df, sts_eval_df]

train_df = pd.concat(train_parts).astype(str)
eval_df = pd.concat(eval_parts).astype(str)

In [16]:
# Write the combined frames as tab-separated files. The separator is passed
# by keyword because pandas 2.1+ deprecates positional arguments other than
# the path in to_csv.
# The index is written as the first, unnamed column, as before.
train_df.to_csv("data/train.tsv", sep="\t")
eval_df.to_csv("data/eval.tsv", sep="\t")