In [1]:
import csv
from pathlib import Path

import pandas as pd

In [2]:
def read_dataset(input_path: str, sep: str, select_columns: list, rename_columns: list):
    """Load a dataset file and return the requested columns, optionally renamed.

    The reader is picked from the file extension (compared case-insensitively):

    * ``.txt``     -> ``pd.read_csv`` with ``quoting=csv.QUOTE_NONE``
    * ``.parquet`` -> ``pd.read_parquet`` (``sep`` is ignored)
    * anything else (including no extension) -> ``pd.read_csv``

    Parameters
    ----------
    input_path : str
        Path of the file to load.
    sep : str
        Field separator forwarded to ``pd.read_csv``; unused for parquet files.
    select_columns : list
        Column names to keep, in the order they should appear in the result.
    rename_columns : list
        New names for ``select_columns``, matched by position. Pass an empty
        list to keep the original names.

    Returns
    -------
    pd.DataFrame
        * ``rename_columns`` empty: ``df[select_columns]``.
        * Same length as ``select_columns``: the selected columns, renamed.
        * Lengths differ: the full frame as loaded, with no selection or renaming.

    Raises
    ------
    KeyError
        If a name in ``select_columns`` is not a column of the loaded file.
    """
    # Use the real suffix of the path. Splitting on "." and taking element 1
    # picks the wrong piece for "./data/x.parquet" or "data.v2/x.csv" and
    # raises IndexError when the path has no dot at all.
    extension = Path(input_path).suffix.lower()

    if extension == ".txt":
        df = pd.read_csv(input_path, sep=sep, quoting=csv.QUOTE_NONE)
    elif extension == ".parquet":
        df = pd.read_parquet(input_path)
    else:
        df = pd.read_csv(input_path, sep=sep)

    if len(rename_columns) == 0:
        return df[select_columns]

    if len(select_columns) == len(rename_columns):
        # Select first, then rename with a single mapping: a column that already
        # carries one of the target names (but was not selected) cannot end up
        # duplicated in the result, and renames cannot chain into each other.
        mapping = dict(zip(select_columns, rename_columns))
        return df[select_columns].rename(columns=mapping)

    # Mismatched lengths: hand back the frame exactly as it was loaded.
    return df

In [61]:
# Preview the MRPC frame with untruncated text columns.
# The path is bound here because this cell sits above the cell that first assigns
# `dataset_mrpc`; without it the cell raises NameError on a fresh kernel.
# NOTE(review): assumed to be the training split (same value the training-set
# path cell assigns) — confirm this is the file the recorded output came from.
dataset_mrpc = "dataset/train_set/mrpc_train.parquet"
df = pd.read_parquet(dataset_mrpc)
with pd.option_context('display.max_colwidth', None):
    display(df)

Unnamed: 0,label,id_1,id_2,sentence1,sentence2
0,1,702876,702977,"Amrozi accused his brother, whom he called ""the witness"", of deliberately distorting his evidence.","Referring to him as only ""the witness"", Amrozi accused his brother of deliberately distorting his evidence."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.,Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.
2,1,1330381,1330521,"They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.","On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, or 4.4%, at A$4.56, having earlier set a record high of A$4.57.","Tab shares jumped 20 cents, or 4.6%, to set a record closing high at A$4.57."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.",PG&E Corp. shares jumped $1.63 or 8 percent to $21.03 on the New York Stock Exchange on Friday.
...,...,...,...,...,...
4071,1,1620264,1620507,"""At this point, Mr. Brando announced: 'Somebody ought to put a bullet'"" through her head, the motion continued.","Brando said that ""somebody ought to put a bullet"" through her head, according to the defense."
4072,0,1848001,1848224,"Martin, 58, will be freed today after serving two thirds of his five-year sentence for the manslaughter of 16-year-old Fred Barras.",Martin served two thirds of a five-year sentence for the manslaughter of Barras and for wounding Fearon.
4073,1,747160,747144,"""We have concluded that the outlook for price stability over the medium term has improved significantly since our last decision to lower interest rates,"" Duisenberg said.","In a statement, the ECB said the outlook for price stability over the medium term had ""improved significantly"" since its last decision to lower interest rates in March."
4074,1,2539933,2539850,The notification was first reported Friday by MSNBC.,MSNBC.com first reported the CIA request on Friday.


# Process training set

In [3]:
# Parquet files that make up the training split, one per source corpus.
(
    dataset_etpc,
    dataset_mrpc,
    dataset_paws,
    dataset_superGlue,
) = (
    "dataset/train_set/etpc_train.parquet",
    "dataset/train_set/mrpc_train.parquet",
    "dataset/train_set/paws_labeled_final_train.parquet",
    "dataset/train_set/superGlue_axb_train.parquet",
)

In [4]:
# Load every training corpus into the shared (input, target, label) layout.
# All four paths end in .parquet, so the empty `sep` is never used by the reader.
df_etpc = read_dataset(
    input_path=dataset_etpc,
    sep="",
    select_columns=["sentence1", "sentence2", "etpc_label"],
    rename_columns=["input", "target", "label"],
)
df_paws = read_dataset(
    input_path=dataset_paws,
    sep="",
    select_columns=["sentence1", "sentence2", "label"],
    rename_columns=["input", "target", "label"],
)
df_superGlue = read_dataset(
    input_path=dataset_superGlue,
    sep="",
    select_columns=["sentence1", "sentence2", "label"],
    rename_columns=["input", "target", "label"],
)
df_mrpc = read_dataset(
    input_path=dataset_mrpc,
    sep="",
    select_columns=["sentence1", "sentence2", "label"],
    rename_columns=["input", "target", "label"],
)

# Stack the corpora row-wise; each frame keeps its own original row index.
df_train = pd.concat([df_etpc, df_paws, df_superGlue, df_mrpc])
df_train.shape[0]

60382

In [6]:
# Keep only the training pairs whose label equals 1.
df_train_quality = df_train[df_train["label"].eq(1)]
df_train_quality

28553

# Process test set

In [74]:
# Parquet files of the test split. These rebind the path names used for the
# training split; the SuperGLUE file here is the "axg" set, not "axb".
(
    dataset_mrpc,
    dataset_paws,
    dataset_superGlue,
) = (
    "dataset/test_set/mrpc_test.parquet",
    "dataset/test_set/paws_labeled_final_test.parquet",
    "dataset/test_set/superGlue_axg_test.parquet",
)

In [84]:
# Load the test corpora into the shared (input, target, label) layout.
# The SuperGLUE test file uses premise/hypothesis instead of sentence1/sentence2.
df_paws = read_dataset(dataset_paws, "", ["sentence1", "sentence2", "label"], ["input", "target", "label"])
df_superGlue = read_dataset(dataset_superGlue, "", ["premise", "hypothesis", "label"], ["input", "target", "label"])
df_mrpc = read_dataset(dataset_mrpc, "", ["sentence1", "sentence2", "label"], ["input", "target", "label"])

# Combine the corpora and show the row count, as the training and validation
# cells do; the original cell loaded the frames but never built or measured a
# combined test frame, although its recorded output is a row count.
df_test = pd.concat([df_paws, df_superGlue, df_mrpc])
len(df_test)

10081

# Process validation set

In [85]:
# Parquet files of the validation split; rebinds the MRPC and PAWS path names.
dataset_mrpc, dataset_paws = (
    "dataset/validation_set/mrpc_validation.parquet",
    "dataset/validation_set/paws_labeled_final_validation.parquet",
)

In [86]:
# Load the validation corpora into the shared (input, target, label) layout.
df_paws = read_dataset(
    input_path=dataset_paws,
    sep="",
    select_columns=["sentence1", "sentence2", "label"],
    rename_columns=["input", "target", "label"],
)
df_mrpc = read_dataset(
    input_path=dataset_mrpc,
    sep="",
    select_columns=["sentence1", "sentence2", "label"],
    rename_columns=["input", "target", "label"],
)

# Stack both corpora row-wise; each frame keeps its own original row index.
df_validation = pd.concat([df_paws, df_mrpc])
df_validation.shape[0]

8408

In [23]:
# Read the SimpleWiki JSON Lines file: one JSON document per line.
df_simpleWiki = pd.read_json("dataset/SimpleWiki.jsonl", lines=True)

In [39]:
# `DataFrame.rename` returns a new frame rather than modifying the caller, so the
# result must be bound; the original call discarded it and the CSV was written
# with the un-renamed columns.
# NOTE(review): assumes the JSONL loads with integer column labels 0 and 1 — confirm.
df_simpleWiki = df_simpleWiki.rename(columns={0: "#1 String", 1: "#2 String"})
# Every row gets label 1.
df_simpleWiki["label"] = 1
df_simpleWiki.to_csv("dataset/new_data/simpleWiki.csv", index=True)

In [43]:
# Read the COCO captions JSON Lines file: one JSON document per line.
df_coco = pd.read_json("dataset/coco_captions.jsonl", lines=True)

In [58]:
# Selecting one column already yields a Series, so no pd.Series(...) wrapper is needed.
df_coco["texts"]

0         [A bathroom with a border of butterflies and b...
1         [A bathroom with a border of butterflies and b...
2         [A bathroom with a border of butterflies and b...
3         [A bathroom with a border of butterflies and b...
4         [A blue and white bathroom with butterfly them...
                                ...                        
828390    [Fans pose with stuffed animals at an ice rink...
828391    [Fans pose with stuffed animals at an ice rink...
828392    [A couple of women with some stuffed animals.,...
828393    [A couple of women with some stuffed animals.,...
828394    [Two women smile for the camea while posing iw...
Name: texts, Length: 828395, dtype: object

In [56]:
# Second entry of the caption list stored in row 0 of the "texts" column.
df_coco.loc[0, "texts"][1]

'A blue and white bathroom with butterfly themed wall tiles.'