# Preprocessing

## 0. Imports

In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the CSV paths used below live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import numpy as np
import pandas as pd

## 1. Original dataset preprocessing

### 1.1. Loading data

In [3]:
# Sentence-pair files of the original dataset, one per split.
# The first CSV column is used as the row index.
X_plain_train, X_plain_dev, X_plain_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/content/gdrive/MyDrive/git/data/Training/subtaskA_data_all.csv',
        '/content/gdrive/MyDrive/git/data/Dev/subtaskA_dev_data.csv',
        '/content/gdrive/MyDrive/git/data/Test/subtaskA_test_data.csv',
    )
)

# Answer files for the same splits. header=None makes pandas treat the first
# line as data, so the single remaining column is named by position.
y_plain_train, y_plain_dev, y_plain_test = (
    pd.read_csv(csv_path, index_col=0, header=None)
    for csv_path in (
        '/content/gdrive/MyDrive/git/data/Training/subtaskA_answers_all.csv',
        '/content/gdrive/MyDrive/git/data/Dev/subtaskA_gold_answers.csv',
        '/content/gdrive/MyDrive/git/data/Test/subtaskA_gold_answers.csv',
    )
)

### 1.2. Add the label column

In [8]:
# Attach the answers as the `label` column; rows are matched on the index.
# Assumes the answers frame holds exactly one data column (index + label) - TODO confirm.
X_plain_train['label'] = y_plain_train.iloc[:, 0]

### 1.3. Swap 0-labelled pairs
For every pair labelled 0, the two sentences are swapped so that the sensical and the nonsensical sentence always sit in their respective columns. After the swap, the sensical sentence is always in `sent0`.

In [14]:
# Exchange sent0 and sent1 on the rows labelled 0; every other row is left untouched.
# Both replacement columns are computed before either is written back.
# NOTE: this cell is not idempotent - running it a second time exchanges those rows again.
plain_swap_mask = X_plain_train['label'] == 0
plain_new_sent0 = np.where(plain_swap_mask, X_plain_train['sent1'], X_plain_train['sent0'])
plain_new_sent1 = np.where(plain_swap_mask, X_plain_train['sent0'], X_plain_train['sent1'])
X_plain_train['sent0'], X_plain_train['sent1'] = plain_new_sent0, plain_new_sent1

### 1.4. Add columns corresponding to lowercase sentences with no punctuation

In [70]:
# Normalised copies of both sentences: lowercased, then every character that is
# neither a word character (\w, which includes digits and "_") nor whitespace is removed.
# The pattern is a raw string so that \w and \s reach the regex engine verbatim
# instead of being parsed as invalid string escape sequences.
X_plain_train['lower_nopunct0'] = X_plain_train['sent0'].str.lower().replace(r'[^\w\s]', '', regex=True)
X_plain_train['lower_nopunct1'] = X_plain_train['sent1'].str.lower().replace(r'[^\w\s]', '', regex=True)

In [75]:
# Display the frame to inspect the label and normalised-sentence columns added above.
X_plain_train

Unnamed: 0_level_0,sent0,sent1,hr_sent0,hr_sent1,lower_nopunct0,label,lower_nopunct1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,He poured milk on his cereal.,He poured orange juice on his cereal.,He poured orange juice on his cereal.,He poured milk on his cereal.,he poured milk on his cereal,0,he poured orange juice on his cereal
1,He drinks milk.,He drinks apple.,He drinks apple.,He drinks milk.,he drinks milk,0,he drinks apple
2,Jeff ran a mile today,"Jeff ran 100,000 miles today",Jeff ran a mile today,"Jeff ran 100,000 miles today",jeff ran a mile today,1,jeff ran 100000 miles today
3,A mosquito stings me,I sting a mosquito,Mosquito bites me,I bite mosquito,a mosquito stings me,1,i sting a mosquito
4,A niece is a person.,A giraffe is a person.,Niece of the person.,Giraffe is a person.,a niece is a person,1,a giraffe is a person
...,...,...,...,...,...,...,...
9995,Mark ate a big sweet cherry pie,Mark ate a big bitter cherry pie,Mark has eaten a large bitter cherry pie,Mark ate large sweet cherry pie,mark ate a big sweet cherry pie,0,mark ate a big bitter cherry pie
9996,Gloria wears a hat on her head,Gloria wears a cat on her head,Gloria carries a cat on his head,Gloria wearing a hat on his head,gloria wears a hat on her head,0,gloria wears a cat on her head
9997,Harry went to the barbershop to have his hair cut,Harry went to the barbershop to have his glass...,Harry went to the barber shop to a hair cut,Harry went to the barber shop to fix his glasses,harry went to the barbershop to have his hair cut,1,harry went to the barbershop to have his glass...
9998,Reilly is sleeping on the couch,Reilly is sleeping on the window,Reilly is sleeping on the couch,Reilly sleeping on window,reilly is sleeping on the couch,1,reilly is sleeping on the window


## 2. Artificially generated dataset preprocessing

### 2.1. Load data

In [15]:
# Generated (GPT2) training pairs; the dev and test paths are the same files
# that are loaded for the original dataset in section 1.1.
X_aug_train, X_aug_dev, X_aug_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '/content/gdrive/MyDrive/git/data/Training/GPT2_subtaskA_data.csv',
        '/content/gdrive/MyDrive/git/data/Dev/subtaskA_dev_data.csv',
        '/content/gdrive/MyDrive/git/data/Test/subtaskA_test_data.csv',
    )
)

# Answer files for the same splits, read without a header row.
y_aug_train, y_aug_dev, y_aug_test = (
    pd.read_csv(csv_path, index_col=0, header=None)
    for csv_path in (
        '/content/gdrive/MyDrive/git/data/Training/GPT2_answers.csv',
        '/content/gdrive/MyDrive/git/data/Dev/subtaskA_gold_answers.csv',
        '/content/gdrive/MyDrive/git/data/Test/subtaskA_gold_answers.csv',
    )
)

In [167]:
# Per-pair score files, read without a header row and indexed by their first column.
# The paths are relative, so they resolve against the kernel's current working directory.
scores1, scores2 = (
    pd.read_csv(score_file, header=None, index_col=0)
    for score_file in ('scores1_gmean.csv', 'scores2_gmean.csv')
)

### 2.3. Add label column

In [34]:
# Attach the answers as the `label` column; rows are matched on the index.
# Assumes the answers frame holds exactly one data column (index + label) - TODO confirm.
X_aug_train['label'] = y_aug_train.iloc[:, 0]

### 2.4. Swap 0 labelled pairs

In [38]:
# Exchange sent0 and sent1 on the rows labelled 0; every other row is left untouched.
# Both replacement columns are computed before either is written back.
# NOTE: this cell is not idempotent - running it a second time exchanges those rows again.
aug_swap_mask = X_aug_train['label'] == 0
aug_new_sent0 = np.where(aug_swap_mask, X_aug_train['sent1'], X_aug_train['sent0'])
aug_new_sent1 = np.where(aug_swap_mask, X_aug_train['sent0'], X_aug_train['sent1'])
X_aug_train['sent0'], X_aug_train['sent1'] = aug_new_sent0, aug_new_sent1

### 2.5. Auxiliary columns

In [168]:
# Normalised copies of both sentences: lowercased, then every character that is
# neither a word character (\w, which includes digits and "_") nor whitespace is removed.
# Raw strings keep \w and \s from being parsed as invalid string escape sequences.
X_aug_train['lower_nopunct0'] = X_aug_train['sent0'].str.lower().replace(r'[^\w\s]', '', regex=True)
X_aug_train['lower_nopunct1'] = X_aug_train['sent1'].str.lower().replace(r'[^\w\s]', '', regex=True)

# Re-label the score rows with the training index so the column assignments below
# line up row by row. This pairs scores with sentences purely by position and
# raises if the lengths differ.
scores1.index = X_aug_train.index
scores2.index = X_aug_train.index
X_aug_train['scores1'] = scores1
X_aug_train['scores2'] = scores2

### 2.6. Remove duplicates

In [None]:
# Stack the original and the generated pairs, then discard every pair whose
# normalised sentences occur more than once. keep=False removes all copies of a
# duplicated pair, not just the extra ones.
a = pd.concat([X_plain_train, X_aug_train])
a = a.drop_duplicates(['lower_nopunct0', 'lower_nopunct1'], keep=False)

# Keep only rows that carry a score. Rows that came from X_plain_train have no
# `scores1` value after the concat, so they are dropped here.
a = a[~a['scores1'].isna()]

# Drop the two hr_* columns. drop() is used instead of pop() so the cell does not
# echo a removed column as its output.
a = a.drop(columns=['hr_sent0', 'hr_sent1'])

### 2.7. Remove entries with more than one sentence

In [171]:
# Drop rows where a sentence contains a period that is followed by at least one more
# character, i.e. the period is not the final character. Note that this also matches
# periods inside a sentence that do not end one (for example a decimal point).
# The pattern is unchanged; it is a raw string so that \. is passed to the regex
# engine verbatim instead of being parsed as an invalid string escape sequence.
a = a[~a['sent0'].str.contains(r'.*\..*.', regex=True)]
a = a[~a['sent1'].str.contains(r'.*\..*.', regex=True)]

### 2.8. Remove pairs whose sentences are identical after normalization

In [172]:
# Keep only the pairs whose two normalised sentences differ from each other.
a = a[a['lower_nopunct0'] != a['lower_nopunct1']]

### 2.9. Remove short sentences

In [95]:
!pip install transformers
from transformers import GPT2TokenizerFast
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2-large")

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 5.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 31.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |███████

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [173]:
def _token_count(sentence):
    """Return how many token ids `tokenizer` produces for `sentence`."""
    return len(tokenizer(sentence)['input_ids'])


# Token length of each sentence in the pair (length1 <- sent0, length2 <- sent1).
a['length1'] = a['sent0'].apply(_token_count)
a['length2'] = a['sent1'].apply(_token_count)

In [174]:
# Keep a pair only when both of its sentences are at least 3 tokens long.
both_long_enough = (a['length1'] >= 3) & (a['length2'] >= 3)
a = a[both_long_enough]

### 2.10. Swap the 0-labelled pairs back

In [177]:
# Exchange sent0 and sent1 again on the rows labelled 0, undoing the earlier swap
# so each pair is back in the order its label refers to.
# NOTE: this cell is not idempotent - running it a second time exchanges those rows again.
restore_mask = a['label'] == 0
restored_sent0 = np.where(restore_mask, a['sent1'], a['sent0'])
restored_sent1 = np.where(restore_mask, a['sent0'], a['sent1'])
a['sent0'], a['sent1'] = restored_sent0, restored_sent1

### 2.11. Filter data based on score difference

In [179]:
# Gap between the two scores of each pair, on the raw scale and on the log scale.
# Presumably the scores are strictly positive; np.log yields -inf/NaN otherwise - TODO confirm.
a['diff'] = a['scores1'].sub(a['scores2'])
a['diff_log'] = np.log(a['scores1']).sub(np.log(a['scores2']))

In [180]:
# Number of pairs to keep per ranking.
N_KEEP = 10000

# The N_KEEP pairs with the largest score gap, ranked once by the raw difference
# and once by the log difference.
highest = a.nlargest(n=N_KEEP, columns='diff')
highest_log = a.nlargest(n=N_KEEP, columns='diff_log')

### 2.12. Save the data

In [188]:
# Change the working directory to the training-data folder; the to_csv calls below
# use relative file names and therefore write into this folder.
# NOTE: this changes where every later relative path in the kernel resolves.
%cd /content/gdrive/MyDrive/git/data/Training

/content/gdrive/.shortcut-targets-by-id/1yfuNPQUT_G0CfNtdGxHhLi96giKyswbZ/git/data/Training


In [189]:
# Save the pairs selected by the raw score difference: sentences in one file,
# labels in another. Both files keep the frame's index as their first column.
data = highest[['sent0', 'sent1']]
answers = highest['label']
data.to_csv('GPT2_data_final.csv')
# header=False is the documented way to omit the header row (None only worked by being falsy).
answers.to_csv('GPT2_answers_final.csv', header=False)

In [190]:
# Save the pairs selected by the log score difference: sentences in one file,
# labels in another. Both files keep the frame's index as their first column.
data = highest_log[['sent0', 'sent1']]
answers = highest_log['label']
data.to_csv('GPT2_data_final_log.csv')
# header=False is the documented way to omit the header row (None only worked by being falsy).
answers.to_csv('GPT2_answers_final_log.csv', header=False)