This is the same notebook as Juliana's Clean_CoLA, I just removed the data cleaning part and only kept the non-tokenized dataset

# Import Libraries

In [1]:
# standard library
import pandas as pd

# drive access
from google.colab import drive
drive.mount('/content/drive')

# train/val split
from sklearn.model_selection import train_test_split

# for augmentation
!pip install nlpaug -q
import nlpaug.augmenter.word as naw
import random

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns


Mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h

# Load CoLA dataset (n=9594 sentences, split into train/dev)

- **raw**:
  - in domain train (n=8551) / dev set (n=527)
  - out of domain dev set (n=516)

**NOTE: Original CoLA paper mentions > 10K sentences, but that includes the held-out test set found in Kaggle competitions. The 'test' set mentioned in this notebook is actually the validation set mentioned in the paper.**

# Clean raw dataset

## Train

In [2]:
# Train: 8551 sentences
train_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/in_domain_train.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
train_cola = train_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
train_cola['domain'] = pd.Series(['in' for x in range(len(train_cola.index))])
train_cola.head()

train_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,,One more pseudo generalization and I'm giving up.,in
2,gj04,1,,One more pseudo generalization or I'm giving up.,in
3,gj04,1,,"The more we study verbs, the crazier they get.",in
4,gj04,1,,Day by day the facts are getting murkier.,in


In [3]:
# see samples of acceptable sentences
train_cola[train_cola['acceptability']==1].head()


Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,,One more pseudo generalization and I'm giving up.,in
2,gj04,1,,One more pseudo generalization or I'm giving up.,in
3,gj04,1,,"The more we study verbs, the crazier they get.",in
4,gj04,1,,Day by day the facts are getting murkier.,in


In [4]:
# see samples of unacceptable sentences
train_cola[train_cola['acceptability']==0].head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
18,gj04,0,*,They drank the pub.,in
20,gj04,0,*,The professor talked us.,in
22,gj04,0,*,We yelled ourselves.,in
23,gj04,0,*,We yelled Harry hoarse.,in
25,gj04,0,*,Harry coughed himself.,in


### Remove author's acceptability? YES

The acceptability column reduces all NaN in authors_acceptability to 1 in acceptability, and all levels (*, ?*,*?, ??) to 0.

From the CoLA paper:

"*When examples appear with non-Boolean judgments (this occurs in less than 3% of cases), we either exclude them (for labels ‘?’ or ‘#’), or label them unacceptable (‘??’ and ‘*?’).*"

In [5]:
train_cola[['acceptability','authors_acceptability']]
train_cola[train_cola['authors_acceptability']=='??']

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
136,cj99,0,??,"I can well imagine the more him eating, the fa...",in
301,cj99,0,??,I finally worked up enough courage to ask whic...,in
302,cj99,0,??,Which folks up at corporate headquarters do yo...,in
303,cj99,0,??,This is a problem that you'll be able to tell ...,in
305,cj99,0,??,This is a problem that you solve it and you'll...,in
306,cj99,0,??,Those are the folks that you just solve this p...,in
307,cj99,0,??,They failed to tell me which problem I'll beat...,in
308,cj99,0,??,This is the problem that you'll beat the compe...,in
361,bc01,0,??,Which problem do you wonder whether John said ...,in
400,bc01,0,??,How many people do you wonder whether I consid...,in


In [6]:
# remove authors_acceptability since lots of NaN
train_cola.drop(['authors_acceptability'], axis=1, inplace= True)
train_cola.isna().sum()
train_cola.head()

Unnamed: 0,source,acceptability,sentence,domain
0,gj04,1,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,One more pseudo generalization and I'm giving up.,in
2,gj04,1,One more pseudo generalization or I'm giving up.,in
3,gj04,1,"The more we study verbs, the crazier they get.",in
4,gj04,1,Day by day the facts are getting murkier.,in


## Test

In [7]:
# Test in domain: 527 sentences
test_in_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/in_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
test_in_cola = test_in_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
test_in_cola['domain'] = pd.Series(['in' for x in range(len(test_in_cola.index))])
test_in_cola.head()


Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,The sailors rode the breeze clear of the rocks.,in
1,gj04,1,,The weights made the rope stretch over the pul...,in
2,gj04,1,,The mechanical doll wriggled itself loose.,in
3,cj99,1,,"If you had eaten more, you would want less.",in
4,cj99,0,*,"As you eat the most, you want the least.",in


### Remove author's acceptability

In [8]:
# remove authors_acceptability since lots of NaN
test_in_cola.drop(['authors_acceptability'], axis=1, inplace= True)
test_in_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

In [9]:
# Test out domain: 516 sentences
test_out_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/out_of_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
test_out_cola = test_out_cola.set_axis(column_names, axis=1)
test_out_cola.head()


# add column to specify domain (in/out of domain)
test_out_cola['domain'] = pd.Series(['out' for x in range(len(test_out_cola.index))])
test_out_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,clc95,1,,Somebody just left - guess who.,out
1,clc95,1,,"They claimed they had settled on something, bu...",out
2,clc95,1,,"If Sam was going, Sally would know where.",out
3,clc95,1,,"They're going to serve the guests something, b...",out
4,clc95,1,,She's reading. I can't imagine what.,out


In [10]:
# remove authors_acceptability since lots of NaN
test_out_cola.drop(['authors_acceptability'], axis=1, inplace= True)
test_out_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

### Merge in-domain and out-of-domain test sets

In [11]:
test_cola = pd.concat([test_in_cola, test_out_cola])
test_cola.head()

Unnamed: 0,source,acceptability,sentence,domain
0,gj04,1,The sailors rode the breeze clear of the rocks.,in
1,gj04,1,The weights made the rope stretch over the pul...,in
2,gj04,1,The mechanical doll wriggled itself loose.,in
3,cj99,1,"If you had eaten more, you would want less.",in
4,cj99,0,"As you eat the most, you want the least.",in


# Remove unwanted columns

In [12]:
train_cola.drop(['source'], axis=1, inplace=True)

display(len(train_cola))

train_cola.head()

8551

Unnamed: 0,acceptability,sentence,domain
0,1,"Our friends won't buy this analysis, let alone...",in
1,1,One more pseudo generalization and I'm giving up.,in
2,1,One more pseudo generalization or I'm giving up.,in
3,1,"The more we study verbs, the crazier they get.",in
4,1,Day by day the facts are getting murkier.,in


In [13]:
test_cola.drop(['source'], axis=1, inplace=True)

display(len(test_cola))

test_cola.head()

1043

Unnamed: 0,acceptability,sentence,domain
0,1,The sailors rode the breeze clear of the rocks.,in
1,1,The weights made the rope stretch over the pul...,in
2,1,The mechanical doll wriggled itself loose.,in
3,1,"If you had eaten more, you would want less.",in
4,0,"As you eat the most, you want the least.",in


In [14]:
acc = round(len(test_cola[test_cola['acceptability'] == 1]) / len(test_cola)*100)
print(f'Percentage of test instances which are acceptable: {acc}%')

Percentage of test instances which are acceptable: 69%


# Train:test split (Rachel's approach)

In [15]:
train, validation = train_test_split(train_cola, test_size=0.2, random_state=1234)

In [16]:
train

Unnamed: 0,acceptability,sentence,domain
5007,1,To please John is tough.,in
7480,1,John did not like Mary.,in
5454,0,John is not more reliable a fellow than Bill.,in
2175,1,Joan knew the answer.,in
3170,1,The baby dressed.,in
...,...,...,...
6137,0,Was been hit by Bill by the baseball.,in
664,1,Mary considers John a fool and Bill a wimp.,in
7540,0,Mary appreciates John and himself.,in
7221,1,Henri wants the book which is on the top shelf.,in


In [17]:
print("Number of acceptable/unacceptable sentences in train set")
print("-"*56)
train['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in train set
--------------------------------------------------------


1    4802
0    2038
Name: acceptability, dtype: int64

In [18]:
acc = round(len(train[train['acceptability'] == 1]) / len(train)*100)
print(f'Percentage of training instances which are acceptable: {acc}%')

Percentage of training instances which are acceptable: 70%


In [19]:
validation

Unnamed: 0,acceptability,sentence,domain
4749,1,Which man did you talk to?,in
7987,1,What she thought was that the poison was neutr...,in
3851,1,The teacher made students happy.,in
8430,0,I have sent 0 letter to Environmental Heath,in
7780,0,We believed to be omnipotent.,in
...,...,...,...
8001,1,Evan's every idea was completely insane.,in
2244,1,Faustina sprayed the lilies.,in
4185,1,I am fond of him.,in
2437,1,Tabs were kept on the suspect.,in


In [20]:
print("Number of acceptable/unacceptable sentences in validation set")
print("-"*61)
validation['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in validation set
-------------------------------------------------------------


1    1221
0     490
Name: acceptability, dtype: int64

In [21]:
acc = round(len(validation[validation['acceptability'] == 1]) / len(validation)*100)
print(f'Percentage of validation instances which are acceptable: {acc}%')

Percentage of validation instances which are acceptable: 71%


# Dealing with class imbalance (Rachel's approach)

https://pypi.org/project/nlpaug/0.0.5/

## Create a random augmentation function using nlpaug

In [22]:
def augment(text):
  choices = ['swap', 'delete', 'crop']
  if len(text.split(' ')) <= 6:
    choice = 'swap'
  else:
    choice = random.choice(choices)

  if choice == 'swap':
    aug = naw.RandomWordAug(action="swap")
  elif choice == 'delete':
    aug = naw.RandomWordAug()
  elif choice == 'crop':
    aug = naw.RandomWordAug(action='crop')

  augmented_text = aug.augment(text)
  return augmented_text

## Apply the random augmentation function to each of the unacceptable examples to create a more balanced dataset

In [23]:
unacceptable = train[train['acceptability'] == 0]['sentence'].tolist()
print("First 20 instances of unacceptable sentences in train set:")
print("-"*90)
unacceptable[:20]

First 20 instances of unacceptable sentences in train set:
------------------------------------------------------------------------------------------


['John is not more reliable a fellow than Bill.',
 "Myra took Betty's snooze.",
 'Bill left when that no one else was awake is certain.',
 'The car honked down the road.',
 'The cake was been eating.',
 'The weather rained',
 'She said Moya liked football.',
 'it is the tall man come from the back that Mary saw the tall man come from the back.',
 'It is an alive fish.',
 'Tony bent the table with the rod.',
 'The magician vanished a rabbit into thin air.',
 'Who did Herb believe the claim that cheated?',
 'On which I consulted with the chairman of the Select Committee was this matter.',
 'I lent the book all the way to Tony.',
 'We are knowing this theory.',
 'Lilly recounted a story to remember because Holly had also recounted a story to.',
 'The doctor cured pneumonia from Pat.',
 'Steve tossed the wall with the ball.',
 'John convinced it to be obvious that Bill left.',
 "I'm sure that I ever met him."]

In [24]:
# apply random augmentation to each of the unacceptable samples to create more unacceptable samples
augmented = []
for each in unacceptable:
  each_aug = augment(each[:-1])
  augmented.append(each_aug[0] + each[-1])

print("First 20 instances of unacceptable sentences in augmented set:")
print("-"*90)
augmented[:20]

First 20 instances of unacceptable sentences in augmented set:
------------------------------------------------------------------------------------------


['John is not fellow than Bill.',
 "Took myra Betty ' snooze s.",
 'Bill when left no that one was else awake certain is.',
 'The car honked down the road.',
 'The cake eating was been.',
 'The raine weatherd',
 'Said she liked Moya football.',
 'is the it man tall come the from back that Mary saw the come tall man from the back.',
 'It is an alive fish.',
 'Bent table with the rod.',
 'The magician vanished a rabbit into air thin.',
 'Did who Herb claim believe the that cheated?',
 'Which on consulted I with the chairman the of Committee Select was matter this.',
 'I lent book all the to.',
 'Are we this knowing theory.',
 'Lilly recounted story a remember to because Holly had recounted also to a story.',
 'The cured doctor from pneumonia Pat.',
 'Tossed wall with the ball.',
 'John convinced it that Bill left.',
 "' m that ever met him."]

In [25]:
# find out how many more augmented unacceptable samples we need to create a balanced set
subsample = random.sample(augmented, len(train[train['acceptability'] == 1]) - (len(train[train['acceptability'] == 0]) *2))


len(subsample)

726

In [26]:
# apply random augmentation to each of the subsample
for each in subsample:
  each_aug = augment(each[:-1])
  augmented.append(each_aug[0] + each[-1])

len(augmented)

2764

In [27]:
augmented_dict = {
    'acceptability': 0,
    'sentence': augmented
}

In [28]:
augmented_df = pd.DataFrame(augmented_dict)

augmented_df

Unnamed: 0,acceptability,sentence
0,0,John is not fellow than Bill.
1,0,Took myra Betty ' snooze s.
2,0,Bill when left no that one was else awake cert...
3,0,The car honked down the road.
4,0,The cake eating was been.
...,...,...
2759,0,"Likes, Sandy Lee and."
2760,0,Girl in the red coat will put on your.
2761,0,Has her her kissed.
2762,0,In is putting the.


In [29]:
balanced_train = pd.concat([train,augmented_df])

print("Number of acceptable/unacceptable sentences in balanced_train")
print("-"*60)
balanced_train['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in balanced_train
------------------------------------------------------------


1    4802
0    4802
Name: acceptability, dtype: int64

# Save files

In [30]:
# for use with models
balanced_train.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_raw_balanced_train.csv', index=False)
validation.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_raw_validation.csv', index=False)
test_cola.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_raw_test.csv', index=False)