# Import Libraries

In [128]:
# standard library
import pandas as pd

# drive access
from google.colab import drive
drive.mount('/content/drive')

# train/val split
from sklearn.model_selection import StratifiedKFold, train_test_split

# for augmentation
!pip install nlpaug -q
import nlpaug.augmenter.word as naw
import random

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load CoLA dataset (n=9594 sentences, split into train/dev)

- **raw**:
  - in domain train (n=8551) / dev set (n=527)
  - out of domain dev set (n=516)

- **tokenizer** (using NLTK tokenizer)
  - in domain train/dev set
  - out of domain dev set

- **major annotations**
  - 1043 sentences annotated by major category of grammatical error
  
- **minor annotations**
  - 1043 sentences annotated by minor category of grammatical error

**NOTE: Original CoLA paper mentions > 10K sentences, but that includes the held-out test set found in Kaggle competitions. The 'test' set mentioned in this notebook is actually the validation set mentioned in the paper.**

# Define functions

- Add period to sentences without final period.
- Add a capital letter to sentences without an initial capital letter.

In [129]:
def fix_sentence(str):
  '''
  Changes trailing commas or colons to period,
  adds final period to sentences without period.
  Leaves sentences with any other punctuation untouched.
  '''
  if str[-1] in ["?", ".", "!"]:
    return str
  elif str[-1] == ",":
    return str[:-1] + "."
  elif str[-1] == ":":
    return str[:-1] + "."
  else:
    return str + "."

In [130]:
def add_cap(str):
  '''
  Changes first letter to capital letter,
  only if first letter isn't a capital letter.
  Leaves rest of the sentence untouched.
  '''
  if str[0][0].isupper():
      return str
  else:
      return str[0][0].upper() +  str[1:]

In [131]:
def clean_df(df):
  '''
  Select and order columns for final train and test dfs:
  'sentence','acceptability','source','domain'
  where sentence is the clean sentence with puncutation
  and capitalization.
  '''
  df = df.drop(['punct_sentence','sentence','source','domain'], axis=1)

  # rename columns. Now 'sentence' includes puncutation and capitalization
  fin_column_names = ['acceptability','sentence']
  df = df.set_axis(fin_column_names, axis=1)

  # reorder
  df = df.loc[:,['sentence','acceptability']]
  df.head()

  return df

# Clean raw dataset

## Train

In [132]:
# Train: 8551 sentences
train_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/in_domain_train.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
train_cola = train_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
train_cola['domain'] = pd.Series(['in' for x in range(len(train_cola.index))])
train_cola.head()

train_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,,One more pseudo generalization and I'm giving up.,in
2,gj04,1,,One more pseudo generalization or I'm giving up.,in
3,gj04,1,,"The more we study verbs, the crazier they get.",in
4,gj04,1,,Day by day the facts are getting murkier.,in


In [133]:
# see samples of acceptable sentences
train_cola[train_cola['acceptability']==1].head()


Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,,One more pseudo generalization and I'm giving up.,in
2,gj04,1,,One more pseudo generalization or I'm giving up.,in
3,gj04,1,,"The more we study verbs, the crazier they get.",in
4,gj04,1,,Day by day the facts are getting murkier.,in


In [134]:
# see samples of unacceptable sentences
train_cola[train_cola['acceptability']==0].head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
18,gj04,0,*,They drank the pub.,in
20,gj04,0,*,The professor talked us.,in
22,gj04,0,*,We yelled ourselves.,in
23,gj04,0,*,We yelled Harry hoarse.,in
25,gj04,0,*,Harry coughed himself.,in


### Remove author's acceptability? YES

The acceptability column reduces all NaN in authors_acceptability to 1 in acceptability, and all levels (*, ?*,*?, ??) to 0.

From the CoLA paper:

"*When examples appear with non-Boolean judgments (this occurs in less than 3% of cases), we either exclude them (for labels ‘?’ or ‘#’), or label them unacceptable (‘??’ and ‘*?’).*"

In [135]:
train_cola[['acceptability','authors_acceptability']]
train_cola[train_cola['authors_acceptability']=='??']

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
136,cj99,0,??,"I can well imagine the more him eating, the fa...",in
301,cj99,0,??,I finally worked up enough courage to ask whic...,in
302,cj99,0,??,Which folks up at corporate headquarters do yo...,in
303,cj99,0,??,This is a problem that you'll be able to tell ...,in
305,cj99,0,??,This is a problem that you solve it and you'll...,in
306,cj99,0,??,Those are the folks that you just solve this p...,in
307,cj99,0,??,They failed to tell me which problem I'll beat...,in
308,cj99,0,??,This is the problem that you'll beat the compe...,in
361,bc01,0,??,Which problem do you wonder whether John said ...,in
400,bc01,0,??,How many people do you wonder whether I consid...,in


In [136]:
# remove authors_acceptability since lots of NaN
train_cola.drop(['authors_acceptability'], axis=1, inplace= True)
train_cola.isna().sum()
train_cola.head()


Unnamed: 0,source,acceptability,sentence,domain
0,gj04,1,"Our friends won't buy this analysis, let alone...",in
1,gj04,1,One more pseudo generalization and I'm giving up.,in
2,gj04,1,One more pseudo generalization or I'm giving up.,in
3,gj04,1,"The more we study verbs, the crazier they get.",in
4,gj04,1,Day by day the facts are getting murkier.,in


### Add punctuation
Add a period to all those with no punctuation. Caveats: we won't get emotion from it.


In [137]:
# check to see which acceptable sentences don't have any punctuation
punctuation = (".", "?",'!')
no_punct = train_cola[(train_cola['acceptability']==1)& (train_cola["sentence"].str.endswith(punctuation)==False)]
no_punct.head()

Unnamed: 0,source,acceptability,sentence,domain
7699,ad03,1,She tried to leave,in
7703,ad03,1,Paul had eighty eight billion sixty three mill...,in
7706,ad03,1,The landlord donated a helicopter,in
7710,ad03,1,Michael abandoned an old friend at Mardi Gras,in
7711,ad03,1,You friends of the king are all the same,in


Check unacceptable sentences to see whether adding punctuation/capitalization makes sense

- Only 5 sentences wihout final period, and adding the period won't make them acceptable, so will add it.

In [138]:
# check to see which unacceptable sentences don't have any punctuation
punctuation = (".", "?",'!')
no_punct = train_cola[(train_cola['acceptability']==0)& (train_cola["sentence"].str.endswith(punctuation)==False)]
no_punct.head()

Unnamed: 0,source,acceptability,sentence,domain
7701,ad03,0,Gilgamesh doesn't be in the dungeon,in
7713,ad03,0,Lucy's Gomez's wallet,in
7731,ad03,0,They kicked himself,in
7748,ad03,0,He book,in
7751,ad03,0,Kiss pigs is my happiest memory,in


In [139]:
# Apply punctuation
train_cola['punct_sentence'] = train_cola['sentence'].apply(fix_sentence)

In [140]:
# check to see if it worked
punctuation = (".", "?",'!')
no_punct = train_cola[train_cola["punct_sentence"].str.endswith(punctuation)==False]
no_punct.head()


Unnamed: 0,source,acceptability,sentence,domain,punct_sentence


In [141]:
# make sure no NAs were added
train_cola.isna().sum()

source            0
acceptability     0
sentence          0
domain            0
punct_sentence    0
dtype: int64

### Capitalize first letter

In [142]:
# check to see which sentences aren't capitalized
no_cap = train_cola[train_cola["punct_sentence"].str.istitle()==False]
no_cap.head()

Unnamed: 0,source,acceptability,sentence,domain,punct_sentence
0,gj04,1,"Our friends won't buy this analysis, let alone...",in,"Our friends won't buy this analysis, let alone..."
1,gj04,1,One more pseudo generalization and I'm giving up.,in,One more pseudo generalization and I'm giving up.
2,gj04,1,One more pseudo generalization or I'm giving up.,in,One more pseudo generalization or I'm giving up.
3,gj04,1,"The more we study verbs, the crazier they get.",in,"The more we study verbs, the crazier they get."
4,gj04,1,Day by day the facts are getting murkier.,in,Day by day the facts are getting murkier.


In [143]:
# Apply capitlization
train_cola['capped_sentence'] = train_cola['punct_sentence'].apply(add_cap)
train_cola

Unnamed: 0,source,acceptability,sentence,domain,punct_sentence,capped_sentence
0,gj04,1,"Our friends won't buy this analysis, let alone...",in,"Our friends won't buy this analysis, let alone...","Our friends won't buy this analysis, let alone..."
1,gj04,1,One more pseudo generalization and I'm giving up.,in,One more pseudo generalization and I'm giving up.,One more pseudo generalization and I'm giving up.
2,gj04,1,One more pseudo generalization or I'm giving up.,in,One more pseudo generalization or I'm giving up.,One more pseudo generalization or I'm giving up.
3,gj04,1,"The more we study verbs, the crazier they get.",in,"The more we study verbs, the crazier they get.","The more we study verbs, the crazier they get."
4,gj04,1,Day by day the facts are getting murkier.,in,Day by day the facts are getting murkier.,Day by day the facts are getting murkier.
...,...,...,...,...,...,...
8546,ad03,0,Poseidon appears to own a dragon,in,Poseidon appears to own a dragon.,Poseidon appears to own a dragon.
8547,ad03,0,Digitize is my happiest memory,in,Digitize is my happiest memory.,Digitize is my happiest memory.
8548,ad03,1,It is easy to slay the Gorgon.,in,It is easy to slay the Gorgon.,It is easy to slay the Gorgon.
8549,ad03,1,I had the strangest feeling that I knew you.,in,I had the strangest feeling that I knew you.,I had the strangest feeling that I knew you.


In [144]:
# sanity check
train_cola.isna().sum()

source             0
acceptability      0
sentence           0
domain             0
punct_sentence     0
capped_sentence    0
dtype: int64

## Test

In [145]:
# Test in domain: 527 sentences
test_in_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/in_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
test_in_cola = test_in_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
test_in_cola['domain'] = pd.Series(['in' for x in range(len(test_in_cola.index))])
test_in_cola.head()


Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,The sailors rode the breeze clear of the rocks.,in
1,gj04,1,,The weights made the rope stretch over the pul...,in
2,gj04,1,,The mechanical doll wriggled itself loose.,in
3,cj99,1,,"If you had eaten more, you would want less.",in
4,cj99,0,*,"As you eat the most, you want the least.",in


### Remove author's acceptability

In [146]:
# remove authors_acceptability since lots of NaN
test_in_cola.drop(['authors_acceptability'], axis=1, inplace= True)
test_in_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

In [147]:
# Test out domain: 516 sentences
test_out_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/raw/out_of_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
test_out_cola = test_out_cola.set_axis(column_names, axis=1)
test_out_cola.head()


# add column to specify domain (in/out of domain)
test_out_cola['domain'] = pd.Series(['out' for x in range(len(test_out_cola.index))])
test_out_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,clc95,1,,Somebody just left - guess who.,out
1,clc95,1,,"They claimed they had settled on something, bu...",out
2,clc95,1,,"If Sam was going, Sally would know where.",out
3,clc95,1,,"They're going to serve the guests something, b...",out
4,clc95,1,,She's reading. I can't imagine what.,out


In [148]:
# remove authors_acceptability since lots of NaN
test_out_cola.drop(['authors_acceptability'], axis=1, inplace= True)
test_out_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

### Merge in-domain and out-of-domain test sets

In [149]:
test_cola = pd.concat([test_in_cola, test_out_cola])
test_cola.head()


Unnamed: 0,source,acceptability,sentence,domain
0,gj04,1,The sailors rode the breeze clear of the rocks.,in
1,gj04,1,The weights made the rope stretch over the pul...,in
2,gj04,1,The mechanical doll wriggled itself loose.,in
3,cj99,1,"If you had eaten more, you would want less.",in
4,cj99,0,"As you eat the most, you want the least.",in


### Add punctuation

In [150]:
# Apply punctuation
test_cola['punct_sentence'] = test_cola['sentence'].apply(fix_sentence)

In [151]:
# check to see if it worked
punctuation = (".", "?",'!')
no_punct = test_cola[test_cola["punct_sentence"].str.endswith(punctuation)==False]
no_punct.head()

Unnamed: 0,source,acceptability,sentence,domain,punct_sentence


In [152]:
# sanity check
test_cola.isna().sum()

source            0
acceptability     0
sentence          0
domain            0
punct_sentence    0
dtype: int64

### Capitalize first letter




In [153]:
test_cola['capped_sentence'] = test_cola['punct_sentence'].apply(add_cap)
test_cola.head()

Unnamed: 0,source,acceptability,sentence,domain,punct_sentence,capped_sentence
0,gj04,1,The sailors rode the breeze clear of the rocks.,in,The sailors rode the breeze clear of the rocks.,The sailors rode the breeze clear of the rocks.
1,gj04,1,The weights made the rope stretch over the pul...,in,The weights made the rope stretch over the pul...,The weights made the rope stretch over the pul...
2,gj04,1,The mechanical doll wriggled itself loose.,in,The mechanical doll wriggled itself loose.,The mechanical doll wriggled itself loose.
3,cj99,1,"If you had eaten more, you would want less.",in,"If you had eaten more, you would want less.","If you had eaten more, you would want less."
4,cj99,0,"As you eat the most, you want the least.",in,"As you eat the most, you want the least.","As you eat the most, you want the least."


In [154]:
# sanity check
test_cola.isna().sum()

source             0
acceptability      0
sentence           0
domain             0
punct_sentence     0
capped_sentence    0
dtype: int64

# Clean tokenized data set

Note: The only cleaning done here was changing the column names, and removing the author's acceptability column. I didn't work on the punctuation or capitalization since it would probably mess up the tokenizing. Anyways, we're not going to be using the NLTK tokenizer in any of our grammatical acceptability classifiers (I would think).

## Tokenized - Train

In [155]:
# tokenized with the NLTK tokenizer
tokenized_train_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/tokenized/in_domain_train.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
tokenized_train_cola = tokenized_train_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
tokenized_train_cola['domain'] = pd.Series(['in' for x in range(len(tokenized_train_cola.index))])
tokenized_train_cola.head()

tokenized_train_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,"our friends wo n't buy this analysis , let alo...",in
1,gj04,1,,one more pseudo generalization and i 'm giving...,in
2,gj04,1,,one more pseudo generalization or i 'm giving ...,in
3,gj04,1,,"the more we study verbs , the crazier they get .",in
4,gj04,1,,day by day the facts are getting murkier .,in


In [156]:
# remove authors_acceptability since lots of NaN
tokenized_train_cola.drop(['authors_acceptability'], axis=1, inplace= True)
tokenized_train_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

## Tokenized - Test

In [157]:
# tokenized with the NLTK tokenizer
tokenized_in_test_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/tokenized/in_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
tokenized_in_test_cola = tokenized_in_test_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
tokenized_in_test_cola['domain'] = pd.Series(['in' for x in range(len(tokenized_in_test_cola.index))])
tokenized_in_test_cola.head()

tokenized_in_test_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,gj04,1,,the sailors rode the breeze clear of the rocks .,in
1,gj04,1,,the weights made the rope stretch over the pul...,in
2,gj04,1,,the mechanical doll wriggled itself loose .,in
3,cj99,1,,"if you had eaten more , you would want less .",in
4,cj99,0,*,"as you eat the most , you want the least .",in


In [158]:
# remove authors_acceptability since lots of NaN
tokenized_in_test_cola.drop(['authors_acceptability'], axis=1, inplace= True)
tokenized_in_test_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

In [159]:
# tokenized with the NLTK tokenizer
tokenized_out_test_cola = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/cola_public/tokenized/out_of_domain_dev.tsv', sep='\t', header=None)

# change column names
column_names = ['source','acceptability','authors_acceptability','sentence']
tokenized_out_test_cola = tokenized_out_test_cola.set_axis(column_names, axis=1)

# add column to specify domain (in/out of domain)
tokenized_out_test_cola['domain'] = pd.Series(['out' for x in range(len(tokenized_out_test_cola.index))])
tokenized_out_test_cola.head()

tokenized_out_test_cola.head()

Unnamed: 0,source,acceptability,authors_acceptability,sentence,domain
0,clc95,1,,somebody just left - guess who .,out
1,clc95,1,,"they claimed they had settled on something , b...",out
2,clc95,1,,"if sam was going , sally would know where .",out
3,clc95,1,,"they 're going to serve the guests something ,...",out
4,clc95,1,,she 's reading . i ca n't imagine what .,out


In [160]:
# remove authors_acceptability since lots of NaN
tokenized_out_test_cola.drop(['authors_acceptability'], axis=1, inplace= True)
tokenized_out_test_cola.isna().sum()

source           0
acceptability    0
sentence         0
domain           0
dtype: int64

## Merge in / out-of domain tokenized test data

In [161]:
test_token_cola = pd.concat([tokenized_in_test_cola, tokenized_out_test_cola])
test_token_cola.head()

Unnamed: 0,source,acceptability,sentence,domain
0,gj04,1,the sailors rode the breeze clear of the rocks .,in
1,gj04,1,the weights made the rope stretch over the pul...,in
2,gj04,1,the mechanical doll wriggled itself loose .,in
3,cj99,1,"if you had eaten more , you would want less .",in
4,cj99,0,"as you eat the most , you want the least .",in


# Clean minor/major grammatical annotations

## Minor grammatical annotations


In [162]:
# 1043 minor grammatical annotations
minor = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/CoLA_grammatical_annotations/CoLA_grammatical_annotations_minor_features.tsv', sep='\t')

# change column names to lower case
minor.columns = [c.lower() for c in minor]

# change spaces to underscore
minor.columns = [c.replace(' ', '_') for c in minor]

# strip leading white spaces
minor.columns = [c.lstrip() for c in minor]

# strip trailing white spaces
minor.columns = [c.rstrip() for c in minor]

minor.head()


Unnamed: 0,source,domain,acceptability,sentence,simple,copula,pred/sc,result/depictive,particle,vp_adjunct,...,subordinate/cond,ellipsis/anaphor,s-adjunct,quantifier,partitive,npi/fci,comparative,sem__violation,infl/agr_violation,extra/mising_expr
0,gj04,In,1,The sailors rode the breeze clear of the rocks.,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,gj04,In,1,The weights made the rope stretch over the pul...,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,gj04,In,1,The mechanical doll wriggled itself loose.,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,cj99,In,1,"If you had eaten more, you would want less.",0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,cj99,In,0,"As you eat the most, you want the least.",0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


### Capitalization and puntuaction

In [163]:
minor['punct_sentence'] = minor['sentence'].apply(fix_sentence)

In [164]:
# check to see if it worked
punctuation = (".", "?",'!')
no_punct = minor[minor["punct_sentence"].str.endswith(punctuation)==False]
no_punct.head()

Unnamed: 0,source,domain,acceptability,sentence,simple,copula,pred/sc,result/depictive,particle,vp_adjunct,...,ellipsis/anaphor,s-adjunct,quantifier,partitive,npi/fci,comparative,sem__violation,infl/agr_violation,extra/mising_expr,punct_sentence


In [165]:
minor['capped_sentence'] = minor['punct_sentence'].apply(add_cap)
minor.head()

Unnamed: 0,source,domain,acceptability,sentence,simple,copula,pred/sc,result/depictive,particle,vp_adjunct,...,s-adjunct,quantifier,partitive,npi/fci,comparative,sem__violation,infl/agr_violation,extra/mising_expr,punct_sentence,capped_sentence
0,gj04,In,1,The sailors rode the breeze clear of the rocks.,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,The sailors rode the breeze clear of the rocks.,The sailors rode the breeze clear of the rocks.
1,gj04,In,1,The weights made the rope stretch over the pul...,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,The weights made the rope stretch over the pul...,The weights made the rope stretch over the pul...
2,gj04,In,1,The mechanical doll wriggled itself loose.,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,The mechanical doll wriggled itself loose.,The mechanical doll wriggled itself loose.
3,cj99,In,1,"If you had eaten more, you would want less.",0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"If you had eaten more, you would want less.","If you had eaten more, you would want less."
4,cj99,In,0,"As you eat the most, you want the least.",0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,"As you eat the most, you want the least.","As you eat the most, you want the least."


In [166]:
# sanity check
minor.isna().sum()

source                0
domain                0
acceptability         0
sentence              0
simple                0
                     ..
sem__violation        0
infl/agr_violation    0
extra/mising_expr     0
punct_sentence        0
capped_sentence       0
Length: 69, dtype: int64

In [167]:
# drop punctuation sentence and original sentence
minor = minor.drop(['punct_sentence','sentence'], axis=1)

# rename capped_sentence
minor = minor.rename(columns={'capped_sentence': 'sentence'})
minor.head()


Unnamed: 0,source,domain,acceptability,simple,copula,pred/sc,result/depictive,particle,vp_adjunct,np_adjunct,...,ellipsis/anaphor,s-adjunct,quantifier,partitive,npi/fci,comparative,sem__violation,infl/agr_violation,extra/mising_expr,sentence
0,gj04,In,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,The sailors rode the breeze clear of the rocks.
1,gj04,In,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,The weights made the rope stretch over the pul...
2,gj04,In,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,The mechanical doll wriggled itself loose.
3,cj99,In,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"If you had eaten more, you would want less."
4,cj99,In,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,"As you eat the most, you want the least."


## Major grammatical annotations

- Simple
- Pred
- Adjunct
- Arg types
- Arg altern
- Bind
- Question
- Comp clause
- Auxiliary
- to-VP
- N, Adj
- S-Syntax
- Determiner


In [168]:
# 1043 major grammatical annotations
major = pd.read_csv('/content/drive/MyDrive/266/Data/Raw_Data/CoLA/CoLA_grammatical_annotations/CoLA_grammatical_annotations_major_features.tsv', sep='\t')

# change column names to lower case
major.columns = [c.lower() for c in major]


# change spaces to underscore
major.columns = [c.replace(' ', '_') for c in major]

# strip leading white spaces
major.columns = [c.lstrip() for c in major]

# strip trailing white spaces
major.columns = [c.rstrip() for c in major]

major.head()


Unnamed: 0,source,domain,acceptability,sentence,simple,predicate,adjunct,argument_type,arg_altern,imperative,binding,question,comp_clause,auxiliary,to-vp,"n,_adj",s-syntax,determiner,violations
0,gj04,In,1,The sailors rode the breeze clear of the rocks.,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0
1,gj04,In,1,The weights made the rope stretch over the pul...,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0
2,gj04,In,1,The mechanical doll wriggled itself loose.,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0
3,cj99,In,1,"If you had eaten more, you would want less.",0,0,0,0,0,0,0,0,0,1,0,0,1,1,0
4,cj99,In,0,"As you eat the most, you want the least.",0,0,0,0,0,0,0,0,0,0,0,0,1,1,0


### Capitalization and punctuation

In [169]:
major['punct_sentence'] = major['sentence'].apply(fix_sentence)

In [170]:
# check to see if it worked
no_punct = major[major["punct_sentence"].str.endswith(punctuation)==False]
no_punct.head()

Unnamed: 0,source,domain,acceptability,sentence,simple,predicate,adjunct,argument_type,arg_altern,imperative,binding,question,comp_clause,auxiliary,to-vp,"n,_adj",s-syntax,determiner,violations,punct_sentence


In [171]:
major['capped_sentence'] = major['punct_sentence'].apply(add_cap)
major.head()

Unnamed: 0,source,domain,acceptability,sentence,simple,predicate,adjunct,argument_type,arg_altern,imperative,...,question,comp_clause,auxiliary,to-vp,"n,_adj",s-syntax,determiner,violations,punct_sentence,capped_sentence
0,gj04,In,1,The sailors rode the breeze clear of the rocks.,0,1,0,1,1,0,...,0,0,0,0,1,0,0,0,The sailors rode the breeze clear of the rocks.,The sailors rode the breeze clear of the rocks.
1,gj04,In,1,The weights made the rope stretch over the pul...,0,1,0,1,1,0,...,0,0,0,1,0,0,0,0,The weights made the rope stretch over the pul...,The weights made the rope stretch over the pul...
2,gj04,In,1,The mechanical doll wriggled itself loose.,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,The mechanical doll wriggled itself loose.,The mechanical doll wriggled itself loose.
3,cj99,In,1,"If you had eaten more, you would want less.",0,0,0,0,0,0,...,0,0,1,0,0,1,1,0,"If you had eaten more, you would want less.","If you had eaten more, you would want less."
4,cj99,In,0,"As you eat the most, you want the least.",0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,"As you eat the most, you want the least.","As you eat the most, you want the least."


In [172]:
# sanity check
major.isna().sum()

source             0
domain             0
acceptability      0
sentence           0
simple             0
predicate          0
adjunct            0
argument_type      0
arg_altern         0
imperative         0
binding            0
question           0
comp_clause        0
auxiliary          0
to-vp              0
n,_adj             0
s-syntax           0
determiner         0
violations         0
punct_sentence     0
capped_sentence    0
dtype: int64

In [173]:
# drop punctuation sentence and original sentence
major = major.drop(['punct_sentence','sentence'], axis=1)

# rename capped_sentence
major = major.rename(columns={'capped_sentence': 'sentence'})
major.head()

Unnamed: 0,source,domain,acceptability,simple,predicate,adjunct,argument_type,arg_altern,imperative,binding,question,comp_clause,auxiliary,to-vp,"n,_adj",s-syntax,determiner,violations,sentence
0,gj04,In,1,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,The sailors rode the breeze clear of the rocks.
1,gj04,In,1,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0,The weights made the rope stretch over the pul...
2,gj04,In,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,The mechanical doll wriggled itself loose.
3,cj99,In,1,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,"If you had eaten more, you would want less."
4,cj99,In,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,"As you eat the most, you want the least."


# EDA

- Class imbalance in training set?
- Instances with two sentences?

In [174]:
# Class imbalance
train_cola[['acceptability']].value_counts()

acceptability
1                6023
0                2528
dtype: int64

In [175]:
# instances with two sentences
# Regex pattern to match a sentence ending with '.', '?' or '!' followed by a space
pattern = r'[^.!?]+[.!?]'

# Filter rows with two sentences using regex
filtered_df = train_cola[train_cola['capped_sentence'].str.count(pattern) == 2]
print("Unacceptable instances with >1 punctuation mark:",len(filtered_df[filtered_df['acceptability']==0]))
print("Acceptable instances with >1 punctuation mark:",len(filtered_df[filtered_df['acceptability']==1]))


print("Remove sentences from count which have Mr. Mrs. Dr. or P.M. ")
print('-'*100)
filtered_df = filtered_df[~filtered_df['capped_sentence'].str.contains(r'(?i)(Mr|Mrs|Dr|P\.M)\.')]


print("Unacceptable instances with >1 punctuation mark:",len(filtered_df[filtered_df['acceptability']==0]))
print("Acceptable instances with >1 punctuation mark:",len(filtered_df[filtered_df['acceptability']==1]))


print("Acceptable:")

filtered_df[filtered_df['acceptability']==1]


Unacceptable instances with >1 punctuation mark: 4
Acceptable instances with >1 punctuation mark: 21
Remove sentences from count which have Mr. Mrs. Dr. or P.M. 
----------------------------------------------------------------------------------------------------
Unacceptable instances with >1 punctuation mark: 1
Acceptable instances with >1 punctuation mark: 12
Acceptable:


  filtered_df = filtered_df[~filtered_df['capped_sentence'].str.contains(r'(?i)(Mr|Mrs|Dr|P\.M)\.')]


Unnamed: 0,source,acceptability,sentence,domain,punct_sentence,capped_sentence
1419,r-67,1,I gave to the officer in charge the blackjack....,in,I gave to the officer in charge the blackjack....,I gave to the officer in charge the blackjack....
2025,rhl07,1,Where did you throw the ball? To third base.,in,Where did you throw the ball? To third base.,Where did you throw the ball? To third base.
2026,rhl07,1,Where did you send the bicycle? To Rome.,in,Where did you send the bicycle? To Rome.,Where did you send the bicycle? To Rome.
3805,ks08,1,Are you going on holiday before or after Easte...,in,Are you going on holiday before or after Easte...,Are you going on holiday before or after Easte...
5185,kl93,1,Do you have dry socks? claim.,in,Do you have dry socks? claim.,Do you have dry socks? claim.
5209,kl93,1,Every man who has any matches is happy. happy.,in,Every man who has any matches is happy. happy.,Every man who has any matches is happy. happy.
5865,c_13,1,Eloise wants you to study a new language. assu...,in,Eloise wants you to study a new language. assu...,Eloise wants you to study a new language. assu...
7087,sgww85,1,"Kim likes Sandy, and Lee Leslie. to try to go ...",in,"Kim likes Sandy, and Lee Leslie. to try to go ...","Kim likes Sandy, and Lee Leslie. to try to go ..."
7088,sgww85,1,"Pat wanted to try to go to Berne, and Chris to...",in,"Pat wanted to try to go to Berne, and Chris to...","Pat wanted to try to go to Berne, and Chris to..."
8269,ad03,1,Have you seen Mary? I have vP seen Mary,in,Have you seen Mary? I have vP seen Mary.,Have you seen Mary? I have vP seen Mary.


# Remove unwanted columns

In [176]:
train_cola = clean_df(train_cola)

In [177]:
test_cola = clean_df(test_cola)

# Train:test split (Rachel's approach)

In [178]:
train, validation = train_test_split(df, test_size=0.2, random_state=1234)

In [179]:
train

Unnamed: 0,sentence,acceptability
5007,To please John is tough.,1
7480,John did not like Mary.,1
5454,John is not more reliable a fellow than Bill.,0
2175,Joan knew the answer.,1
3170,The baby dressed.,1
...,...,...
6137,Was been hit by Bill by the baseball.,0
664,Mary considers John a fool and Bill a wimp.,1
7540,Mary appreciates John and himself.,0
7221,Henri wants the book which is on the top shelf.,1


In [180]:
print("Number of acceptable/unacceptable sentences in train set")
print("-"*56)
train['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in train set
--------------------------------------------------------


1    4802
0    2038
Name: acceptability, dtype: int64

In [181]:
acc = round(len(train[train['acceptability'] == 1]) / len(df)*100)
print(f'Percentage of training instances which are acceptable: {acc}%')

Percentage of training instances which are acceptable: 56%


In [182]:
validation

Unnamed: 0,sentence,acceptability
4749,Which man did you talk to?,1
7987,What she thought was that the poison was neutr...,1
3851,The teacher made students happy.,1
8430,I have sent 0 letter to Environmental Heath.,0
7780,We believed to be omnipotent.,0
...,...,...
8001,Evan's every idea was completely insane.,1
2244,Faustina sprayed the lilies.,1
4185,I am fond of him.,1
2437,Tabs were kept on the suspect.,1


In [183]:
print("Number of acceptable/unacceptable sentences in validation set")
print("-"*61)
validation['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in validation set
-------------------------------------------------------------


1    1221
0     490
Name: acceptability, dtype: int64

In [184]:
acc = round(len(validation[validation['acceptability'] == 1]) / len(df)*100)
print(f'Percentage of validation instances which are acceptable: {acc}%')

Percentage of validation instances which are acceptable: 14%


# Dealing with class imbalance (Rachel's approach)

https://pypi.org/project/nlpaug/0.0.5/

## Create a random augmentation function using nlpaug

In [185]:
def augment(text):
  choices = ['swap', 'delete', 'crop']
  if len(text.split(' ')) <= 6:
    choice = 'swap'
  else:
    choice = random.choice(choices)

  if choice == 'swap':
    aug = naw.RandomWordAug(action="swap")
  elif choice == 'delete':
    aug = naw.RandomWordAug()
  elif choice == 'crop':
    aug = naw.RandomWordAug(action='crop')

  augmented_text = aug.augment(text)
  return augmented_text

## Apply the random augmentation function to each of the unacceptable examples to create a more balanced dataset

In [186]:
unacceptable = train[train['acceptability'] == 0]['sentence'].tolist()
print("First 20 instances of unacceptable sentences in train set:")
print("-"*90)
unacceptable[:20]

First 20 instances of unacceptable sentences in train set:
------------------------------------------------------------------------------------------


['John is not more reliable a fellow than Bill.',
 "Myra took Betty's snooze.",
 'Bill left when that no one else was awake is certain.',
 'The car honked down the road.',
 'The cake was been eating.',
 'The weather rained.',
 'She said Moya liked football.',
 'It is the tall man come from the back that Mary saw the tall man come from the back.',
 'It is an alive fish.',
 'Tony bent the table with the rod.',
 'The magician vanished a rabbit into thin air.',
 'Who did Herb believe the claim that cheated?',
 'On which I consulted with the chairman of the Select Committee was this matter.',
 'I lent the book all the way to Tony.',
 'We are knowing this theory.',
 'Lilly recounted a story to remember because Holly had also recounted a story to.',
 'The doctor cured pneumonia from Pat.',
 'Steve tossed the wall with the ball.',
 'John convinced it to be obvious that Bill left.',
 "I'm sure that I ever met him."]

In [187]:
# apply random augmentation to each of the unacceptable samples to create more unacceptable samples
augmented = []
for each in unacceptable:
  each_aug = augment(each[:-1])
  augmented.append(each_aug[0] + each[-1])

print("First 20 instances of unacceptable sentences in augmented set:")
print("-"*90)
augmented[:20]

First 20 instances of unacceptable sentences in augmented set:
------------------------------------------------------------------------------------------


['John is not more fellow Bill.',
 "Took myra ' Betty s snooze.",
 'Bill left that no else was certain.',
 'Car the down honked the road.',
 'The cake eating was been.',
 'The rained weather.',
 'Said she Moya football liked.',
 'Is from the back that Mary saw the tall man come from the back.',
 'It an is fish alive.',
 'Tony bent the rod.',
 'The magician vanished a air.',
 'Did believe the claim that cheated?',
 'On which I consulted with Committee was this matter.',
 'Lent book all the way to Tony.',
 'We knowing this are theory.',
 'Recounted lilly a story to remember Holly because also had recounted a story to.',
 'Doctor the cured pneumonia Pat from.',
 'Tossed the wall the.',
 'John it convinced be to obvious that left Bill.',
 "I ' I ever met him."]

In [188]:
# find out how many more augmented unacceptable samples we need to create a balanced set
subsample = random.sample(augmented, len(train[train['acceptability'] == 1]) - (len(train[train['acceptability'] == 0]) *2))


len(subsample)

726

In [189]:
# apply random augmentation to each of the subsample
for each in subsample:
  each_aug = augment(each[:-1])
  augmented.append(each_aug[0] + each[-1])

len(augmented)

2764

In [190]:
augmented_dict = {
    'acceptability': 0,
    'sentence': augmented
}

In [191]:
augmented_df = pd.DataFrame(augmented_dict)

augmented_df

Unnamed: 0,acceptability,sentence
0,0,John is not more fellow Bill.
1,0,Took myra ' Betty s snooze.
2,0,Bill left that no else was certain.
3,0,Car the down honked the road.
4,0,The cake eating was been.
...,...,...
2759,0,Marianne left not.
2760,0,More has mary friends two that.
2761,0,The is leaks roof.
2762,0,Expect double to than my.


In [192]:
balanced_train = pd.concat([train,augmented_df])

print("Number of acceptable/unacceptable sentences in balanced_train")
print("-"*60)
balanced_train['acceptability'].value_counts()

Number of acceptable/unacceptable sentences in balanced_train
------------------------------------------------------------


1    4802
0    4802
Name: acceptability, dtype: int64

# Save files

**ALL FILES in /content/drive/MyDrive/266/Data/Clean_Data/CoLA/**
- cola_train_clean: clean unbalanced untokenized train data
- cola_test_clean: clean unbalanced untokenized test data
- cola_tok_train_clean: clean unbalanced tokenized train data
- cola_tok_test_clean: clean unbalanced tokenized test data
- cola_major_annotations: clean major annotations
- cola_minor_annotations: clean minor annotations


**FILES FOR USE WITH MODELS**:
- cola_balanced_train: clean balanced untokenized train data
- cola_validation: clean unbalanced untokenized validation data
- cola_test: clean unbalanced untokenized test data


In [194]:
# For use with models
cola_balanced_train = balanced_train
cola_validation = validation
cola_test = test_cola

# Other
cola_train_and_val_clean = train_cola
cola_tok_train_and_val_clean = tokenized_train_cola
cola_tok_test_clean = test_token_cola
cola_major_annotations = major
cola_minor_annotations = minor

In [195]:
# for use with models
cola_balanced_train.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_balanced_train.csv', index=False)
cola_validation.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_validation.csv', index=False)
cola_test.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_test.csv', index=False)


# others
cola_train_and_val_clean.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_train_clean.csv', index=False)
cola_tok_train_and_val_clean.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_tok_train_clean.csv', index=False)
cola_tok_test_clean.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_tok_test_clean.csv', index=False)
cola_major_annotations.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_major_annotations.csv', index=False)
cola_minor_annotations.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/CoLA/cola_minor_annotations.csv', index=False)