# Data Augmentation and Excerpt-Target Correlation Analysis 🔥🔥

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install packages
!pip install plotly statsmodels pandas numpy tokenizers nltk mosestokenizer transformers
!wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt -P data

import nltk
nltk.download('perluniprops')

In [None]:
# DATADIR:  folder under the Kaggle working directory used for downloaded / generated files.
# FILEPATH: location of the competition's training CSV inside the input directory.
DATADIR, FILEPATH = (
    Path("/kaggle/working/data"),
    Path("/kaggle/input/commonlitreadabilityprize/train.csv"),
)

In [None]:
# Read the training CSV into a DataFrame and display its last ten rows.
df = pd.read_csv(filepath_or_buffer=FILEPATH)
df.tail(n=10)

In [None]:
# Character length of every excerpt, computed with the vectorised string accessor
# instead of a Python-level lambda.
X = df.excerpt.str.len()
Y = df.target
# X measures the whole excerpt in characters, so the x-axis is labelled accordingly.
fig = px.scatter(x=X, y=Y, labels={'x': "Excerpt length (characters)", 'y': "Target Value"})
fig.show()

As you can see below, there is a slight downward slope, i.e. as the length of the text increases, the target becomes more negative. This is weird because, as the length of the text increases, it should be more difficult to read.

In [None]:
# Scatter of text length against target with an ordinary-least-squares trendline drawn in red.
# Axis labels are passed explicitly; without them plotly falls back to the bare names "x" and "y".
fig = px.scatter(
    x=X,
    y=Y,
    opacity=0.65,
    trendline='ols',
    trendline_color_override='red',
    labels={'x': "Excerpt length (characters)", 'y': "Target Value"},
)
fig.show()

Let's also compute the correlation between the two. As expected, the correlation is negative.

In [None]:
# Pearson correlation between text length and target (pearson is the method pandas uses by default).
X.corr(Y, method="pearson")

# Trying different tokenizers

### Levenshtein distance

In information theory, linguistics, and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits required to change one word into the other. - *Wikipedia*

In [None]:
from Levenshtein import distance

In [None]:
# Use the excerpt with index label 2 as the running example, lower-cased.
# Line breaks are replaced by a space (not by an empty string) so that the last word of
# one line is not glued to the first word of the next; glued words would distort both the
# tokenisation and the edit distances computed in the following cells.
sample = df.excerpt[2].replace('\n', ' ').lower()
print(sample)

## Tokenize words with BertWordPieceTokenizer

In [None]:
from tokenizers import BertWordPieceTokenizer
# Build a lower-casing WordPiece tokenizer from the downloaded BERT vocabulary file.
tokenizer = BertWordPieceTokenizer('/kaggle/working/data/bert-base-uncased-vocab.txt', lowercase=True)
print(tokenizer)
print()
# Encode the sample and decode the ids again to see how faithful the round trip is.
output = tokenizer.encode(sample)
decoded = tokenizer.decode(output.ids)
# Tidy up the spacing around double quotes; the substitutions are applied in this exact order.
for old, new in (('" ', '"'), ('? "', '?"'), (' "', '"')):
    decoded = decoded.replace(old, new)
print(decoded)

The tokenizer does not seem to recreate exactly the same sentence after encoding and decoding. Is there a bug in the tokenizer?

The distance will be zero only if the two sentences match each other exactly.

But, here it is not zero =(

In [None]:
# Levenshtein distance between the BERT round-trip text and the original sample.
bert_roundtrip_distance = distance(decoded, sample)
bert_roundtrip_distance

## Tokenize words with NLTK word_tokenize

In [None]:
from nltk.tokenize import word_tokenize
from mosestokenizer import MosesDetokenizer
# Split the sample into tokens with NLTK and show the resulting list.
tokens = word_tokenize(text=sample)
print(tokens)

In [None]:
# Naive detokenisation: glue the tokens back together with single spaces.
decoded = str.join(' ', tokens)
print(decoded)

The results are worse than with BertWordPieceTokenizer.

In [None]:
# Levenshtein distance between the space-joined NLTK tokens and the original sample.
nltk_join_distance = distance(decoded, sample)
nltk_join_distance

Trying a detokenizer, but the result is still no better than with BertWordPieceTokenizer.

In [None]:
# Rebuild the text from the NLTK tokens with the Moses detokenizer.
# The detokenizer is used as a context manager so that it is closed again as soon as
# the text has been rebuilt, instead of staying open for the rest of the session.
with MosesDetokenizer() as detokenizer:
    decoded2 = detokenizer(tokens)
# Levenshtein distance between the detokenized text and the original sample.
distance(decoded2, sample)

# Data Augmentation

## Split the data into sentences and create new samples with target +- standard_error

1. Convert sample paragraphs to single sentences.

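2. Suppose target = -0.340259 and standard_error = 0.464009. Create *N* samples with targets drawn from the range [(-0.340259 - 0.464009), (-0.340259 + 0.464009)] while leaving the excerpt the same. (The code below narrows this range by using only 10% of the standard error.)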


In [None]:
from nltk.tokenize import sent_tokenize
from fastprogress import progress_bar
import random

N = 10 # Number of new samples to generate per sentence, each with a target drawn at random around the original target

Here, I'm splitting each paragraph into sentences, but you can also use the whole paragraph.

In [None]:
AUGMENT_SEED = 42      # fixed seed so the generated targets are the same on every run
ERROR_FRACTION = 0.10  # use only 10% of the original error to keep the target range narrow
rng = random.Random(AUGMENT_SEED)  # dedicated generator; the global `random` state is left untouched

# Column name -> list of values; converted to a DataFrame in the next cell.
newdf = {"id": [], "excerpt": [], "target": [], "standard_error": []}
for idx in progress_bar(range(len(df))):
    row = df.iloc[idx]
    sample_id, excerpt, target, standard_error = row.id, row.excerpt, row.target, row.standard_error
    # The sampling window depends only on the row, so it is computed once per excerpt
    # rather than once per sentence.
    frac_error = standard_error * ERROR_FRACTION
    low, high = target - frac_error, target + frac_error
    for sentence in sent_tokenize(excerpt):  # Break paragraph into sentences
        for _ in range(N):  # N random targets inside [target - frac_error, target + frac_error]
            # Every generated row keeps the id and standard_error of the excerpt it came from.
            newdf["id"].append(sample_id)
            newdf["excerpt"].append(sentence)
            newdf["target"].append(rng.uniform(low, high))
            newdf["standard_error"].append(standard_error)

In [None]:
# Turn the column -> list mapping into a DataFrame.
newdf = pd.DataFrame.from_dict(newdf)
# to_csv does not create missing folders, so make sure the output directory exists first.
DATADIR.mkdir(parents=True, exist_ok=True)
# Same destination as before (<DATADIR>/generated_data.csv); the index column is still written.
newdf.to_csv(DATADIR / "generated_data.csv")
print("Number of training examples after augmentation =", newdf.shape[0])

Note that each sample contains a single sentence (not a paragraph).

In [None]:
# Display ten randomly chosen rows of the augmented data.
newdf.sample(n=10)

Let's look at the correlation in the new data.

In [None]:
# Character length of each augmented excerpt versus its generated target, and their correlation.
X = newdf["excerpt"].apply(len)
Y = newdf["target"]
X.corr(Y)

In [None]:
# Scatter of text length against target with an ordinary-least-squares trendline drawn in red.
# Axis labels are passed explicitly; without them plotly falls back to the bare names "x" and "y".
fig = px.scatter(
    x=X,
    y=Y,
    opacity=0.65,
    trendline='ols',
    trendline_color_override='red',
    labels={'x': "Sentence length (characters)", 'y': "Target Value"},
)
fig.show()

**Please let me know in the comments if this information was helpful and the augmentation is useful!**

*If you find this information useful, please <span style="color:green">upvote</span> this notebook.*

**That's all =)**