

<!-- ![](https://cdn.dribbble.com/users/2353146/screenshots/9073115/media/eadd1ef050b49fbdf20d42e7c504458d.png)
Image is made by [Алена Молчанова](https://dribbble.com/shots/9073115-WRITING?utm_source=Clipboard_Shot&utm_campaign=mimipig&utm_content=WRITING&utm_medium=Social_Share&utm_source=Clipboard_Shot&utm_campaign=mimipig&utm_content=WRITING&utm_medium=Social_Share) -->

# Here is a baseline notebook for [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021/)


**The notebook outline:** 
1. spacy EDA 
2. Text preprocessing and transforming data to sentences using this great notebook: [📖Feedback- Baseline🤗 Sentence Classifier [0.226]](https://www.kaggle.com/julian3833/feedback-baseline-sentence-classifier-0-226)
3. Train a model with [ULMFiT](https://arxiv.org/abs/1801.06146) using the fastai library for sentence classification
4. Submit




### First we do necessary imports and load the data

In [None]:
from fastai.text import *
from fastai.text.all import *
import nltk
import spacy
import pandas as pd
import numpy as np
import fastai
import torch
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report
tqdm.pandas()
# Report the library version and, when CUDA is available, the GPU in use.
print(f"fastai version: {fastai.__version__}")
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU which is used : {gpu_name}")


## defining directories
# All paths are derived from the current working directory; `out` is created
# on first run and reused afterwards.
root = Path().absolute()
data_dir = root.joinpath("../input/feedback-prize-2021")
train_text_dir = data_dir.joinpath("train")
test_text_dir = data_dir.joinpath("test")
out = root.joinpath("out")
out.mkdir(exist_ok=True)
# data_dir.ls()

In [None]:
# Load the training annotations; the discourse id and the start/end offset
# columns are read as ints (the offsets are later used to slice essay text).
df = pd.read_csv(data_dir / "train.csv", dtype={'discourse_id': int, 'discourse_start': int, 'discourse_end': int})
print('\nHere is the data sample : \n\n')
# Last expression of the cell: display the first two rows.
df.head(2)

### Visualize classes

In [None]:
def visualize_classes(file_name):
    """Render one training essay with its annotated discourse spans highlighted.

    Selects the rows of the global ``df`` whose ``id`` equals ``file_name``,
    reads ``{train_text_dir}/{file_name}.txt`` and passes the text plus the
    (start, end, label) spans to displaCy in manual entity mode.
    """
    essay_rows = df[df['id'] == file_name]
    spans = [
        {'start': int(begin), 'end': int(stop), 'label': kind}
        for begin, stop, kind in zip(
            essay_rows['discourse_start'],
            essay_rows['discourse_end'],
            essay_rows['discourse_type'],
        )
    ]

    with open(f'{train_text_dir}/{file_name}.txt', 'r') as handle:
        essay_text = handle.read()

    # One highlight colour per discourse type.
    palette = {
        'Lead': '#c1dbd5',
        'Position': '#fcf2b6',
        'Claim': '#bbceae',
        'Evidence': '#c8f1bf',
        'Counterclaim': '#d3b88a',
        'Concluding Statement': '#ed9a8b',
        'Rebuttal': '#ef8c9d',
    }
    render_options = {
        "ents": df.discourse_type.unique().tolist(),
        "colors": palette,
    }
    payload = {"text": essay_text, "ents": spans, "title": file_name}

    spacy.displacy.render(payload, style="ent", options=render_options, manual=True, jupyter=True)
    print('\n')

Visualize 4 samples of the data

In [None]:
# Pick four essay ids with a fixed seed and render each of them.
sample_ids = df['id'].sample(n=4, random_state=2).values.tolist()
for essay_id in sample_ids:
    visualize_classes(essay_id)

### preprocessing

Convert the classes into integer ids; sentences without a label are assigned the extra class `No Class`.

In [None]:
# Class names in order of first appearance, with the synthetic 'No Class'
# label appended last; the two dicts are inverse mappings of each other.
class_names = df['discourse_type'].unique().tolist() + ['No Class']
ID2CLASS = {idx: name for idx, name in enumerate(class_names)}
CLASS2ID = {name: idx for idx, name in enumerate(class_names)}
# print(ID2CLASS)
CLASS2ID

And here are some helper functions to transform the data into sentences:
* fill_gaps() labels the parts of a text that have no annotation as "No Class"
* get_elements() creates the list of text sections for a given text
* get_x_samples() maps each sentence of a text to its label.

In [None]:
def get_text(source_dir, a_id):
    """Return the full contents of the essay file ``{source_dir}/{a_id}.txt``.

    Args:
        source_dir: Directory (string or path-like) that holds the essay files.
        a_id: Essay id, i.e. the file name without the ``.txt`` extension.

    Returns:
        The whole file as one string.

    Raises:
        OSError: If the file does not exist or cannot be read.
    """
    a_file = f"{source_dir}/{a_id}.txt"
    # Decode explicitly as UTF-8 so the text (and therefore the character
    # offsets used to slice it) does not depend on the platform's default
    # encoding.
    with open(a_file, "r", encoding="utf-8") as fp:
        return fp.read()

def fill_gaps(elements, text):
    """Return ``elements`` plus "No Class" spans for the unlabeled parts of ``text``.

    Each element is a ``(start, end, class)`` tuple (see get_elements). Filler
    elements are added:

    * before the first element when it does not start at index 0,
    * after the last element when it ends before ``len(text)``,
    * between every pair of neighbouring elements separated by more than one
      index (neighbours where the next start equals the previous end or the
      previous end + 1 are treated as contiguous).

    Args:
        elements: List of ``(start, end, class)`` tuples; it is not modified.
        text: The essay text the offsets refer to; only its length is used.

    Returns:
        A new list with the original and the filler elements, sorted by start.
        When ``elements`` is empty the whole text is returned as one
        "No Class" element.
    """
    final_idx = len(text)

    # Nothing annotated: the entire text is unlabeled.
    if not elements:
        return [(0, final_idx, 'No Class')]

    # Work on a sorted copy so the caller's list is left untouched.
    ordered = sorted(elements, key=lambda x: x[0])
    fillers = []

    # Leading gap, if the first element does not start at index 0.
    first_start = ordered[0][0]
    if first_start > 0:
        fillers.append((0, first_start - 1, 'No Class'))

    # Trailing gap, if the last element stops before the end of the text.
    last_end = ordered[-1][1]
    if last_end < final_idx:
        fillers.append((last_end + 1, final_idx, 'No Class'))

    # Gaps between neighbours. Every adjacent pair is inspected, including the
    # last one, and overlapping neighbours never produce an inverted span.
    for previous, current in zip(ordered, ordered[1:]):
        if current[0] > previous[1] + 1:
            fillers.append((previous[1] + 1, current[0] - 1, 'No Class'))

    return sorted(ordered + fillers, key=lambda x: x[0])


def get_elements(df, text_id, do_fill_gaps=True, text=None):
    """Get a list of (start, end, class) elements for a given text_id.

    Args:
        df: Annotation dataframe with the columns ``id``, ``discourse_start``,
            ``discourse_end`` and ``discourse_type``.
        text_id: Id of the essay whose annotations are wanted.
        do_fill_gaps: When true, the result is passed through fill_gaps so
            unlabeled stretches appear as "No Class" elements.
        text: The essay text. It is only needed for gap filling; when omitted
            it is read from the training directory.

    Returns:
        A list of ``(start, end, class)`` tuples.
    """
    df_text = df[df['id'] == text_id]
    elements = df_text[['discourse_start', 'discourse_end', 'discourse_type']].to_records(index=False).tolist()
    if do_fill_gaps:
        if text is None:
            # get_text takes the source directory as well as the id; use the
            # training directory, as get_x_samples does.
            text = get_text(train_text_dir, text_id)
        elements = fill_gaps(elements, text)
    return elements

def get_x_samples(df, text_id, do_fill_gaps=True):
    """Build the sentence-level dataframe (columns text, label) for one essay.

    The essay is read from the training directory, cut into its
    (start, end, class) elements, and every element is split into sentences
    with nltk. Each sentence inherits the class of its element, mapped to an
    integer id through CLASS2ID.
    """
    essay = get_text(train_text_dir,text_id)
    spans = get_elements(df, text_id, do_fill_gaps, essay)
    rows = [
        (sentence, label)
        for begin, stop, label in spans
        for sentence in nltk.sent_tokenize(essay[begin:stop])
    ]
    frame = pd.DataFrame(rows, columns=['text', 'label'])
    frame['label'] = frame['label'].map(CLASS2ID)
    return frame

In [None]:
# Unique essay ids found in the annotation table, as a plain Python list.
text_ids = df['id'].unique().tolist()

And here we build our sentence dataframe

In [None]:
# One sentence dataframe per essay, stacked into a single table.
x = [get_x_samples(df, text_id) for text_id in tqdm(text_ids)]
all_sentences = pd.concat(x)

# Keep only sentences of at least three whitespace-separated words, then
# persist the result next to the notebook.
word_counts = all_sentences.text.str.split().str.len()
df_sentences = all_sentences[word_counts >= 3]
df_sentences.to_csv("df_sentence.csv", index=False)

Subsample the training set so that the notebook runs faster.

Remove this step when training the model for submission.

In [None]:
# Draw a 3000-sentence random sample with a fixed seed.
# Note: sample() raises if df_sentences holds fewer than 3000 rows.
df_sample = df_sentences.sample(n=3000, random_state=42)
print('class distribution in sample set :')
# Last expression of the cell: number of sampled sentences per label.
df_sample['label'].value_counts()

### Finetune AWD-LSTM on our corpus to train [ULMFIT](https://docs.fast.ai/tutorial.text.html) 

Create [TextDataLoaders](https://docs.fast.ai/text.data.html#TextDataLoaders.from_df) from the sampled sentence dataframe (`df_sample`). Keep in mind that we set `is_lm` to `True` because we will use these DataLoaders to fine-tune our language model.

In [None]:
# Language-model DataLoaders (is_lm=True) built from the sampled sentences.
# NOTE(review): no text_col is passed, so fastai's default text column is
# used — presumably the first column ('text'); confirm.
dls_lm = TextDataLoaders.from_df(df_sample, path=data_dir,is_lm=True)
dls_lm.show_batch(max_n=3)

Then save the DataLoaders for later use.

In [None]:
# Persist the language-model DataLoaders in the output directory.
torch.save(dls_lm, out / 'dls_lm.pkl')
# Uncomment to reload them instead of rebuilding:
# dls_lm = torch.load(out / 'dls_lm.pkl')

Here we instantiate our [language_model_learner](https://docs.fast.ai/text.learner.html#language_model_learner) and enable [mixed precision training](https://docs.fast.ai/callback.fp16.html#Learner.to_fp16) by adding `.to_fp16()` at the end of the call.

In [None]:
# AWD-LSTM language-model learner reporting accuracy and perplexity, with
# weight decay 0.1, converted to mixed precision via to_fp16().
# NOTE(review): model_dir points at /tmp/model/ while path is data_dir —
# presumably because the input directory is not writable; confirm.
learn = language_model_learner(dls_lm, AWD_LSTM, metrics=[accuracy, Perplexity()], path=data_dir, wd=0.1, model_dir="/tmp/model/").to_fp16()

Using learning rate finder of fastai. Here we plot the loss versus the learning rates. We're interested in finding a good order of magnitude of the learning rate, so we plot with a log scale. Then, we choose a value that is approximately in the middle of the sharpest downward slope.

For more information on finding a good learning rate, you can refer to this post: [How do you find a good learning rate](https://sgugger.github.io/how-do-you-find-a-good-learning-rate.html)

In [None]:
# Run fastai's learning-rate finder on the language-model learner.
learn.lr_find()

Next, we finetune the model. By default, a pretrained Learner is in a frozen state, meaning that only the head of the model will train while the body stays frozen.

In [None]:
# First stage: one epoch of one-cycle training with a maximum learning rate
# of 1e-3, run before any unfreezing.
learn.fit_one_cycle(1, 1e-3)

We can then fine-tune the model after unfreezing

In [None]:
# Second stage: unfreeze the whole model and train 7 more epochs at the
# lower maximum learning rate 1e-4.
learn.unfreeze()
learn.fit_one_cycle(7, 1e-4)

Save the encoder for later use in text classification.
Note that:

`
Encoder: The model not including the task-specific final layer(s). It means much the same thing as body when applied to vision CNNs, but tends to be more used for NLP and generative models
`

In [None]:
# Save the fine-tuned encoder under the name 'finetuned' in the output
# directory; the classifier loads it from the same location later on.
learn.save_encoder(out / 'finetuned')

Here we gather our data for text classification almost exactly like before.

The main difference is that we have to use the exact same vocabulary as when we were fine-tuning our language model, or the weights learned won't make any sense. We pass that vocabulary with vocab by adding `text_vocab=dls_lm.vocab`

In [None]:
# Classification DataLoaders over the full sentence table, reusing the
# language model's vocabulary (text_vocab=dls_lm.vocab).
dls_clas = TextDataLoaders.from_df(df_sentences, path=data_dir, text_col='text', label_col='label',text_vocab=dls_lm.vocab)
dls_clas.show_batch(max_n=3)

In [None]:
# Persist the classification DataLoaders in the output directory.
torch.save(dls_clas, out / 'dls_clas.pkl')
# Uncomment to reload them instead of rebuilding:
# dls_clas = torch.load(out / 'dls_clas.pkl')

Then we can define our text classifier like before:

Defining metrics: we use [accuracy](https://docs.fast.ai/metrics.html#accuracy) and [F1Score](https://docs.fast.ai/metrics.html#F1Score)

In [None]:
# Metrics reported during classifier training.
# NOTE(review): for single-label multi-class targets, micro-averaged F1 is
# numerically equal to accuracy; average='macro' would give a distinct signal.
metrics=[accuracy,F1Score(average='micro')]
# AWD-LSTM text classifier; rebinds `learn`, replacing the language-model learner.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=metrics, model_dir="/tmp/model/")

We now load the encoder saved earlier, so the classifier starts from the language-model weights fine-tuned on our corpus. This is why the classifier must use the exact same vocabulary as the language model.

In [None]:
# Load the encoder saved as 'finetuned' after language-model fine-tuning.
learn = learn.load_encoder(out / 'finetuned')

In [None]:
# Run fastai's learning-rate finder on the classifier learner.
learn.lr_find()

In [None]:
# Base learning rate; the later fine-tuning stages scale it down.
lr = 1e-2
# One epoch of one-cycle training before any layer group is unfrozen.
learn.fit_one_cycle(1, lr)

The last step is to train with discriminative learning rates and gradual unfreezing. In computer vision, we often unfreeze the model all at once, but for NLP classifiers, we find that unfreezing a few layers at a time makes a real difference.

Here is a bit of information about using `slice` for the learning rate:

Applying different learning rates to different layer groups is a technique called “discriminative layer training” that is introduced in part 1. This technique is commonly used in both computer vision and natural language processing.
You can find some good information in this blog post: [The 1cycle policy](https://sgugger.github.io/the-1cycle-policy.html#the-1cycle-policy)

Python's built-in `slice()` accepts one to three positional arguments (`stop`; `start, stop`; or `start, stop, step`). Below is a snippet of experiments for your reference:

```python
In [9]: slice(5)
Out[9]: slice(None, 5, None)

In [10]: slice(1, 5)
Out[10]: slice(1, 5, None)
```

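Therefore, a call such as `slice(5e-3/(2.6**4), 5e-3)` creates a slice object with `start = 5e-3/(2.6**4)`, `stop = 5e-3` and `step = None`. When it is passed as the learning rate, `start` is used for the earliest layer group and `stop` for the last one.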

In [None]:
# Gradual unfreezing, step 1: freeze_to(-2) leaves only the last two
# parameter groups trainable. Learning rates range from lr/2.6**4 to lr.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(lr/(2.6**4),lr))

In [None]:
# Gradual unfreezing, step 2: the last three parameter groups are trainable
# and the learning-rate range is halved.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(lr/2/(2.6**4),lr/2))

In [None]:
# Final step: unfreeze everything and train two epochs with the
# learning-rate range reduced to a tenth of the base value.
learn.unfreeze()
learn.fit_one_cycle(2, slice(lr/10/(2.6**4),lr/10))

### Prepare test data

In [None]:
def create_df_test():
    """Build the sentence-level dataframe for the test essays.

    Every ``.txt`` file in ``test_text_dir`` is read, split into sentences
    with nltk, and each sentence receives the running indexes of its
    whitespace-separated words within the essay.

    Returns:
        A DataFrame with one row per sentence and the columns ``id`` (essay
        id), ``text`` (the sentence) and ``ids`` (list of word indexes).
    """
    # Only essay files are considered: entries without the ".txt" suffix
    # (hidden files, sub-directories, ...) would otherwise yield a bogus id
    # once four characters are cut off the name.
    test_ids = [f[:-4] for f in os.listdir(test_text_dir) if f.endswith('.txt')]
    test_data = []
    for test_id in test_ids:
        text = get_text(test_text_dir, test_id)
        sentences = nltk.sent_tokenize(text)
        # Heuristic mapping of the words in each sentence to "word indexes":
        # words are counted consecutively across the sentences of an essay.
        # This is not definitive and might have strong drawbacks and problems.
        idx = 0
        for sentence in sentences:
            n_words = len(sentence.split())
            test_data.append((test_id, sentence, list(range(idx, idx + n_words))))
            idx += n_words
    return pd.DataFrame(test_data, columns=['id', 'text', 'ids'])

In [None]:
# Build the sentence-level test table and preview its first rows.
df_test = create_df_test()
df_test.head()

### Make prediction

In [None]:
def predict(txt):
    """Classify one sentence with the global ``learn`` and return the class id as an int.

    The progress bar and logging of the learner are disabled for the call.
    """
    with learn.no_bar(), learn.no_logging():
        label = learn.predict(txt)[0]
    return int(label)

In [None]:
# Predict a class id for every test sentence (with a progress bar), then
# translate the ids back to class names.
df_test['predictions'] = df_test["text"].progress_apply(predict)

df_test['class'] = df_test['predictions'].map(ID2CLASS)
df_test.head()

Turn the word ids into predictionstring 

In [None]:
# Join each sentence's word indexes into one space-separated string.
df_test['predictionstring'] = df_test['ids'].apply(lambda word_ids: ' '.join(map(str, word_ids)))
df_test.head()

Drop the sentences with label "No Class"

In [None]:
# Keep only the sentences that were assigned a real discourse class.
labeled_rows = df_test['class'] != 'No Class'
df_test = df_test[labeled_rows]
df_test.head()

In [None]:
# submit
# Write the three submission columns to submission.csv without the index.
df_test[['id', 'class', 'predictionstring']].to_csv("submission.csv", index=False)

**References:**

- [Transfer learning in text](https://docs.fast.ai/tutorial.text.html)
- [Efficient multi-lingual language model fine-tuning](https://nlp.fast.ai/)
- [Universal Language Model Fine-tuning for Text Classification](https://arxiv.org/abs/1801.06146)