### Dataset Overview
<ul>
<li> Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis </li>
<li>Position - an opinion or conclusion on the main question</li>
<li>Claim - a claim that supports the position</li>
<li>Counterclaim - a claim that refutes another claim or gives an opposing reason to the position</li>
<li>Rebuttal - a claim that refutes a counterclaim</li>
<li>Evidence - ideas or examples that support claims, counterclaims, or rebuttals.</li>
<li>Concluding Statement - a concluding statement that restates the claims</li>
</ul>

##### Your task is to predict the quality rating of each discourse element. Human readers rated each rhetorical or argumentative element, in order of increasing quality, as one of:

<ul>
<li>Ineffective</li>
<li>Adequate</li>
<li>Effective</li>
</ul>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
import transformers
import tqdm

In [None]:
# Directory of the pretrained checkpoint; passed to both
# BertTokenizer.from_pretrained and TFBertModel.from_pretrained further down.
# NOTE(review): the path names a DistilBERT checkpoint while the notebook loads
# it with the BERT classes — confirm the vocabulary and weights load as intended.
MODEL_PATH = '../input/huggingface-bert-variants/distilbert-base-cased/distilbert-base-cased'

### Load train & test data

In [None]:
# Competition tables. Later cells read essay_id, discourse_type and
# discourse_effectiveness from the train frame, and discourse_id,
# discourse_type and discourse_text from the test frame.
dataset_tr = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
dataset_te = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/test.csv')

#### Attach the full essay text to the train dataset

In [None]:
def _read_essay(essay_id):
    """Return the full text of one training essay, closing the file afterwards."""
    with open(f'/kaggle/input/feedback-prize-effectiveness/train/{essay_id}.txt') as essay_file:
        return essay_file.read()


# Read each distinct essay exactly once, then broadcast the text back onto the
# rows with map, so rows sharing an essay_id do not trigger repeated file reads.
essay_texts = {essay_id: _read_essay(essay_id) for essay_id in dataset_tr['essay_id'].unique()}
dataset_tr['text'] = dataset_tr['essay_id'].map(essay_texts)
dataset_tr.head(2)

#### Map the labels to numerical categories

In [None]:
# Class index follows increasing quality: Ineffective=0, Adequate=1, Effective=2.
effectiveness_map = {
    rating: class_id
    for class_id, rating in enumerate(("Ineffective", "Adequate", "Effective"))
}
# Ratings that are not keys of the map become NaN in the target column.
dataset_tr["target"] = dataset_tr["discourse_effectiveness"].map(effectiveness_map)

### Load Tokenizer

In [None]:
from transformers import BertTokenizer
# Initialize the tokenizer from the vocabulary stored under MODEL_PATH.
# NOTE(review): MODEL_PATH names a distilbert checkpoint — confirm BertTokenizer
# is the intended tokenizer class for it.
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)

#### Join **discourse_type** and **text** with the tokenizer's *sep_token*

In [None]:
# Model input for BOTH splits is "<discourse_type><sep_token><discourse_text>", so the
# model is trained on the same kind of text it is later asked to score.
# Building the column from discourse_text (never from 'text' itself) also keeps
# this cell idempotent: re-running it cannot prepend the discourse type twice.
# assumes train.csv has a discourse_text column like test.csv — TODO confirm
dataset_tr['text'] = dataset_tr['discourse_type'] + tokenizer.sep_token + dataset_tr['discourse_text']
dataset_te['text'] = dataset_te['discourse_type'] + tokenizer.sep_token + dataset_te['discourse_text']

In [None]:
import numpy as np
# Pre-allocated buffers that encode_data fills in place: one row per training
# example, 256 token positions each. Token ids and mask flags are integers, so
# int32 matches the dtype declared on the Keras Input layers and halves the
# memory of the default float64 arrays.
X_input_ids = np.zeros((len(dataset_tr), 256), dtype=np.int32)
X_attn_masks = np.zeros((len(dataset_tr), 256), dtype=np.int32)

In [None]:
def encode_data(df, ids, masks, tokenizer):
    """Tokenize every entry of ``df['text']`` into the pre-allocated ``ids``/``masks`` arrays.

    Args:
        df: DataFrame with a ``text`` column holding the strings to encode.
        ids: 2-D array with one row per text; row ``i`` receives the token ids
            of the ``i``-th text.
        masks: 2-D array of the same shape; row ``i`` receives the attention mask.
        tokenizer: object exposing ``encode_plus`` whose result has ``input_ids``
            and ``attention_mask`` attributes.

    Returns:
        The same ``ids`` and ``masks`` objects, filled in place.
    """
    # Imported locally so the function does not depend on which notebook cell
    # last bound the name ``tqdm`` (a plain ``import tqdm`` binds the module,
    # which is not callable).
    from tqdm.auto import tqdm

    # Pad/truncate to the width of the destination buffer rather than a
    # hard-coded length, so the two can never disagree.
    max_length = ids.shape[1]

    # Passing ``total`` lets the progress bar show a percentage and an ETA.
    for i, text in enumerate(tqdm(df['text'], total=len(df))):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [None]:
from tqdm.auto import tqdm

# Fill the pre-allocated training buffers; encode_data returns the same arrays it was given.
X_input_ids, X_attn_masks = encode_data(
    df=dataset_tr,
    ids=X_input_ids,
    masks=X_attn_masks,
    tokenizer=tokenizer,
)

#### Prepare labels

In [None]:
# One-hot encode the integer targets: row k of the 3x3 identity matrix is the
# one-hot vector for class k, so indexing it by the target column yields an
# array with one 3-wide row per training example.
class_ids = dataset_tr['target'].values
labels = np.eye(3)[class_ids]

In [None]:
import tensorflow as tf

def DatasetMapFunction(input_ids, attn_masks, labels):
    """Repack one ``(input_ids, attn_masks, labels)`` triple as ``(features, labels)``.

    The feature dict is keyed by ``'input_ids'`` and ``'attention_mask'``, the
    same names given to the Keras ``Input`` layers of the model.
    """
    features = dict(input_ids=input_ids, attention_mask=attn_masks)
    return features, labels

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
# Convert each (ids, mask, label) triple to the ({feature dict}, label) format Keras expects.
dataset = dataset.map(DatasetMapFunction)
# Shuffle ONCE, with a fixed order across iterations. By default tf.data reshuffles
# every time the dataset is iterated, so a later take()/skip() split would pick
# different examples on each pass and validation rows would leak into training.
# A buffer as large as the dataset gives a full uniform shuffle; the seed makes
# the resulting order reproducible.
dataset = dataset.shuffle(len(labels), seed=42, reshuffle_each_iteration=False)
# Batches of 16; the final partial batch is dropped.
dataset = dataset.batch(16, drop_remainder=True)

### Split Dataset into train and validation

In [None]:
p = 0.8
# The dataset is already batched in groups of 16 with the remainder dropped, so it
# holds len(dataset_tr) // 16 batches; the first 80% of them form the training set
# and the rest form the validation set.
total_batches = len(dataset_tr) // 16
train_size = int(total_batches * p)
train_dataset, val_dataset = dataset.take(train_size), dataset.skip(train_size)

### Load Model

In [None]:
from transformers import TFBertModel
# Load the encoder from the checkpoint under MODEL_PATH as a TF BERT model.
# NOTE(review): MODEL_PATH names a distilbert checkpoint — confirm TFBertModel
# loads its weights rather than re-initialising them.
model = TFBertModel.from_pretrained(MODEL_PATH)

In [None]:
# Two int32 inputs of width 256, named after the keys of the feature dict
# produced by DatasetMapFunction.
input_ids, attn_masks = (
    tf.keras.layers.Input(shape=(256,), name=layer_name, dtype='int32')
    for layer_name in ('input_ids', 'attention_mask')
)

In [None]:
# The encoder output is indexed positionally: 0 is the per-token activations (3D),
# 1 is the pooled output (2D), which feeds the classification head.
bert_outputs = model.bert(input_ids, attention_mask=attn_masks)
bert_embds = bert_outputs[1]

# Hidden projection followed by a 3-way softmax that yields class probabilities.
dense_relu = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')
intermediate_layer = dense_relu(bert_embds)
softmax_head = tf.keras.layers.Dense(3, activation='softmax', name='output_layer')
output_layer = softmax_head(intermediate_layer)

In [None]:
# Assemble the end-to-end classifier: token ids + attention mask in, 3 class
# probabilities out. The order of `inputs` is the order expected when predicting
# from a plain list of arrays.
feedback_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
feedback_model.summary()

In [None]:
# Low learning rate for fine-tuning the pretrained encoder.
# NOTE(review): whether Adam still accepts `decay` depends on the installed
# TensorFlow/Keras version — confirm against the runtime.
optimizer = Adam(learning_rate=1e-5, decay=1e-6)

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
feedback_model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizer,
)

In [None]:
# Train for 5 epochs of 200 batches each, evaluating on the validation split after every epoch.
history = feedback_model.fit(train_dataset, validation_data=val_dataset, epochs=5, steps_per_epoch=200)

### Process test dataset

In [None]:
# Encode the test texts with the same helper used for training, so both splits
# are guaranteed to go through identical tokenization settings instead of a
# hand-copied loop. Integer buffers match the model's int32 inputs.
X_test_input_ids = np.zeros((len(dataset_te), 256), dtype=np.int32)
X_test_attn_masks = np.zeros((len(dataset_te), 256), dtype=np.int32)
X_test_input_ids, X_test_attn_masks = encode_data(
    dataset_te, X_test_input_ids, X_test_attn_masks, tokenizer
)

# List order follows the model's `inputs` order: token ids first, attention mask second.
pred_labels = feedback_model.predict([X_test_input_ids, X_test_attn_masks])

### Submission Section

In [None]:
sample_submission = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/sample_submission.csv')
sample_submission['discourse_id'] = dataset_te['discourse_id']

# Prediction column k holds the probability of class k, in the same order as
# effectiveness_map (Ineffective=0, Adequate=1, Effective=2).
for class_idx, column in enumerate(('Ineffective', 'Adequate', 'Effective')):
    sample_submission[column] = pred_labels[:, class_idx]

sample_submission.to_csv("submission.csv", index=False)