# FEEDBACK PRIZE - EDA

If you find this notebook useful, please support it with an upvote 👍

Model Training Reference:
* https://www.kaggle.com/zzy990106/pytorch-ner-infer/notebook

### Import Libraries

In [None]:
import os
import time
import torch
import spacy
import random
import wordcloud
import torch.nn as nn
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import random

import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from sklearn.model_selection import *
from transformers import *

from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer

# Apply the 'ggplot' style sheet to the matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

# Show every column and row, and never truncate cell text, when a frame is displayed.
# The option names are fully qualified on purpose: pandas resolves short patterns
# by regex search, and on pandas >= 1.4 a bare 'max_rows' also matches
# 'styler.render.max_rows', which makes set_option raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [None]:
# Directories holding one .txt file per essay.
train_dir = "../input/feedback-prize-2021/train"
test_dir = "../input/feedback-prize-2021/test"

# os.listdir() returns entries in arbitrary order; sorting makes positional
# look-ups such as train_files[0] reproducible from one run to the next.
train_files = [f"{train_dir}/{name}" for name in sorted(os.listdir(train_dir))]
test_files = [f"{test_dir}/{name}" for name in sorted(os.listdir(test_dir))]

# Discourse annotations for the training essays.
train = pd.read_csv("../input/feedback-prize-2021/train.csv")

In [None]:
# Despite its name, test_df holds the sample submission file; its columns are
# reused later as the column names of the final submission frame.
test_df = pd.read_csv('../input/feedback-prize-2021/sample_submission.csv')
test_df

In [None]:
# Report the size of the annotation table (rows, columns).
print(f'The training set has {train.shape[0]} rows and {train.shape[1]} columns')

In [None]:
# Preview the first three annotation rows.
train.head(3)

**The column descriptions are:**

* id - ID code for essay response
* discourse_id - ID code for discourse element
* discourse_start - character position where discourse element begins in the essay response
* discourse_end - character position where discourse element ends in the essay response
* discourse_text - text of discourse element
* discourse_type - classification of discourse element
* discourse_type_num - enumerated class label of discourse element
* predictionstring - the word indices of the training sample, as required for predictions

In [None]:
# Several rows share one essay id, so count distinct ids rather than rows.
print(f"We have {train['id'].nunique()} essays")

In [None]:
# Print the first training essay. The context manager guarantees the file
# handle is closed once the text has been read.
with open(train_files[0], "r") as essay_file:
    print(essay_file.read())

In [None]:
# Print one test essay (index 4, so at least five test files must exist).
# The context manager guarantees the file handle is closed after reading.
with open(test_files[4], "r") as essay_file:
    print(essay_file.read())

In [None]:
# Missing-value count for every column of the annotation table.
train.isna().sum()

**The 7 different discourse types**

* Lead - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis
* Position - an opinion or conclusion on the main question
* Claim - a claim that supports the position
* Counterclaim - a claim that refutes another claim or gives an opposing reason to the position
* Rebuttal - a claim that refutes a counterclaim
* Evidence - ideas or examples that support claims, counterclaims, or rebuttals.
* Concluding Statement - a concluding statement that restates the claims

In [None]:
# Rows per discourse type. value_counts() tallies every type in a single pass
# and sort_index() orders the bars alphabetically by type name.
type_counts = train["discourse_type"].value_counts().sort_index()
type_names = type_counts.index.tolist()

# One bar (and one legend entry / colour) per discourse type.
fig = px.bar(x=type_names,
             y=type_counts.tolist(),
             color=type_names)

fig.update_xaxes(title="Discourse Type")
fig.update_yaxes(title="Number of Rows")
fig.update_layout(
    showlegend=True,
    # Centre the title horizontally near the top of the figure.
    title={
        'text': 'Discourse Type Distribution ',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()

In [None]:
# Rows per enumerated discourse element label. value_counts() tallies every
# label in a single pass and sort_index() orders the bars by label.
element_counts = train["discourse_type_num"].value_counts().sort_index()
element_names = element_counts.index.tolist()

# One bar (and one legend entry / colour) per enumerated label.
fig = px.bar(x=element_names,
             y=element_counts.tolist(),
             color=element_names)

fig.update_xaxes(title="Discourse Element")
fig.update_yaxes(title="Number of Rows")
fig.update_layout(
    showlegend=True,
    # Centre the title horizontally near the top of the figure.
    title={
        'text': 'Discourse Element Distribution ',
        'y': 0.95,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
)

fig.show()

In [None]:
# Length of each discourse element in characters: end offset minus start offset.
train["discourse_len"] = train["discourse_end"].sub(train["discourse_start"])

# Box plot of the lengths to expose the spread and the outliers.
fig = px.box(train, x="discourse_len")
fig.show()

In [None]:
# Join all annotated discourse texts of an essay into one string and broadcast it
# back to every row of that essay, so rows of the same essay repeat the same value.
train['full_text'] = (
    train
    .groupby('id')['discourse_text']
    .transform(lambda parts: ' '.join(parts))
)

In [None]:
# One entry per distinct joined text, measured in characters.
text_length = train['full_text'].drop_duplicates().map(len)

fig, ax1 = plt.subplots(figsize=(10, 8))
text_length.plot.hist(ax=ax1, color="#120f7a", bins=100)
ax1.set(
    title='Essay Length Distribution',
    xlabel="Essay Length",
    ylabel="Frequency",
)

plt.show()

In [None]:
# One entry per distinct joined text, measured in whitespace-separated words.
word_count = train['full_text'].drop_duplicates().map(lambda text: len(str(text).split()))

fig, ax1 = plt.subplots(figsize=(10, 8))
word_count.plot.hist(ax=ax1, color="#120f7a", bins=100)
ax1.set(
    title='Word Count Distribution',
    xlabel="Word Count",
    ylabel="Frequency",
)

plt.show()

In [None]:
# Build the cloud with the WordCloud class and STOPWORDS set imported at the top
# of the notebook. The result is bound to its own name so the `wordcloud` module
# is not overwritten and the cell can be re-run without an AttributeError.
all_discourse_text = ' '.join(train["discourse_text"])
discourse_cloud = WordCloud(stopwords=STOPWORDS, max_font_size=80, max_words=5000,
                            width=600, height=400,
                            background_color='black').generate(all_discourse_text)

# Draw the image once, with bilinear smoothing and without axis ticks.
fig, ax = plt.subplots(figsize=(14, 10))
ax.imshow(discourse_cloud, interpolation='bilinear')
ax.set_axis_off()
plt.show()

# Modeling

In [None]:
# Central configuration, read by the cells below through CFG['<key>'].
CFG = dict(
    # Cross-validation / reproducibility.
    fold_num=5,
    seed=42,
    # Local checkpoint directory used for both the tokenizer and the model.
    model='../input/roberta-base',
    # Token budget per essay (longer inputs are truncated by the tokenizer call).
    max_len=512,
    # Optimisation settings (presumably carried over from the training notebook).
    epochs=5,
    train_bs=24,
    valid_bs=32,
    lr=2e-5,
    num_workers=0,
    weight_decay=1e-6,
)

In [None]:
def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Sets PYTHONHASHSEED in the environment, seeds Python's ``random``, NumPy
    and PyTorch (CPU plus current/all CUDA devices), and switches cuDNN to
    deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for each library-level generator.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Favour reproducible cuDNN kernels over auto-tuned ones.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Make the run reproducible before any model or data loader is created.
seed_everything(CFG['seed'])

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
# Read every test essay: the file stem becomes the essay id, the content the text.
# The listing is sorted so the row order is reproducible, and each file is opened
# in a context manager so its handle is closed straight after reading.
test_names, test_essays = [], []
for file_name in tqdm(sorted(os.listdir(test_dir))):
    test_names.append(os.path.splitext(file_name)[0])
    with open(os.path.join(test_dir, file_name), 'r') as essay_file:
        test_essays.append(essay_file.read())

# One row per essay; 'text' is stored as a list of whitespace-separated words,
# which is the form the tokenizer is fed downstream.
test_texts = pd.DataFrame({
    'id': test_names,
    'text': [essay.split() for essay in test_essays],
})
test_texts

In [None]:
# Load the tokenizer from the checkpoint directory named in CFG['model'].
# NOTE(review): add_prefix_space=True is presumably required because collate_fn
# feeds pre-split words (is_split_into_words=True) — confirm against the
# RoBERTa tokenizer documentation.
tokenizer = AutoTokenizer.from_pretrained(CFG['model'], add_prefix_space=True)

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that serves the ``text`` column of a DataFrame row by row."""

    def __init__(self, df):
        # Only a reference is kept; nothing is copied or tokenized here.
        self.df = df

    def __len__(self):
        # Number of rows in the wrapped frame.
        return self.df.shape[0]

    def __getitem__(self, idx):
        # Positional lookup into the column's underlying array.
        return self.df['text'].values[idx]

In [None]:
def collate_fn(data):
    """Tokenize a batch of pre-split essays and attach the token-to-word alignment.

    Args:
        data: Batch assembled by the DataLoader — a list with one entry per
            essay, each entry being that essay's list of words.

    Returns:
        The tokenizer output (PyTorch tensors, every sequence padded or
        truncated to CFG['max_len']) with one extra "word_ids" entry: a plain
        Python list holding, for each essay in the batch, the result of
        ``word_ids(batch_index=i)`` — the source-word index of every token
        position, or None where a token maps to no word.
    """
    tokenized_inputs = tokenizer(
        data,
        max_length=CFG['max_len'],
        padding='max_length',
        truncation=True,
        is_split_into_words=True,
        return_tensors='pt'
    )

    # Stored as a list rather than a tensor because the alignment contains None
    # entries; consumers must pop it before moving the batch to a device.
    tokenized_inputs["word_ids"] = [
        tokenized_inputs.word_ids(batch_index=i) for i in range(len(data))
    ]

    return tokenized_inputs

In [None]:
# shuffle=False keeps batches in the row order of test_texts, which the
# post-processing relies on when it pairs predictions with essays by position.
# The worker count comes from CFG so it is set in one place instead of being
# hard-coded here.
test_loader = DataLoader(
    MyDataset(test_texts),
    batch_size=CFG['valid_bs'],
    collate_fn=collate_fn,
    shuffle=False,
    num_workers=CFG['num_workers'],
)

# Pull one batch to sanity-check the collate output.
batch = next(iter(test_loader))

In [None]:
# Display the sample batch produced by collate_fn (tokenizer output plus "word_ids").
batch

In [None]:
# Token-classification head with 15 outputs, matching the 15 entries of the
# `labels` list used to decode the predictions.
model = AutoModelForTokenClassification.from_pretrained(CFG['model'], num_labels=15).to(device)

# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU still loads when the notebook runs on a CPU-only machine.
state_dict = torch.load('../input/feedback-roberta/roberta-base_fold_0.pt', map_location=device)
model.load_state_dict(state_dict)

# Inference mode: disables dropout and similar training-only behaviour.
model.eval()

In [None]:
# Per-essay outputs, accumulated in loader order:
#   y_pred - predicted label index for every token position
#   words  - token-to-word alignment produced by collate_fn
y_pred, words = [], []

progress = tqdm(test_loader, total=len(test_loader), position=0, leave=True)
with torch.no_grad():  # inference only, so no gradient bookkeeping
    for batch in progress:
        words.extend(batch['word_ids'])

        # "word_ids" is a plain list, so it is left out of the device transfer
        # and of the model's keyword arguments.
        model_inputs = {
            key: tensor.to(device)
            for key, tensor in batch.items()
            if key != 'word_ids'
        }
        logits = model(**model_inputs).logits

        # Index of the highest logit along the last axis, one row per essay.
        y_pred.extend(logits.argmax(-1).cpu().numpy())

y_pred = np.array(y_pred)

In [None]:
# Label vocabulary: index 0 is 'O', then a B-/I- pair for each discourse type.
# The position in this list is the class index predicted by the model.
discourse_types = ['Lead', 'Position', 'Claim', 'Counterclaim',
                   'Rebuttal', 'Evidence', 'Concluding Statement']
labels = ['O'] + [f'{prefix}-{name}' for name in discourse_types for prefix in ('B', 'I')]

In [None]:
# A span is only submitted when it is strictly longer than this many words.
MIN_SPAN_WORDS = 10


def _extract_spans(word_classes, min_words=MIN_SPAN_WORDS):
    """Group consecutive identical classes into (class, start, end) spans.

    Args:
        word_classes: One class name per word; 'O' marks words outside any
            discourse element and '' marks words that received no prediction.
        min_words: A run is kept only if it has more than this many words.

    Returns:
        List of (class_name, start, end) tuples with ``end`` exclusive, in
        order of appearance. Runs of 'O' or '' are never returned.
    """
    spans = []
    n_words = len(word_classes)
    start = 0
    while start < n_words:
        cls = word_classes[start]
        end = start + 1
        while end < n_words and word_classes[end] == cls:
            end += 1
        # Every run, 'O' runs included, is consumed as a whole. Advancing past
        # an 'O' separately would leave the following word unexamined and cut
        # the first word off a span that follows a single 'O'.
        if cls not in ('O', '') and end - start > min_words:
            spans.append((cls, start, end))
        start = end
    return spans


final_preds = []

for i in tqdm(range(len(test_texts))):
    idx = test_texts.id.values[i]

    # One slot per word; words that receive no token keep the '' placeholder.
    pred = [''] * len(test_texts.text.values[i])

    # Project token-level predictions onto words. A word split into several
    # tokens ends up with the label of its last token.
    for token_pos, word_pos in enumerate(words[i]):
        if word_pos is not None:
            pred[word_pos] = labels[y_pred[i][token_pos]]

    # Drop the B-/I- prefixes so only the discourse type remains.
    pred = [x.replace('B-', '').replace('I-', '') for x in pred]

    # Emit one row per span: essay id, class, space-separated word indices.
    for cls, start, end in _extract_spans(pred):
        final_preds.append((idx, cls, ' '.join(map(str, range(start, end)))))

In [None]:
# Peek at the second predicted span as an (id, class, word-index string) tuple.
# Raises IndexError when fewer than two spans were produced.
final_preds[1]

In [None]:
# Build the submission with the sample submission's column names. Passing them to
# the constructor also works when final_preds is empty; assigning them afterwards
# would raise a length-mismatch error on the resulting zero-column frame.
sub = pd.DataFrame(final_preds, columns=test_df.columns)
sub

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv', index=False)

Thank you.