### Other Notebooks:

<ul>  
    <li>Using the datasets : chaii, mlqa, squad, tamil_xquad </li>
    <li>EDA and data cleaning : <a href="https://www.kaggle.com/kishalmandal/some-eda-and-cleaning-chaii">some eda and cleaning chaii</a> </li>
    <li>Training for 2 epochs : <a href="https://www.kaggle.com/kishalmandal/chaii-fit-2-epochs-mlqa-xquad-chaii/">chaii | FIT - 2 epochs | mlqa, xquad, chaii</a> </li>
    <li>Training for 7 epochs with tamil_xquad: <a href="https://www.kaggle.com/kishalmandal/chaii-fit-7-epochs-extra-tamil-data/">chaii | FIT - 7 epochs | Extra Tamil Data</a> </li>
    <li>Inferencing from 5 folds | fold-0 and fold-1 (7-epochs) | fold-2, fold-3 and fold-4 (2-epochs)| based on cross validation scores and a little bit of experimentation 😜: <a href="https://www.kaggle.com/kishalmandal/5-epochs-infer-combined-model-0-792/">5 epochs | INFER | combined model (0.792)</a></li>
    <li>Tried using a weighted average 😅, with optuna to optimise the weights: <a href="https://www.kaggle.com/kishalmandal/using-optuna-to-optimise-weighted-average/">🔥 Using OPTUNA to optimise weighted average 🔥</a></li>
    
</ul>

### References : 

<ul> 
    <li>For training: <a href="https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-fit">chaii QA - 5 Fold XLMRoberta Torch | FIT</a> by <a href="https://www.kaggle.com/rhtsingh">torch</a></li>
    <li>For Inference: <a href="https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-infer">chaii QA - 5 Fold XLMRoberta Torch | Infer</a> by <a href="https://www.kaggle.com/rhtsingh">torch</a></li>
    <li>For post processing outputs: <a href="https://www.kaggle.com/nbroad/chaii-qa-torch-5-fold-with-post-processing-765">chaii QA-Torch 5 fold with post-processing (.765)</a> by <a href="https://www.kaggle.com/nbroad">Nicholas Broad 🟢</a></li>
</ul>


In [None]:
import os
import pandas as pd 
import plotly.express as px
from tqdm import tqdm

In [None]:
# Read the five fold CSV files. Each frame stays reachable under its own name
# (fold0 ... fold4) and all of them are collected, in fold order, in `folds`.
fold_paths = (
    '../input/cv-folds/cv_fold_0.csv',
    '../input/cv-folds/cv_fold_1.csv',
    '../input/cv-folds/cv_fold_2.csv',
    '../input/cv-folds/cv_fold_3.csv',
    '../input/cv-folds/cv_fold_4.csv',
)
folds = [pd.read_csv(path) for path in fold_paths]
fold0, fold1, fold2, fold3, fold4 = folds

# <strong>Folds Exploration</strong>

In [None]:
# Count Tamil vs. non-Tamil rows in every fold.
# values[i] is [tamil, hindi] for fold i. Exactly as in the row-by-row
# version, any row whose language is not 'tamil' is counted as Hindi.
values = []
for fold in folds:
    # A vectorised comparison replaces the slow Python-level iterrows() loop.
    tamil = int((fold['language'] == 'tamil').sum())
    hindi = len(fold) - tamil
    values.append([tamil, hindi])

In [None]:
# Display the per-fold [tamil, hindi] counts built in the previous cell.
values

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

labels = ['Tamil','Hindi']
fold_titles = ("fold 0", "fold 1", "fold 2", "fold 3", 'fold 4')

# A single row with one 'domain'-type cell per fold: pie traces can only be
# placed in domain subplots.
fig = make_subplots(
    rows=1,
    cols=5,
    specs=[[{'type': 'domain'} for _ in fold_titles]],
    subplot_titles=fold_titles,
)

# Fold i is drawn in column i + 1, using its [tamil, hindi] counts.
for idx, title in enumerate(fold_titles):
    fig.add_trace(
        go.Pie(labels=labels, values=values[idx], name=title),
        1, idx + 1,
    )

# A large `hole` turns every pie into a donut.
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

fig.update_layout(title_text="% Tamil vs % Hindi for each fold", title_x=0.5)
fig.show()

#### Conclusion :
The <strong>Tamil : Hindi</strong> ratio is roughly the same in all the folds. This information might later help us choose different models for the final ensemble.

In [None]:
# Split every fold's answers at a length of 30.
# values2[i] is [more, less] for fold i: answers with len() > 30 vs. the rest
# (len() counts characters, assuming `answer_text` holds strings).
values2 = []
for fold in folds:
    # Series.map(len) applies the same len() call the row-by-row loop used,
    # without the overhead of iterrows().
    answer_lengths = fold['answer_text'].map(len)
    more = int((answer_lengths > 30).sum())
    less = len(fold) - more
    values2.append([more, less])

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

labels = ['>30','<=30']
fold_titles = ("fold 0", "fold 1", "fold 2", "fold 3", 'fold 4')
# Horizontal paper positions used for the per-fold text annotations below.
annotation_x = (0.05, 0.26, 0.5, 0.74, 0.95)

# A single row with one 'domain'-type cell per fold: pie traces can only be
# placed in domain subplots.
fig = make_subplots(
    rows=1,
    cols=5,
    specs=[[{'type': 'domain'} for _ in fold_titles]],
    subplot_titles=fold_titles,
)

# Fold i is drawn in column i + 1, using its [more, less] counts.
# The trace name appears in the hover text (hoverinfo includes "name"), so it
# must be the fold name, not the "GHG/CO2 Emissions" placeholders that were
# left over from the plotly documentation example.
for idx, title in enumerate(fold_titles):
    fig.add_trace(
        go.Pie(labels=labels, values=values2[idx], name=title),
        1, idx + 1,
    )

# A large `hole` turns every pie into a donut.
fig.update_traces(hole=.7, hoverinfo="label+percent+name")

# NOTE(review): make_subplots already stores `subplot_titles` as layout
# annotations, so these entries may restyle those titles rather than place
# text in the donut centres - confirm against the rendered figure.
fig.update_layout(
    title_text="'answer_text' Lengths", title_x=0.5,
    annotations=[dict(text=title, x=x_pos, font_size=20, showarrow=False)
                 for title, x_pos in zip(fold_titles, annotation_x)])
fig.show()

#### Conclusion: 
Most answers have a length of <=30. But to be more confident, we will draw box plots to see the IQR.

In [None]:
# Collect the length of every answer, fold by fold, for the box plots below.
# NOTE: this rebinds `values`, which previously held the per-fold language
# counts; re-run the language-count cell before re-running its pie chart.
# Series.map(len) applies the same len() call as the former iterrows() loop.
values = [fold['answer_text'].map(len).tolist() for fold in folds]

In [None]:
import numpy as np

# One evenly spaced hue per fold. endpoint=False is required: hue 360 is the
# same colour as hue 0, so including the endpoint would give fold 0 and fold 4
# an identical colour.
c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, 5, endpoint=False)]

# One box per fold, drawn from that fold's list of answer lengths.
fig = go.Figure()
for idx, color in enumerate(c):
    fig.add_trace(go.Box(y=values[idx], name='fold ' + str(idx),
                         marker_color=color))

fig.update_layout(
    title_text="answer_text Lengths (Box Plot)", title_x=0.5,)

fig.show()

#### Conclusion from the box plots
The upper fence of the box plots is at around 30, so we take max answer length = 30 during inference.

<hr>

# <strong>Trained Models:</strong>

I have trained 4 different models with different parameters, one of which is public and has an LB score of 0.792.
The notebook links to the training kernels are:
<ul>
    <li>Training for 2 epochs : <a href="https://www.kaggle.com/kishalmandal/chaii-fit-2-epochs-mlqa-xquad-chaii/">chaii | FIT - 2 epochs | mlqa, xquad, chaii</a> </li>
    <li>Training for 7 epochs with tamil_xquad: <a href="https://www.kaggle.com/kishalmandal/chaii-fit-7-epochs-extra-tamil-data/">chaii | FIT - 7 epochs | Extra Tamil Data</a> </li>
</ul>

## <strong>There are 4 models with their cv scores and comparison.</strong> 
<ol>
    <li> Compared the CV results of different folds of a model.</li>
    <li> Compared CV scores of different models w.r.t. their folds.</li>
    <li> Compared the LB score of different folds of the public model (0.792). </li>
    <li> Will compare the LB scores of folds of the other models. </li>
</ol>

## Trying to find out some pattern and choose the best models


<hr>

## Experimental Results

The cross validation scores were calculated on the chaii competition dataset, whereas the model was trained using mlqa and xquad data. The graphs show the CV scores for each fold.

## 1. 5 folds roberta - public highest scoring model (0.792)
#### Link : <a href='https://www.kaggle.com/kishalmandal/5foldsroberta'>5 folds roberta</a>

fold 0: trained with 7 epochs <br>
fold 1: trained with 7 epochs <br>
fold 2: trained with 2 epochs <br>
fold 3: trained with 2 epochs <br>
fold 4: trained with 2 epochs 

#### CV and LB Scores for each fold:

In [None]:
folds_names = ['fold 0', 'fold 1', 'fold 2', 'fold 3', 'fold 4']

# Per-fold scores of the public model: cross-validation and leaderboard.
scores_1 = [0.6337764253683537, 0.6212957259481922, 0.7170776613403889, 0.69876961603419, 0.706597669097669]
lb_scores_1 = [0.720, 0.737, 0.761, 0.760, 0.743]

# Two bar series over the same fold axis, one per kind of score.
cv_bars = go.Bar(x=folds_names, y=scores_1, name='CV Score', marker_color='salmon')
lb_bars = go.Bar(x=folds_names, y=lb_scores_1, name='LB Score', marker_color='indianred')
fig = go.Figure(data=[cv_bars, lb_bars])

# Same bar width for both series; scores are shown on a fixed 0-1 axis.
fig.update_traces(width=0.25)
fig.update_yaxes(range=[0, 1])
fig.update_layout(title_text="CV and LB Scores for each fold (public model)", title_x=0.5,)

fig.show()

## 2. Private-Model-1 (2epochs seed-1)

In [None]:
# Per-fold cross-validation scores of this model.
scores_2 = [0.6600509956671002, 0.6738531212185921, 0.7170776613403889, 0.69876961603419, 0.706597669097669]

folds_names = ['fold 0', 'fold 1', 'fold 2', 'fold 3', 'fold 4']

# Build the bar chart once (the figure used to be created twice with
# identical arguments, the first result being discarded).
fig = px.bar(x=folds_names, y=scores_2,
             labels={'x':'fold','y':'Score'}, width = 1000, height = 600)

# Narrow the bars.
for data in fig.data:
    data["width"] = 0.3

# Scores are shown on a fixed 0-1 axis.
fig.update_yaxes(range=[0, 1])
fig.update_layout(title_text="CV Scores for each fold (model-2)", title_x=0.5,barmode='group')
fig.update_traces(marker_color='salmon')
fig.show()

## 3. Private-Model-2 (2epochs seed-2)

In [None]:
folds_names = ['fold 0', 'fold 1', 'fold 2', 'fold 3', 'fold 4']

# Per-fold cross-validation scores of this model.
scores_3 = [0.68865577621183, 0.6638506299380739, 0.7201259876147768, 0.7022838895036204, 0.6932807807807808]

fig = px.bar(
    x=folds_names,
    y=scores_3,
    labels={'x':'fold','y':'Score'},
    width=1000,
    height=600,
)

# Narrow, salmon-coloured bars on a fixed 0-1 score axis.
fig.update_traces(width=0.3, marker_color='salmon')
fig.update_yaxes(range=[0, 1])
fig.update_layout(title_text="CV Scores for each fold (model-3)", title_x=0.5,)
fig.show()

## 4. Private-Model-3 (7epochs with extra tamil data)

In [None]:
folds_names = ['fold 0', 'fold 1', 'fold 2', 'fold 3', 'fold 4']

# Per-fold cross-validation scores of this model.
scores_4 = [0.7766629297458894, 0.6212957259481922, 0.6281327852516194, 0.67546711509716, 0.7262144992526158]

fig = px.bar(
    x=folds_names,
    y=scores_4,
    labels={'x':'fold','y':'Score'},
    width=1000,
    height=600,
)

# Narrow, salmon-coloured bars on a fixed 0-1 score axis.
fig.update_traces(width=0.3, marker_color='salmon')
fig.update_yaxes(range=[0, 1])
fig.update_layout(title_text="CV Scores for each fold (model-4)", title_x=0.5,)
fig.show()

## 5. Private-Model-4 (2epochs seed-3)

In [None]:
folds_names = ['fold 0', 'fold 1', 'fold 2', 'fold 3', 'fold 4']

# Per-fold cross-validation scores of this model.
scores_5 = [0.7057788810591502, 0.7244207772795216, 0.7250186846038864, 0.7249119154388213, 0.7219223788169976]

fig = px.bar(
    x=folds_names,
    y=scores_5,
    labels={'x':'fold','y':'Score'},
    width=1000,
    height=600,
)

# Narrow, salmon-coloured bars on a fixed 0-1 score axis.
fig.update_traces(width=0.3, marker_color='salmon')
fig.update_yaxes(range=[0, 1])
fig.update_layout(title_text="CV Scores for each fold (model-5)", title_x=0.5,)
fig.show()

<hr>

# Comparing all the models based on folds

In [None]:
# (legend name, per-fold CV scores) for every model, in plotting order.
cv_scores_by_model = [
    ('5-folds-roberta', scores_1),
    ('Private-Model-2', scores_2),
    ('Private-Model-3', scores_3),
    ('Private-Model-4', scores_4),
    ('Private-Model-5', scores_5),
]

# One line per model over the shared fold axis.
fig = go.Figure()
for model_name, model_scores in cv_scores_by_model:
    fig.add_trace(go.Scatter(
        x=folds_names,
        y=model_scores,
        mode='lines+markers',
        name=model_name,
    ))

fig.update_layout(title_text="CV Scores for each fold", title_x=0.5,)
fig.show()

#### Note: the blue and red lines overlap on folds 2, 3 and 4, since 3 folds of the 5foldsroberta model are taken from that model

In [None]:
# Per-fold LB scores are plotted for the public model only.
# TODO: add one trace per private model once their per-fold LB scores exist.
lb_trace = go.Scatter(
    x=folds_names,
    y=lb_scores_1,
    mode='lines+markers',
    name='5-folds-roberta',
)

fig = go.Figure()
fig.add_trace(lb_trace)

fig.update_layout(title_text="LB Scores for each fold", title_x=0.5,)
fig.show()

#### NOTE: Will update soon

<hr>

# Will be updating with LB scores for each fold and each model 🙂

# ❤️ Upvote if you like ❤️
#### It will really motivate me towards making more kernels