In [None]:
import re
import os
import json
import pandas as pd
from pathlib import Path

In [None]:
from plotly import graph_objects as go
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots

In [None]:
from transformers import AutoTokenizer

In [None]:
from collections import defaultdict, Counter

In [None]:
# Raise the per-cell display width to 300 characters so that long text
# columns are not truncated as early when frames are previewed below.
pd.set_option("display.max_colwidth", 300)

In [None]:
# Directory that holds the three competition CSV files read below.
DATA_DIR = Path("/kaggle/input/chaii-hindi-and-tamil-question-answering/")

# Read train, test and sample-submission tables in that order.
train_data, test_data, submission_data = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Preview the first two rows of the training table.
train_data.head(2)

In [None]:
# Preview the first two rows of the test table.
test_data.head(2)

In [None]:
# Number of training rows for each value of the `language` column.
train_data.language.value_counts()

In [None]:
# Number of test rows for each value of the `language` column.
test_data.language.value_counts()

In [None]:
# Language labels used to split the data in the cells below.
languages = ["tamil", "hindi"]


def select_language(x, y):
    """Return the rows of ``x`` whose ``language`` column equals ``y``.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    function carries a real name in tracebacks and can hold this docstring.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame exposing a ``language`` column.
    y : str
        Language label to keep (for example one of ``languages``).

    Returns
    -------
    pandas.DataFrame
        The boolean-mask selection ``x[x.language == y]``; empty when no row
        matches.
    """
    return x[x.language == y]

In [None]:
def clean_data(text):
    """Remove a fixed set of noise patterns from ``text`` and return the result.

    The substitutions run in the order listed below; order matters because
    earlier removals can leave behind the empty parentheses that a later
    rule deletes.
    """
    substitutions = (
        (r"[a-zA-Z_+-]+", ""),    # runs of ASCII letters, '_', '+' and '-'
        (r"\[\d+\]", ""),         # bracketed digits such as "[12]"
        (r"\([\d\s ×]+\)", ""),   # parentheses holding only digits, whitespace or '×'
        (r"\(\s*\)", ""),         # empty or whitespace-only parentheses
        (r"\n", " "),             # each newline becomes a single space
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
# Clean every context string; only the `context` column is rewritten.
train_data["context"] = train_data["context"].apply(clean_data)

In [None]:
# Preview the first two rows again, now that `context` has been cleaned.
train_data.head(2)

The word-cloud visualization script below is copied from [this notebook](https://www.kaggle.com/hoshi7/chaii-interactive-wordclouds?scriptVersionId=72235241&cellId=8). Thanks to [Shivam Ralli](https://www.kaggle.com/hoshi7).

In [None]:
from IPython.display import HTML
import altair as alt
from  altair.vega import v5

In [None]:
# Defining functions for visualizations: 

def pie_plot(labels, values, colors, title):
    """Return a figure-specification dict holding a single pie trace.

    The trace is drawn with a 0.6 hole, occupies the left part of the
    canvas (x domain 0 to 0.48) and shows "percent+label" text. The layout
    carries ``title`` and one empty, arrow-less annotation.

    Parameters
    ----------
    labels : sequence
        Slice labels, stored under the trace's ``labels`` key.
    values : sequence
        Slice values, stored under the trace's ``values`` key.
    colors : sequence
        Slice colours, stored under ``marker.colors``.
    title : str
        Layout title.

    Returns
    -------
    dict
        ``{"data": [trace], "layout": layout}``.
    """
    pie_trace = dict(
        values=values,
        labels=labels,
        domain=dict(x=[0, .48]),
        name="Job Type",
        sort=False,
        marker=dict(colors=colors),
        textinfo="percent+label",
        textfont=dict(color='#FFFFFF', size=10),
        hole=.6,
        type="pie",
    )
    empty_annotation = dict(font=dict(size=25), showarrow=False, text="")
    layout = dict(title=title, annotations=[empty_annotation])
    return dict(data=[pie_trace], layout=layout)

In [None]:
##-----------------------------------------------------------
# This whole section prepares the RequireJS configuration for the Vega
# libraries: the CDN URL of each library and the JavaScript template that
# registers those URLs through requirejs.config.
noext = "?noext"

vega_url = 'https://cdn.jsdelivr.net/npm/vega@{}'.format(v5.SCHEMA_VERSION)
vega_lib_url = 'https://cdn.jsdelivr.net/npm/vega-lib'
vega_lite_url = 'https://cdn.jsdelivr.net/npm/vega-lite@{}'.format(alt.SCHEMA_VERSION)
vega_embed_url = 'https://cdn.jsdelivr.net/npm/vega-embed@3'

# Module name -> URL, each with the "?noext" suffix appended.
paths = {
    module_name: module_url + noext
    for module_name, module_url in (
        ('vega', vega_url),
        ('vega-lib', vega_lib_url),
        ('vega-lite', vega_lite_url),
        ('vega-embed', vega_embed_url),
    )
}

# str.format template: the doubled braces are literal JavaScript braces and
# the single "{}" receives the JSON-encoded `paths` mapping.
workaround = (
    "\n"
    "requirejs.config({{\n"
    "    baseUrl: 'https://cdn.jsdelivr.net/npm/',\n"
    "    paths: {}\n"
    "}});\n"
)

#------------------------------------------------ Defs for future rendering
def add_autoincrement(render_func):
    """Decorate ``render_func`` so repeated renders get distinct element ids.

    The returned wrapper accepts ``(chart, id="vega-chart", autoincrement=True)``
    and calls ``render_func(chart, id=actual_id)``:

    * with ``autoincrement`` true, the first use of an ``id`` passes it
      through unchanged and every later use appends ``-1``, ``-2``, ...;
    * with ``autoincrement`` false, ``id`` is passed through unchanged, but
      it is still registered so a later auto-incremented call gets ``-1``.
    """
    # Base id -> highest suffix handed out so far (0 means "used, unsuffixed").
    suffix_by_id = {}

    def wrapped(chart, id="vega-chart", autoincrement=True):
        if not autoincrement:
            suffix_by_id.setdefault(id, 0)
            return render_func(chart, id=id)

        suffix_by_id[id] = suffix_by_id[id] + 1 if id in suffix_by_id else 0
        suffix = suffix_by_id[id]
        actual_id = id + '-' + str(suffix) if suffix else id
        return render_func(chart, id=actual_id)

    # `suffix_by_id` lives in this closure, so it persists across calls to
    # the wrapper.
    return wrapped
            
@add_autoincrement
def render(chart, id="vega-chart"):
    """Return an ``HTML`` object that embeds ``chart`` with vega-embed.

    ``chart`` is serialised with ``json.dumps`` when it is a dict, otherwise
    through its own ``to_json(indent=None)`` method. The resulting JSON is
    placed in a script that targets a ``<div>`` whose id is ``id``.
    Because of the ``add_autoincrement`` decorator, the public callable also
    accepts an ``autoincrement`` keyword.
    """
    if isinstance(chart, dict):
        spec_json = json.dumps(chart)
    else:
        spec_json = chart.to_json(indent=None)

    # str.format template: doubled braces are literal JavaScript braces.
    chart_str = """
    <div id="{id}"></div><script>
    require(["vega-embed"], function(vg_embed) {{
        const spec = {chart};     
        vg_embed("#{id}", spec, {{defaultStyle: true}}).catch(console.warn);
        console.log("works?");
    }});
    console.log("recheck to see if it works?");
    </script>
    """
    return HTML(chart_str.format(id=id, chart=spec_json))



# Last expression of the cell, so the HTML object becomes the cell output:
# a <script> element holding the requirejs.config call with the JSON-encoded
# `paths` mapping filled in.
HTML("<script>" + workaround.format(json.dumps(paths)) + "</script>")


In [None]:
# Wordcloud function


def word_cloud(df, pixwidth=6000, pixheight=350, column="index", counts="count"):
    """Build a Vega (v5 schema) word-cloud specification from a frequency table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows become the Vega dataset (``to_dict(orient="records")``).
        It is expected to contain the ``column`` and ``counts`` fields.
    pixwidth, pixheight : int
        Canvas size in pixels; also the layout size of the wordcloud transform.
    column : str
        Field holding the word text; also used as the colour-scale key.
    counts : str
        Field holding the frequency. It drives the font size (mapped into
        the 10-60 range) and the hover tooltip.

    Returns
    -------
    dict
        The Vega specification.

    Notes
    -----
    The tooltip previously read ``datum.count`` regardless of ``counts``, so
    any non-default ``counts`` field produced a broken tooltip. It now
    follows ``counts``; with the default value the spec is unchanged.
    """
    data = [dict(name="dataset", values=df.to_dict(orient="records"))]
    wordcloud = {
        "$schema": "https://vega.github.io/schema/vega/v5.json",
        "width": pixwidth,
        "height": pixheight,
        "padding": 0,
        "title": "Hover to see number of occureances from all the sequences",
        "data": data
    }
    scale = dict(
        name="color",
        type="ordinal",
        range=["cadetblue", "royalblue", "steelblue", "navy", "teal"]
    )
    mark = {
        "type": "text",
        "from": dict(data="dataset"),
        "encode": dict(
            enter=dict(
                text=dict(field=column),
                align=dict(value="center"),
                baseline=dict(value="alphabetic"),
                fill=dict(scale="color", field=column),
                # Read the same field that sizes the words, not a fixed "count".
                tooltip=dict(signal="datum.{} + ' occurrances'".format(counts))
            )
        ),
        "transform": [{
            "type": "wordcloud",
            "text": dict(field=column),
            "size": [pixwidth, pixheight],
            "font": "Helvetica Neue, Arial",
            "fontSize": dict(field="datum.{}".format(counts)),
            "fontSizeRange": [10, 60],
            "padding": 2
        }]
    }
    wordcloud["scales"] = [scale]
    wordcloud["marks"] = [mark]

    return wordcloud



def wordcloud_create(df, field):
    """Render a word cloud of the 300 most frequent tokens in ``df[field]``.

    Each entry of ``df[field]`` is iterated as a collection of tokens. All
    tokens are counted, the number of distinct tokens is printed, and the
    300 most common ones are handed to ``word_cloud`` (900x600 pixels) and
    then to ``render``.

    Returns whatever ``render`` returns.
    """
    token_counts = Counter(
        token
        for tokens in df[field].values.tolist()
        for token in tokens
    )
    print("Number of distinct tokens: ", len(token_counts))

    # Series index = token, value = frequency; reset_index() then yields the
    # "index" and "count" columns that word_cloud reads by default.
    top_tokens = pd.Series(dict(token_counts.most_common(300)))
    frequency_table = top_tokens.to_frame(name="count").reset_index()
    return render(word_cloud(frequency_table, pixheight=600, pixwidth=900))

## Evaluation against different pretrained tokenizers

In [None]:
def analyse_tokenizer(tokenizer_path, train_data):
    """Tokenize every context with a pretrained tokenizer and plot statistics.

    Parameters
    ----------
    tokenizer_path : str
        Name or path passed to ``AutoTokenizer.from_pretrained``.
    train_data : pandas.DataFrame
        Frame with ``context`` and ``language`` columns. It is modified in
        place: a ``tokens`` column (token list per row) and a ``num_tokens``
        column (its length) are added, so callers can reuse them afterwards.

    Side effects
    ------------
    * Shows one figure with a token-count histogram per language.
    * For each language, prints the number of distinct tokens and shows a
      bar chart of the 60 most frequent ones.

    Changes from the previous version: the per-language print and bar-chart
    title now name the language (they were indistinguishable before), the
    histogram x-axis is labelled as tokens because it plots ``num_tokens``,
    and a language with no tokens no longer crashes on ``zip(*[])`` unpacking.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Written onto the caller's frame on purpose (see docstring).
    train_data['tokens'] = train_data['context'].apply(tokenizer.tokenize)
    train_data["num_tokens"] = train_data['tokens'].apply(len)

    languages = ["tamil", "hindi"]
    # Select each language subset once and reuse it for both plot families.
    per_language = [
        (lang, train_data[train_data.language == lang]) for lang in languages
    ]

    fig = make_subplots(rows=1, cols=len(languages),
                        x_title="Number of tokens", y_title="Number of context")
    for col, (lang, subset) in enumerate(per_language, start=1):
        fig.add_trace(
            go.Histogram(
                x=list(subset.num_tokens),
                name=lang.upper()
            ),
            row=1,
            col=col,
        )
    fig.update_layout(title="Distribution of Sequence length", title_x=0.5)
    fig.show()

    for lang, subset in per_language:
        # Find number of distinct tokens
        words_freq = Counter(token for sample in subset["tokens"] for token in sample)
        print("Number of distinct tokens ({}): ".format(lang), len(words_freq))

        if not words_freq:
            # Nothing to plot; unpacking zip(*[]) would raise ValueError.
            continue

        # Plot the most frequent tokens for this language
        x, y = zip(*words_freq.most_common(60))
        fig = go.Figure()
        fig.add_trace(go.Bar(x=x, y=y))
        fig.update_layout(
            title="Frequent words distribution ({})".format(lang.upper()),
            title_x=0.5,
            xaxis_title="Tokens",
            yaxis_title="Frequency",
        )
        fig.show()

In [None]:
# Work on a copy: analyse_tokenizer adds `tokens` and `num_tokens` columns
# to the frame it receives, and each tokenizer needs its own set.
xlmr_train_data = train_data.copy()
analyse_tokenizer("deepset/xlm-roberta-large-squad2", xlmr_train_data)

In [None]:
# Word cloud of the most frequent XLM-RoBERTa tokens in the Hindi contexts.
wordcloud_create(select_language(xlmr_train_data, "hindi"), "tokens")

In [None]:
# Word cloud of the most frequent XLM-RoBERTa tokens in the Tamil contexts.
wordcloud_create(select_language(xlmr_train_data, "tamil"), "tokens")

In [None]:
# Work on a copy: analyse_tokenizer adds `tokens` and `num_tokens` columns
# to the frame it receives, and each tokenizer needs its own set.
muril_train_data = train_data.copy()
analyse_tokenizer("google/muril-base-cased", muril_train_data)

In [None]:
# Word cloud of the most frequent MuRIL tokens in the Tamil contexts.
wordcloud_create(select_language(muril_train_data, "tamil"), "tokens")

In [None]:
# Word cloud of the most frequent MuRIL tokens in the Hindi contexts.
wordcloud_create(select_language(muril_train_data, "hindi"), "tokens")

In [None]:
# Work on a copy: analyse_tokenizer adds `tokens` and `num_tokens` columns
# to the frame it receives, and each tokenizer needs its own set.
indicbert_train_data = train_data.copy()
analyse_tokenizer("ai4bharat/indic-bert", indicbert_train_data)

In [None]:
# Word cloud of the most frequent IndicBERT tokens in the Tamil contexts.
wordcloud_create(select_language(indicbert_train_data, "tamil"), "tokens")

In [None]:
# Word cloud of the most frequent IndicBERT tokens in the Hindi contexts.
wordcloud_create(select_language(indicbert_train_data, "hindi"), "tokens")

### Conclusion
Although XLM-RoBERTa's vocabulary has to cover around 100 languages, it produces more distinct tokens on this data than IndicBERT, which is trained on Indian languages.
MuRIL gives richer tokens for these contexts, but it also produces more unknown words.