# Introduction 📝
🎯 **Goal:** To predict answers to real questions about Wikipedia articles. You will use `chaii-1`, a new question answering dataset with question-answer pairs. The dataset covers Hindi and Tamil, collected without the use of translation. It provides a realistic information-seeking task with questions written by native-speaking expert data annotators.

📖 **Data:** 
> **train.csv** - the training dataset
> - ```id``` - a unique identifier
> - ```context```- the text of the Hindi/Tamil sample from which answers should be derived
> - ```question``` - the question, in Hindi/Tamil
> - ```answer_text``` - the answer to the question (note: for test, this is what you are attempting to predict)
> - ```answer_start``` - the starting character in context for the answer
> - ```language``` - whether the text in question is in Tamil or Hindi

> **test.csv** - the test dataset
> - ```id``` - a unique identifier
> - ```context```- the text of the Hindi/Tamil sample from which answers should be derived
> - ```question``` - the question, in Hindi/Tamil
> - ```language``` - whether the text in question is in Tamil or Hindi

> **sample_submission.csv** - the submission format
> - ```id``` - a unique identifier
> - ```PredictionString```- string that best answers the provided question based on the context.


🧪 **Evaluation metric:** [Jaccard Score](https://en.wikipedia.org/wiki/Jaccard_index)
> $$Score = \frac{1}{n} \sum_{i=1}^n jaccard( gt_i, dt_i )$$
> where 
> * $n$ = $\textrm{number of documents}$
> * $jaccard(gt_i, dt_i)$ = $J(gt_i, dt_i) = \frac{|gt_i \cap dt_i|}{|gt_i \cup dt_i|}$
> * $gt_i$ = $\textrm{the } i\textrm{-th ground truth}$
> * $dt_i$ = $\textrm{the } i\textrm{-th prediction}$

# Import libraries 📚

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

from wordcloud import WordCloud
from collections import Counter
from spacy.lang.hi import Hindi
from spacy.lang.ta import Tamil
from spacy.lang.hi import STOP_WORDS as hindi_stopwords
from spacy.lang.ta import STOP_WORDS as tamil_stopwords

from sklearn.feature_extraction.text import CountVectorizer

from typing import Counter
from pandas._typing import FrameOrSeries

In [None]:
# Download the Nirmala UI font archive (quietly), unpack it into the working
# directory, and list the extracted .ttf files.
!wget -q https://www.wfonts.com/download/data/2016/04/29/nirmala-ui/nirmala-ui.zip
!unzip -q nirmala-ui.zip
!ls -lrt *.ttf

In [None]:
# Font handle loaded from the local Nirmala.ttf file (presumably extracted from
# the archive downloaded above -- confirm). plot_top_n applies it to the y tick
# labels of every n-gram chart, not only the Tamil ones.
tamil_font = FontProperties(fname='./Nirmala.ttf')

In [None]:
# Load the competition train/test CSV files and report their (rows, columns) shapes.
train_df = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv")
test_df = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
print(f"Train Shape: {train_df.shape}, Test Shape: {test_df.shape}")

In [None]:
# Preview the first rows of the training data.
train_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

# Language Distribution 🃏

In [None]:
def target_distribution(df: FrameOrSeries, target_column: str) -> None:
    """
    Print and plot how often each value of a column occurs.

    The value counts are printed first, then drawn as a pie chart with a
    white disc over its centre so that it renders as a donut.

    Args:
        df (FrameOrSeries): Data containing ``target_column``
        target_column (str): Name of the column whose values are counted
    """
    counts = df[target_column].value_counts()
    print(f'Distribution: \n\n{counts} \n')

    plt.pie(
        counts.values,
        labels=counts.keys(),
        colors=['#66b3ff', '#ff9999'],
        shadow=True,
        startangle=90,
        autopct='%1.1f%%',
    )

    # A white disc over the middle of the pie turns it into a donut.
    hole = plt.Circle((0, 0), 0.80, fc='white')
    plt.gcf().gca().add_artist(hole)

    plt.title(f'"{target_column}" Distribution')
    plt.show()

In [None]:
# Print the value counts and draw the donut chart of the "language" column
# for the training dataset.

target_distribution(df=train_df, target_column="language")

In [None]:
# Print the value counts and draw the donut chart of the "language" column
# for the test dataset.

target_distribution(df=test_df, target_column="language")

# Missing values ❌

In [None]:
def plot_missing_values(df: FrameOrSeries) -> None:
    """Draw a heatmap showing where values are missing.

    The null mask is transposed before plotting, so each column of ``df``
    becomes one row of the heatmap; the colour bar is hidden.

    Args:
        df (FrameOrSeries): Data to inspect
    """
    null_mask = df.isnull()

    plt.figure(figsize=(10, 8))
    sns.heatmap(null_mask.T, cbar=False)
    plt.yticks(rotation=45)

In [None]:
# Heatmap of null cells in the training data.
plot_missing_values(train_df)

> ***The heatmap is a single uniform dark colour (no cell is flagged as null), so there are no missing values in the train dataset.***

# EDA 📊

In [None]:
# Raw "context" text columns, used as the corpora for the n-gram counts below.
train_corpus = train_df['context']
test_corpus = test_df['context']

# CountVectorizer's default token pattern is r"(?u)\b\w\w+\b". Python's ``\w``
# does not match combining marks (vowel signs, virama, anusvara, ...), so the
# default pattern cuts Devanagari and Tamil words apart at every such mark and
# the resulting "n-grams" are consonant fragments. This pattern keeps those
# marks inside the token while still requiring at least two characters.
INDIC_TOKEN_PATTERN = (
    r"(?u)[\w"
    r"\u0900-\u0903\u093A-\u094F\u0951-\u0957\u0962\u0963"  # Devanagari signs
    r"\u0B82\u0BBE-\u0BCD\u0BD7"                            # Tamil signs
    r"\u200C\u200D"                                         # ZWNJ / ZWJ
    r"]{2,}"
)


def _top_n_ngrams(corpus, n, ngram_range):
    """Shared implementation: count n-grams of the given size range in ``corpus``.

    Returns ``(ngram, total_count)`` pairs sorted by descending count, cut to
    the first ``n`` entries (all entries when ``n`` is None).
    """
    vec = CountVectorizer(ngram_range=ngram_range, token_pattern=INDIC_TOKEN_PATTERN)
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total number of occurrences of each n-gram.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_anigram(corpus, n=None):
    """Return the ``n`` most frequent single words (unigrams) in ``corpus``.

    The misspelled name is kept for existing callers; ``get_top_n_unigram``
    is the same function.

    Args:
        corpus: Iterable of text documents
        n (int, optional): Number of entries to return; None returns all

    Returns:
        list: ``(word, count)`` tuples, most frequent first
    """
    return _top_n_ngrams(corpus, n, (1, 1))


# Correctly spelled alias of get_top_n_anigram.
get_top_n_unigram = get_top_n_anigram


def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences in ``corpus``.

    Args:
        corpus: Iterable of text documents
        n (int, optional): Number of entries to return; None returns all

    Returns:
        list: ``(bigram, count)`` tuples, most frequent first
    """
    return _top_n_ngrams(corpus, n, (2, 2))


def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences in ``corpus``.

    Args:
        corpus: Iterable of text documents
        n (int, optional): Number of entries to return; None returns all

    Returns:
        list: ``(trigram, count)`` tuples, most frequent first
    """
    return _top_n_ngrams(corpus, n, (3, 3))

def plot_top_n(corpus: list, title: str = None):
    """Draw a horizontal bar chart of n-gram frequencies.

    Args:
        corpus (list): ``(ngram, frequency)`` pairs, e.g. the output of the
            ``get_top_n_*`` functions
        title (str): Plot title
    """
    df = pd.DataFrame(corpus, columns=['word', 'freq'])
    plt.figure(figsize=(16, 8))
    # Transparent faces with coloured outlines; the palette supplies the edge colours.
    ax = sns.barplot(x='freq', y='word', data=df, facecolor=(0, 0, 0, 0), linewidth=2,
                     edgecolor=sns.color_palette("ch:start=3, rot=.1", 20))
    # An empty input produces no bar container; skip the value labels then.
    if ax.containers:
        ax.bar_label(ax.containers[0], padding=5)

    plt.title(title)
    plt.xlabel("Frequency")
    # The y-axis lists the n-grams themselves, not a count.
    plt.ylabel("N-gram")
    # Tick labels need a font with Devanagari/Tamil glyphs (module-level tamil_font).
    plt.yticks(fontproperties=tamil_font)
    plt.show()

In [None]:
# Top-20 unigram, bigram and trigram charts for the "context" column of the
# training data.

plot_top_n(get_top_n_anigram(train_corpus, 20), title="Train Top 20 Unigrams")
plot_top_n(get_top_n_bigram(train_corpus, 20), title="Train Top 20 Bigrams")
plot_top_n(get_top_n_trigram(train_corpus, 20), title="Train Top 20 Trigrams")

In [None]:
# Top-20 unigram, bigram and trigram charts for the "context" column of the
# test data.

plot_top_n(get_top_n_anigram(test_corpus, 20), title="Test Top 20 Unigrams")
plot_top_n(get_top_n_bigram(test_corpus, 20), title="Test Top 20 Bigrams")
plot_top_n(get_top_n_trigram(test_corpus, 20), title="Test Top 20 Trigrams")

In [None]:
## WordCloud on "question" variable

# Concatenate all questions of each language into one text per language.
tamil_text = " ".join(train_df[train_df["language"]=="tamil"]["question"])
hindi_text = " ".join(train_df[train_df["language"]=="hindi"]["question"])

# Get the tokens and frequencies for Hindi language.
# Count every occurrence: de-duplicating first would make every count 1.
hindi_nlp = Hindi()
hindi_doc = hindi_nlp(hindi_text)
hindi_tokens_counter = Counter(token.text for token in hindi_doc)
hindi_tokens = set(hindi_tokens_counter)  # distinct tokens, same type as before

# Get the tokens and frequencies for Tamil language.
# Tokenize with the Tamil pipeline (the Hindi one was used here by mistake).
tamil_nlp = Tamil()
tamil_doc = tamil_nlp(tamil_text)
tamil_tokens_counter = Counter(token.text for token in tamil_doc)
tamil_tokens = set(tamil_tokens_counter)  # distinct tokens, same type as before

def plot_wordcloud(frequencies: Counter, stopwords: set, title: str = None):
    """Render a word cloud from token frequencies, leaving out stop words.

    Args:
        frequencies (Counter): Mapping of token -> frequency
        stopwords (set): Tokens to exclude from the cloud; may be None or empty
        title (str): Plot title
    """
    # WordCloud ignores its ``stopwords`` (and ``collocations``) options when
    # built with generate_from_frequencies, so stop words are removed here.
    excluded = set(stopwords or ())
    kept = {word: count for word, count in frequencies.items() if word not in excluded}

    wordcloud = WordCloud(font_path="./Nirmala.ttf",
                          width=400,
                          height=400,
                          background_color="white",
                          min_font_size=7).generate_from_frequencies(kept)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title)
    plt.show()

In [None]:
# Word cloud of the tokens found in the Hindi training questions.
plot_wordcloud(frequencies=hindi_tokens_counter, stopwords=hindi_stopwords, title="Hindi WordCloud")

In [None]:
# Word cloud of the tokens found in the Tamil training questions.
plot_wordcloud(frequencies=tamil_tokens_counter, stopwords=tamil_stopwords, title="Tamil WordCloud")

### **More EDA and Model Coming Soon**

<center> <h4> Please <b><span style="color:red">LIKE</span></b> the Notebook if you like it !! </h4></center>