# chaii ☕ - Hindi and Tamil Question Answering

Hindi happens to be the most common language in India, which is the 2nd most populated country in the world. Tamil is not far behind Hindi, thanks to Mr Rajinikanth 🤩. Building a QA model for Indian languages is not as easy as building one for English. So let's give it a try.

Our task is to build a QA model which answers the questions given in either Hindi or Tamil.

In [None]:
import os 
import string
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from spacy.lang.hi import Hindi
from spacy.lang.ta import Tamil
from spacy.lang.hi import STOP_WORDS as hindi_stopwords
from spacy.lang.ta import STOP_WORDS as tamil_stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from collections import defaultdict
from collections import  Counter
# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')
# English stop-word set from NLTK.
# NOTE(review): `stop` is not referenced anywhere else in the visible notebook — confirm it is still needed.
stop=set(stopwords.words('english'))
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import torch
import torch.nn as nn 
from torch.optim import Adam,AdamW
from torch.utils.data import SequentialSampler
from torch.utils.data import Dataset,DataLoader
import transformers
from transformers import XLMRobertaTokenizer,XLMRobertaModel,AutoTokenizer,XLMRobertaModel,XLMRobertaConfig
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, AutoTokenizer, default_data_collator

In [None]:
# Directory holding the competition CSV files (the trailing slash is part of the prefix).
DATA_DIR = '../input/chaii-hindi-and-tamil-question-answering/'

# Read both splits with the same path template, train first, then test.
train_df, test_df = (
    pd.read_csv(f'{DATA_DIR}{split_name}.csv') for split_name in ('train', 'test')
)

# Show the first rows of the test split.
test_df.head()

In [None]:
# Report the size of each split; the second line describes the test frame,
# so its label must say "test" rather than "train".
print(f'There are {train_df.shape[0]} rows and {train_df.shape[1]} columns in train')
print(f'There are {test_df.shape[0]} rows and {test_df.shape[1]} columns in test')

Class Distribution

Before we begin with anything else, let's check the class distribution. There are only two classes: Hindi and Tamil.

In [None]:
# Number of training samples per language.
x = train_df.language.value_counts()
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally,
# while the keyword form works on older releases too.
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

This clearly indicates that there are about twice as many Hindi samples as Tamil samples, so we need to be careful with evaluation metrics.

## Basic EDA
Now let's do a character/word-level analysis of the context, question and answer text to understand their typical lengths.

In [None]:
# Character-level analysis of context: one histogram of string lengths per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Characters')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'context'].str.len()
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='red')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'context'].str.len()
ax2.set_title('Tamil')
ax2.hist(traint_len, color='green')

plt.show()

In [None]:
# Character-level analysis of questions: one histogram of string lengths per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Characters')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'question'].str.len()
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='red')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'question'].str.len()
ax2.set_title('Tamil')
ax2.hist(traint_len, color='green')

plt.show()

In [None]:
# Character-level analysis of answers: one histogram of string lengths per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Characters')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'answer_text'].str.len()
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='red')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'answer_text'].str.len()
ax2.set_title('Tamil')
ax2.hist(traint_len, color='green')

plt.show()

In [None]:
# Word-level analysis of context: histogram of whitespace-separated token counts per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Word Level')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'context'].str.split().map(len)
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='blue')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'context'].str.split().map(len)
ax2.set_title('Tamil')
ax2.hist(traint_len, color='orange')

plt.show()

In [None]:
# Word-level analysis of questions: histogram of whitespace-separated token counts per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Word Level')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'question'].str.split().map(len)
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='blue')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'question'].str.split().map(len)
ax2.set_title('Tamil')
ax2.hist(traint_len, color='orange')

plt.show()

In [None]:
# Word-level analysis of answers: histogram of whitespace-separated token counts per language.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
fig.suptitle('Word Level')

# Left panel: Hindi rows.
trainh_len = train_df.loc[train_df['language'].eq('hindi'), 'answer_text'].str.split().map(len)
ax1.set_title('Hindi')
ax1.hist(trainh_len, color='blue')

# Right panel: Tamil rows.
traint_len = train_df.loc[train_df['language'].eq('tamil'), 'answer_text'].str.split().map(len)
ax2.set_title('Tamil')
ax2.hist(traint_len, color='orange')

plt.show()

## Data Cleaning

From a quick data investigation we can see there is punctuation and some English words (e.g. Google) within the context and questions, so let's clean that up.

In [None]:
# Let's take a sample from the test data: the first 1000 characters
# of the context in the row labelled 1.
test_df.loc[1,'context'][:1000]

The above context contains a lot of noise like English words, special characters, index ...etc. We can just clean them up

In [None]:
def cleaner(text):
    """Remove ASCII punctuation and English-looking words from *text*.

    Every character in ``string.punctuation`` is deleted first. The result is
    then split on whitespace, and each word whose first character matches the
    case-insensitive pattern ``[A-Z]`` is dropped. The surviving words are
    joined back together with single spaces.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = []
    for word in without_punctuation.split():
        # re.match anchors at the start, so only the leading character(s) decide.
        if re.match(r'[A-Z]+', word, re.I) is None:
            kept_words.append(word)
    return ' '.join(kept_words)
# Preview the first 1000 characters of the cleaned context of the test row labelled 1.
cleaner(test_df.loc[1,'context'])[:1000]

This looks much better :) 
Now we can clean up all the relevant contents like context, questions, answer_text ..etc

In [None]:
# Run the cleaner over every text column, train first, then test.
# Only the train frame gets 'answer_text' cleaned.
for frame, text_columns in (
    (train_df, ('context', 'question', 'answer_text')),
    (test_df, ('context', 'question')),
):
    for column in text_columns:
        frame.loc[:, column] = frame.loc[:, column].apply(cleaner)

In [None]:
# Show the first rows of the test frame.
test_df.head()


## WordCloud

Now lets look at the word cloud

Thanks to @aakashnain for reference


In [None]:
# Join all questions of each language into one space-separated string.
tamil_text = " ".join(train_df.loc[train_df["language"].eq("tamil"), "question"])
hindi_text = " ".join(train_df.loc[train_df["language"].eq("hindi"), "question"])

In [None]:
# Download and extract the fonts
# The word-cloud cells later pass "Devanagari/kalimati.ttf" and
# "Tamil/Samyak-Tamil.ttf" as font paths; presumably these archives unpack
# into those folders — TODO confirm. -q / -qq keep wget and unzip quiet.
!wget -q http://www.lipikaar.com/sites/www.lipikaar.com/themes/million/images/support/fonts/Devanagari.zip
!wget -q http://www.lipikaar.com/sites/www.lipikaar.com/themes/million/images/support/fonts/Tamil.zip

!unzip -qq Devanagari.zip
!unzip -qq Tamil.zip

In [None]:
# Get the tokens and frequencies for Hindi language.
# Count every token occurrence: counting a de-duplicated set would give each
# token a frequency of 1 and flatten the word cloud.
hindi_nlp = Hindi()
hindi_doc = hindi_nlp(hindi_text)
hindi_tokens_counter = Counter(token.text for token in hindi_doc)
# Unique vocabulary, kept as a set as before.
hindi_tokens = set(hindi_tokens_counter)


# Get the tokens and frequencies for Tamil language.
# The Tamil text must go through the Tamil pipeline, not the Hindi one.
tamil_nlp = Tamil()
tamil_doc = tamil_nlp(tamil_text)
tamil_tokens_counter = Counter(token.text for token in tamil_doc)
tamil_tokens = set(tamil_tokens_counter)

In [None]:
def plot_wordcloud(
    font_path,
    frequencies,
    stopwords,
    width=500,
    height=500,
    background_color="white",
    collocations=True,
    min_font_size=8,
):
    """Render a word cloud from a token -> frequency mapping and show it.

    Args:
        font_path: Path to the font file handed to ``WordCloud``.
        frequencies: Mapping of token text to its count.
        stopwords: Collection of tokens to leave out of the cloud (may be
            ``None`` or empty to keep everything).
        width: Canvas width passed to ``WordCloud``.
        height: Canvas height passed to ``WordCloud``.
        background_color: Background colour passed to ``WordCloud``.
        collocations: Passed through to ``WordCloud`` unchanged.
        min_font_size: Smallest font size passed to ``WordCloud``.
    """
    # WordCloud ignores its `stopwords` argument when the cloud is built with
    # generate_from_frequencies(), so the stop words are removed here instead.
    excluded = set(stopwords) if stopwords else set()
    filtered_frequencies = {
        token: count for token, count in frequencies.items() if token not in excluded
    }

    wordcloud = WordCloud(font_path=font_path,
                      width=width,
                      height=height,
                      background_color=background_color,
                      stopwords=stopwords,
                      collocations=collocations,
                      min_font_size=min_font_size).generate_from_frequencies(filtered_frequencies)

    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

In [None]:
# Plot the wordcloud for the Hindi language
plot_wordcloud(font_path="Devanagari/kalimati.ttf",
               frequencies=hindi_tokens_counter,
               stopwords=hindi_stopwords
              )

In [None]:
# Plot the wordcloud for the Tamil language
plot_wordcloud(font_path="Tamil/Samyak-Tamil.ttf",
               frequencies=tamil_tokens_counter,
               stopwords=tamil_stopwords,
              )

# QA Model using XLM-RoBERTa

In [None]:
class ChaiiQAModel(nn.Module):
    """Span-extraction QA head on top of a pretrained XLM-RoBERTa encoder.

    The encoder's last hidden state goes through dropout and a linear layer
    with two outputs per position; the two outputs are returned separately as
    start and end logits.

    Args:
        model_path: Directory or identifier given to ``from_pretrained`` for
            both the config and the encoder weights.
        dropout: Dropout probability applied to the encoder output.
    """

    def __init__(self, model_path='../input/xlm-roberta-base', dropout=0.2):
        super().__init__()
        self.model_config = XLMRobertaConfig.from_pretrained(model_path)
        # Dict-style outputs so forward() can look up 'last_hidden_state' by name.
        self.model_config.return_dict = True
        self.model = XLMRobertaModel.from_pretrained(model_path, config=self.model_config)
        self.dropout = nn.Dropout(dropout)
        # Two outputs per position: index 0 -> start logit, index 1 -> end logit.
        self.fc = nn.Linear(self.model_config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """Return ``(start_logits, end_logits)`` for the given batch.

        Args:
            input_ids: Token ids passed to the encoder
                (presumably shaped (batch, seq_len) — TODO confirm with caller).
            attention_mask: Attention mask passed to the encoder alongside
                ``input_ids``.

        Returns:
            Two tensors with the encoder output's shape minus its last
            (hidden) dimension.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output['last_hidden_state']
        logits = self.fc(self.dropout(last_hidden_state))
        # Split the size-2 last dimension into two size-1 tensors, then drop that axis.
        start_logits, end_logits = logits.split(1, dim=-1)
        return start_logits.squeeze(-1), end_logits.squeeze(-1)

In [None]:
# Build the QA model; its constructor loads the config and encoder from '../input/xlm-roberta-base'.
model = ChaiiQAModel()
# Bare expression so the notebook displays the module's repr.
model

## Coming Soon