In [73]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preliminary Preparation

___

## Import Dependencies

In [74]:
# data visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# text-processing libraries
import re
import string
import unicodedata

import nltk
from nltk.corpus import stopwords

## Accelerator Detection

In [75]:
import torch

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

# Data Preparation

---

## Data Extraction

In [76]:
# Read the three competition CSV files into DataFrames ...
train = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/train.csv")
test = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/test.csv")
sample_submission = pd.read_csv("../input/chaii-hindi-and-tamil-question-answering/sample_submission.csv")

# ... then report the (rows, columns) shape of each one.
print('train set shape: ', train.shape)
print('Test set shape: ', test.shape)
print('Sample submission set shape: ', sample_submission.shape)

In [77]:
# Preview the first rows of the training frame.
train.head()

In [78]:
# Print the column names, dtypes and non-null counts of the training frame.
train.info()

In [79]:
# Preview the first rows of the test frame.
test.head()

In [80]:
# Print the column names, dtypes and non-null counts of the test frame.
test.info()

In [81]:
# Preview the first rows of the sample submission frame.
sample_submission.head()

# Regular Text Processing

---

## Text Cleaning

In [82]:
def text_cleaner(text):
    '''
    Normalise a raw string ahead of tokenization.

    The steps run in this order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets;
      3. drop links (``http://``, ``https://`` or ``www.`` prefixes);
      4. drop ``<...>`` tags;
      5. drop ASCII punctuation (``string.punctuation``);
      6. turn newlines into spaces;
      7. drop every word that contains a digit.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed spans are
        kept as they are.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    # NOTE: string.punctuation only holds ASCII symbols, so non-ASCII
    # punctuation passes through this step untouched.
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words: replace it with a space instead of deleting
    # it, otherwise the last word of one line fuses with the first word of
    # the next and the word counts derived from the result are too low.
    text = text.replace('\n', ' ')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

## Word Tokenization

In [83]:
# Every Unicode combining mark (general categories Mn, Mc and Me). `\w` only
# matches alphanumerics and the underscore, so on its own it drops the vowel
# signs and virama that Hindi and Tamil words are written with and breaks
# those words into fragments. Scanning the code-point range takes a moment,
# which is why it is done once here and not inside the function.
_COMBINING_MARKS = ''.join(
    ch for ch in map(chr, range(0x110000))
    if unicodedata.category(ch).startswith('M')
)

# Built once and shared by every call instead of being rebuilt per row.
_WORD_TOKENIZER = nltk.tokenize.RegexpTokenizer(
    r'[\w%s]+' % re.escape(_COMBINING_MARKS)
)


def word_tokenizer(text):
    '''
    Clean a string and re-join its word tokens with single spaces.

    The text is first passed through ``text_cleaner``. A token is a maximal
    run of ``\\w`` characters and Unicode combining marks, so text made only
    of ``\\w`` characters tokenizes exactly as it would with ``\\w+``.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    str
        The tokens joined by single spaces.
    '''
    cleaned = text_cleaner(text)
    tokens = _WORD_TOKENIZER.tokenize(cleaned)
    return ' '.join(tokens)

## Text Processor

In [84]:
# Clean and tokenize every training context, then derive two length features
# from the result: character count and whitespace-separated word count.
train_tokenized = train['context'].map(str).map(word_tokenizer)
train['tokenized_text'] = train_tokenized
train['text_len'] = train_tokenized.astype(str).map(len)
train['text_word_count'] = train_tokenized.map(lambda cleaned: len(str(cleaned).split()))

# Show the raw context next to the derived columns for the first ten rows.
train_preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
train[train_preview_columns].head(10)

In [85]:
# Clean and tokenize every test context, then derive two length features
# from the result: character count and whitespace-separated word count.
test_tokenized = test['context'].map(str).map(word_tokenizer)
test['tokenized_text'] = test_tokenized
test['text_len'] = test_tokenized.astype(str).map(len)
test['text_word_count'] = test_tokenized.map(lambda cleaned: len(str(cleaned).split()))

# Show the raw context next to the derived columns.
test_preview_columns = ['context', 'tokenized_text', 'text_len', 'text_word_count']
test[test_preview_columns]

# Data Visualization

---

## Pie Chart

In [86]:
# Count the non-null values of every column, grouped by language.
language_groups = train.groupby(['language'])
train_groupby = language_groups.count()

train_groupby

In [87]:
# Display the 'pastel' palette; the pie chart cell picks its colours from it by index.
sns.color_palette('pastel')

In [88]:
# Donut chart of the number of training samples per language.

# Fetch the palette once; every colour below is picked from it by index.
pastel = sns.color_palette('pastel')

# text properties shared by the wedge labels and the percentage labels
textprops = dict(
    horizontalalignment = "center",
    verticalalignment = "top",
    rotation = 0,
    rotation_mode = "anchor",
    size = 14,
    color = pastel[-5],
)

# create the figure and its single axes
fig, ax = plt.subplots(figsize=(6, 6))

# draw one wedge per language, sized by the per-language count of `id`
ax.pie(
    x = train_groupby.id,
    labels = train_groupby.index,
    colors = pastel[2 : 3] + pastel[-1 : ],
    autopct = '%.2f%%',
    # One offset per wedge: sized from the data so that the call does not
    # fail when the number of languages is not exactly two.
    explode = [0.02] * len(train_groupby),
    startangle = 90,
    pctdistance = 0.4,
    labeldistance = 1.2,
    textprops = textprops,
)

# configure the legend, whose title doubles as the chart title
legend = ax.legend(
    title = "Distinguish Samples by Language - Pie Chart",
    title_fontsize = 'x-large',
    fontsize = 'large',
    loc = "lower center",
    bbox_to_anchor = (0.25, -0.2, 0.5, 0.5),
    labelcolor = pastel[-5],
    facecolor = '#F6F8ED',
    edgecolor = pastel[1],
)

# recolour the legend title
legend.get_title().set_color(pastel[3])

# a white disc over the centre turns the pie into a donut
centre_circle = plt.Circle((0, 0), 0.70, fc = 'white')
ax.add_artist(centre_circle)

# show the chart
plt.show()

## Histogram

In [89]:
# Display the 'Paired' palette, which the histogram below uses.
sns.color_palette('Paired')

In [96]:
# Distribution of the cleaned context length (in characters), split by language.
# Only `x` is given: histplot bins the column and counts the rows itself.
# Passing `train.text_len.value_counts()` as `y` would supply a Series that is
# indexed by length value rather than by row, and would turn the figure into
# a two-dimensional histogram of mismatched pairs.
ax = sns.histplot(
    data = train,
    x = 'text_len',
    hue = 'language',
    bins = 100,
    palette = 'Paired',
)
ax.set(
    title = 'Cleaned context length by language',
    xlabel = 'text_len (characters)',
    ylabel = 'Count',
);

In [98]:
# Count how many training rows share each distinct text_len value.
train.text_len.value_counts()