## EDA ##

Exploratory data analysis (EDA) is an approach to analyze data sets to summarize their main characteristics, mainly with visual methods.

In [None]:
#Import necessary libraries
import pandas as pd
import numpy as np

In [None]:
# Read the training labels and the component lookup table from disk
dataset = pd.read_csv('/content/train.csv')
class_map = pd.read_csv('/content/class_map.csv')

# Break every grapheme string into a tuple of its individual characters
dataset['base_graphemes'] = dataset['grapheme'].map(tuple)
dataset.head()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,base_graphemes
0,Train_0,15,9,5,ক্ট্রো,"(ক, ্, ট, ্, র, ো)"
1,Train_1,159,0,0,হ,"(হ,)"
2,Train_2,22,3,5,খ্রী,"(খ, ্, র, ী)"
3,Train_3,53,2,2,র্টি,"(র, ্, ট, ি)"
4,Train_4,71,9,5,থ্রো,"(থ, ্, র, ো)"


There are 168 grapheme roots, 11 vowel diacritics and 7 consonant diacritics, giving 168 × 11 × 7 = 12,936 theoretically possible grapheme combinations, but only 1,295 unique graphemes occur in the ~200k-image (200,840) training dataset.

In [None]:
# Number of distinct values in every column
unique = dataset.nunique()
unique

image_id               200840
grapheme_root             168
vowel_diacritic            11
consonant_diacritic         7
grapheme                 1295
base_graphemes           1295
dtype: int64

In [None]:
# Source: https://github.com/JamesMcGuigan/kaggle-digit-recognizer/blob/master/src/utils/confusion_matrix.py
from typing import Union

import pandas as pd
from pandas.io.formats.style import Styler


def combination_matrix(dataset: pd.DataFrame, x: str, y: str, z: str,
                       format=None, unique=True) -> Union[pd.DataFrame, Styler]:
    """
    Returns a combination matrix, showing all valid combinations between three DataFrame columns.
    Sort of like a heatmap, but returning lists of (optionally) unique values

    :param dataset: The dataframe to create a combination_matrx from
    :param x: column name to use for the X axis
    :param y: column name to use for the Y axis
    :param z: column name to use for the Z axis (values that appear in the cells)
    :param format: '', ', '-', ', '\n'    = format value lists as "".join() string
                    str, bool, int, float = cast value lists
    :param unique:  whether to return only unique values or not - eg: combination_matrix(unique=False).applymap(sum)
    :return: returns nothing
    """
    unique_y = sorted(dataset[y].unique())
    combinations = pd.DataFrame({
        n: dataset.where(lambda df: df[y] == n)
            .groupby(x)[z]
            .pipe(lambda df: df.unique() if unique else df )
            .apply(list)
            .apply(sorted)
        for n in unique_y
    }).T

    if isinstance(format, str):
        combinations = combinations.applymap(
            lambda cell: f"{format}".join([str(value) for value in list(cell) ])
            if isinstance(cell, list) else cell
        )
    if format == str:   combinations = combinations.applymap(lambda cell: str(cell)      if isinstance(cell, list) and len(cell) > 0 else ''     )
    if format == bool:  combinations = combinations.applymap(lambda cell: True           if isinstance(cell, list) and len(cell) > 0 else False  )
    if format == int:   combinations = combinations.applymap(lambda cell: int(cell[0])   if isinstance(cell, list) and len(cell)     else ''     )
    if format == float: combinations = combinations.applymap(lambda cell: float(cell[0]) if isinstance(cell, list) and len(cell)     else ''     )

    combinations.index.rename(y, inplace=True)
    combinations.fillna('', inplace=True)
    if format == '\n':
        return combinations.style.set_properties(**{'white-space': 'pre-wrap'})  # needed for display
    else:
        return combinations  # Allows for subsequent .applymap()

However, not all 12,936 combinations can occur: some are never used in practice, and others are impossible to pronounce.
The vowel and consonant diacritics combine in the following patterns:

1. Vowel #0 and consonant #0 can combine with everything

2. Vowels #3, #5, #6, #8 only combine with some of the consonants

3. Consonant #3 only combines with vowel #0

4. Consonant #6 only combines with vowels #0 and #1

In [None]:
# Keep duplicates in every cell, then measure the cells to get sample counts
combination_matrix(
    dataset,
    x='consonant_diacritic',
    y='vowel_diacritic',
    z='consonant_diacritic',
    unique=False,
).applymap(len)

consonant_diacritic,0.0,1.0,2.0,3.0,4.0,5.0,6.0
vowel_diacritic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,23960,768,6262,619,5413,4180,306
1,18799,2843,3838,0,6573,3752,1081
2,17449,464,3764,0,1255,3035,0
3,11391,0,2290,0,0,2471,0
4,11832,1215,1563,0,2206,2032,0
5,3794,0,297,0,784,422,0
6,3873,0,463,0,0,0,0
7,16991,1197,3778,0,4072,2685,0
8,3210,0,0,0,167,151,0
9,10727,774,1210,0,800,2521,0


Some patterns between grapheme roots and vowel/consonant diacritics can also be seen in the output below:

1. Vowel #0 and consonant #0 combine with (nearly) everything

2. All roots combine with consonant #0

3. Several roots do NOT combine with vowel #0 = [26, 28, 33, 34, 73, 82, 108, 114, 126, 152, 157, 158, 163]

4. Several roots combine with ALL vowels = [13, 23, 64, 72, 79, 81, 96, 107, 113, 115, 133, 147]

5. Only root #107 combines with ALL consonants

In [None]:
def _diacritics_per_root(diacritic):
    """Return a one-row frame: one column per grapheme_root, each cell the sorted list of `diacritic` values seen with it."""
    return dataset.groupby('grapheme_root')[diacritic].unique().apply(sorted).to_frame().T


def _summarise_root_table(root_table, sizes, n_diacritics, unique_key):
    """
    Summarise how many distinct diacritics every grapheme root combines with.

    :param root_table: one-row frame as produced by `_diacritics_per_root`
    :param sizes: array with the number of distinct diacritics per root
    :param n_diacritics: total number of distinct diacritics of this type
    :param unique_key: dictionary key under which `n_diacritics` is reported
    :return: dict of summary statistics plus the roots that never / always combine
    """
    # Series mapping the real grapheme_root label to its list of diacritics. Using
    # the labels (rather than enumerate() positions) keeps the reported root ids
    # correct even if the labels are not exactly 0..k-1.
    per_root = root_table.iloc[0]
    return {
        "mean":   sizes.mean(),
        "median": np.median(sizes),
        "min":    sizes.min(),
        "max":    sizes.max(),
        unique_key: n_diacritics,
        "root_combine_0": sum(0 in combined for combined in per_root),
        "unique_roots":   unique['grapheme_root'],
        "root_combine_not_0": str([root for root, combined in per_root.items() if 0 not in combined]),
        "root_combine_all":       [root for root, combined in per_root.items() if len(combined) == n_diacritics],
    }


root_vowels            = _diacritics_per_root('vowel_diacritic')
root_consonants        = _diacritics_per_root('consonant_diacritic')
root_vowels_values     = root_vowels.applymap(len).values.flatten()
root_consonants_values = root_consonants.applymap(len).values.flatten()

display(root_vowels)
display(_summarise_root_table(
    root_vowels, root_vowels_values, unique['vowel_diacritic'], "unique_vowels"
))
display(root_consonants)
display(_summarise_root_table(
    root_consonants, root_consonants_values, unique['consonant_diacritic'], "unique_consonants"
))

grapheme_root,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167
vowel_diacritic,[0],[0],"[0, 1]",[0],[0],[0],[0],[0],[0],"[0, 1]",[0],[0],[0],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 7, 9]","[0, 1, 2, 7, 9]","[0, 1, 2, 6, 7, 9]","[0, 1, 2, 7, 9]","[0, 1, 2, 3, 4, 7, 9, 10]","[0, 10]","[0, 3]","[0, 1, 2, 3, 7]","[0, 1, 2, 3, 4, 6, 7, 8, 9]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1]","[0, 1, 2, 3, 4, 7]",[7],"[0, 2]","[1, 2, 4, 9]","[0, 1, 2, 4, 5, 6, 7, 9]","[0, 2, 7]","[0, 1, 2, 4, 7]","[0, 1, 2, 3, 4, 6, 7, 9]",[2],"[1, 2]","[0, 1, 2]","[0, 1, 2, 3, 4, 7, 9]","[0, 2]","[0, 1, 2, 3, 4, 5, 7, 8, 9, 10]","[0, 1, 2, 4]",...,"[0, 1, 2, 7, 9]","[0, 1, 2, 3, 7, 9]",[0],"[0, 2, 7]","[0, 1, 2, 3, 4, 7, 9]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 4]","[0, 1, 7, 9]","[0, 1, 2, 7]","[0, 2, 3, 7]","[0, 2, 3, 4, 7, 9]","[0, 1, 2, 3, 4, 7, 9]","[0, 1, 2, 3, 6, 7]","[0, 1, 2, 3, 4, 7]","[0, 1, 2, 3, 4, 7]","[0, 1, 4, 7]","[0, 1, 7]","[0, 9]","[0, 7]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]","[0, 1, 2, 4, 6, 7, 9]","[0, 1, 2, 3, 4, 7, 9]","[0, 1, 2, 3, 4, 5, 6, 7, 9]","[0, 1, 2, 5, 7]","[1, 2, 7, 9]","[0, 1, 2, 3, 6, 7, 9]","[0, 3, 4, 5, 9]","[0, 1, 2, 3, 7, 8]","[0, 1, 2, 3, 6]","[1, 9]",[4],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]","[0, 2, 7]","[0, 1]","[0, 1, 3]",[1],[0],"[0, 1, 2, 3, 4, 7, 9]","[0, 7]","[0, 1, 2, 3, 4, 5, 7, 9]"


{'max': 11,
 'mean': 4.869047619047619,
 'median': 5.0,
 'min': 1,
 'root_combine_0': 155,
 'root_combine_all': [13, 23, 64, 72, 79, 81, 96, 107, 113, 115, 133, 147],
 'root_combine_not_0': '[26, 28, 33, 34, 73, 82, 108, 114, 126, 152, 157, 158, 163]',
 'unique_roots': 168,
 'unique_vowels': 11}

grapheme_root,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167
consonant_diacritic,[0],[0],"[0, 4]","[0, 1]","[0, 1]",[0],"[0, 1]",[0],[0],"[0, 1, 4]",[0],[0],[0],"[0, 1, 2, 4, 5, 6]",[0],"[0, 5]",[0],[0],"[0, 4]",[0],[0],"[0, 2]","[0, 1, 2, 4, 5]","[0, 1, 2, 4, 5, 6]",[0],"[0, 4]",[0],[0],"[0, 4]","[0, 1, 2, 3, 4, 5]",[0],[0],[0],[0],[0],[0],[0],[0],"[0, 1, 2, 4]",[0],...,"[0, 2]",[0],[0],[0],[0],"[0, 2, 4, 5]",[0],[0],"[0, 2]",[0],[0],"[0, 2, 4]","[0, 5]","[0, 4, 5]","[0, 4]",[0],"[0, 5]",[0],[0],"[0, 1, 2, 4, 5]","[0, 4, 5]","[0, 2, 4, 5, 6]","[0, 5]","[0, 2, 4]",[0],"[0, 4, 5]",[0],[0],[0],[0],[0],"[0, 1, 2, 4, 5]",[0],[0],[0],[0],[0],[0],[0],"[0, 4]"


{'max': 7,
 'mean': 1.9583333333333333,
 'median': 1.0,
 'min': 1,
 'root_combine_0': 168,
 'root_combine_all': [107],
 'root_combine_not_0': '[]',
 'unique_consonants': 7,
 'unique_roots': 168}

When exploring the combinations of root, vowel and consonant, we find only 1292 distinct combinations in the dataset but 1295 unique graphemes, so some combinations must be written as more than one grapheme.

In [None]:
from itertools import chain
# Every matrix cell lists the distinct roots seen for one (vowel, consonant)
# pair, so counting the flattened entries counts the distinct label triples.
{
    "combinations": sum(
        1
        for _ in chain.from_iterable(
            combination_matrix(
                dataset, x='consonant_diacritic', y='vowel_diacritic', z='grapheme_root'
            ).values.flatten()
        )
    ),
    "unique_graphemes": unique['grapheme'],
}

{'combinations': 1292, 'unique_graphemes': 1295}

Next we look for the combinations with multiple renderings that cause the difference above. They are:

{'64-3-2': ['র্তী', 'র্ত্রী'], '64-7-2': ['র্তে', 'র্ত্রে'], '72-0-2': ['র্দ্র', 'র্দ']}

In [None]:
# Per label triple, flag the columns holding more than one distinct value,
# then keep only the triples that have several grapheme spellings
(
    dataset
    .groupby(['grapheme_root', 'vowel_diacritic', 'consonant_diacritic'])
    .nunique(dropna=False)
    .gt(1)
    .query("grapheme != False")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,image_id,grapheme,base_graphemes
grapheme_root,vowel_diacritic,consonant_diacritic,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
64,3,2,True,True,True
64,7,2,True,True,True
72,0,2,True,True,True


In [None]:
# Distinct grapheme spellings for each ambiguous (root, vowel, consonant) triple
multilabled_graphemes = {
    f"{root}-{vowel}-{consonant}": dataset.query(
        f"grapheme_root == {root} & vowel_diacritic == {vowel} & consonant_diacritic == {consonant}"
    )['grapheme'].unique().tolist()
    for root, vowel, consonant in [(64, 3, 2), (64, 7, 2), (72, 0, 2)]
}
multilabled_graphemes

{'64-3-2': ['র্তী', 'র্ত্রী'],
 '64-7-2': ['র্তে', 'র্ত্রে'],
 '72-0-2': ['র্দ্র', 'র্দ']}

Next we count how often these ambiguous graphemes appear; each occurs only about 150 times, so they are rare. Moreover, this is a "one-to-many" mapping (one label combination, several renderings), so it should have little influence on the evaluation result.

In [None]:
# Flatten the per-combination spellings into a single list of ambiguous graphemes.
# The list is built here so that the cell also runs on a fresh kernel.
multilabled_grapheme_list = [
    grapheme
    for spellings in multilabled_graphemes.values()
    for grapheme in spellings
]
dataset[ dataset['grapheme'].isin(multilabled_grapheme_list) ].groupby(['grapheme']).count()['image_id'].to_dict()

{'র্তী': 144,
 'র্তে': 153,
 'র্ত্রী': 145,
 'র্ত্রে': 150,
 'র্দ': 146,
 'র্দ্র': 151}

Then we get the top 10 and bottom 10 grapheme roots and the top 5 vowel and consonant diacritic components.

In [None]:
# Size, in pixels, of the canvas that image_from_char draws on
HEIGHT, WIDTH = 236, 236

def get_n(df, field, n, top=True):
    """
    Return the `n` most (or least) frequent components of one type, with their counts.

    :param df: dataframe with one row per sample and a `field` column of component labels
    :param field: component type to rank; used both as the column of `df` to count and
                  as the `component_type` value to select in the global `class_map`
    :param n: number of components to return
    :param top: True for the most frequent components, False for the least frequent
    :return: the matching `class_map` rows (without `component_type` and `label`, with
             the original class_map row number in `index`) plus a `count` column,
             ordered by frequency
    :raises KeyError: if a label found in `df` has no row of this type in `class_map`
    """
    # Samples per label; the result stays indexed by the label itself
    counts = df.groupby(field).size().sort_values(ascending=not top).iloc[:n]

    components = class_map[class_map['component_type'] == field].reset_index()
    # Resolve rows through the `label` column instead of assuming that the k-th
    # row of this component type describes label k: the two only coincide when
    # every label 0..k-1 actually occurs in `df`.
    row_of_label = pd.Series(components.index, index=components['label'])
    selected = components.loc[row_of_label.loc[counts.index].to_numpy()]

    # drop() returns a new frame, so adding the column does not touch class_map
    selected = selected.drop(columns=['component_type', 'label'])
    selected['count'] = counts.to_numpy()
    return selected

def image_from_char(char):
    """
    Draw `char` on a new WIDTH x HEIGHT RGB image using the Kalpurush font at size 120.

    The text is centred horizontally and placed one third of the free vertical
    space from the top.

    NOTE(review): Image, ImageDraw and ImageFont are not imported in the visible
    cells - presumably `from PIL import Image, ImageDraw, ImageFont`; confirm.
    NOTE(review): `ImageDraw.textsize` is not available in recent Pillow releases -
    confirm the installed version.

    :param char: text to render
    :return: the image with the text drawn on it
    """
    canvas = Image.new('RGB', (WIDTH, HEIGHT))
    font = ImageFont.truetype('/kaggle/input/kalpurush-fonts/kalpurush-2.ttf', 120)
    pen = ImageDraw.Draw(canvas)

    text_width, text_height = pen.textsize(char, font=font)
    top_left = ((WIDTH - text_width) / 2, (HEIGHT - text_height) / 3)
    pen.text(top_left, char, font=font)

    return canvas

In [None]:
# Ten most frequent grapheme roots
top_10_roots = get_n(dataset, field='grapheme_root', n=10)
top_10_roots

Unnamed: 0,index,component,count
72,72,দ,5736
64,64,ত,5596
13,13,ক,5420
107,107,ব,5321
23,23,গ,5149
96,96,প,4926
113,113,ভ,4395
147,147,স,4392
133,133,শ,4374
115,115,ম,4015


In [None]:
# Five most frequent vowel diacritics
top_5_vowel = get_n(dataset, field='vowel_diacritic', n=5)
top_5_vowel

Unnamed: 0,index,component,count
0,168,0,41508
1,169,া,36886
7,175,ে,28723
2,170,ি,25967
4,172,ু,18848


In [None]:
# Five most frequent consonant diacritics
top_5_consonants = get_n(dataset, field='consonant_diacritic', n=5)
top_5_consonants

Unnamed: 0,index,component,count
0,179,0,125278
2,181,র্,23465
5,184,্র,21397
4,183,্য,21270
1,180,ঁ,7424


In [None]:
# Ten least frequent grapheme roots
bottom_10_roots = get_n(dataset, field='grapheme_root', n=10, top=False)
bottom_10_roots

Unnamed: 0,index,component,count
73,73,দ্ঘ,130
33,33,ঙ্ক্ত,136
102,102,প্স,141
158,158,স্স,143
45,45,জ্জ্ব,144
130,130,ল্ব,144
1,1,ঃ,145
12,12,ঔ,146
0,0,ং,147
63,63,ণ্ণ,149
