# Analyze data

In [1]:
import os

import pandas as pd
from tabulate import tabulate
from tqdm import tqdm

tqdm.pandas()


## Load raw data

In [2]:
TRAIN_DEV_PATH = os.path.join('../data/eam2021-train-set/bq-results-20210825-203004-swh711l21gv2.csv')
TEST_PATH = os.path.join('../data/eam2021-test-set-public/eam2021-test-set-public.csv')
assert os.path.isfile(TRAIN_DEV_PATH)
assert os.path.isfile(TEST_PATH)

In [3]:
train_dev_df = pd.read_csv(TRAIN_DEV_PATH, encoding='utf-8')
test_df = pd.read_csv(TEST_PATH, encoding='utf-8')

## Peek at the raw data frames

In [4]:
train_dev_df.head()

Unnamed: 0,language,post_index,commentText,report_count_comment,report_count_post,like_count_comment,like_count_post,label,val
0,Hindi,238566,शायद योगी जी है,0,0,1,1,0,0
1,Hindi,7009,Tingri h to putri tu.. .,0,0,0,0,0,0
2,Hindi,404648,Saale Tu kon sa pagal Nahi h . Teri comment pa...,0,0,0,0,1,0
3,Hindi,5057,girl 😘😘 aaj ke baad msg ki to maar daluggi,0,0,0,0,0,0
4,Hindi,107146,Free fire pubg ka baap ha kutta sala kamina,0,0,0,0,1,0


In [5]:
train_dev_df['val'].value_counts()

0    665042
Name: val, dtype: int64

In [6]:
test_df.head()

Unnamed: 0,Id,language,post_index,commentText,report_count_comment,report_count_post,like_count_comment,like_count_post
0,2,Bengali,182442,Bichna theke agun berochhe re tar modhhyeu ami...,0,0,0,0
1,3,Hindi,406921,JYOTI💏 chut ka pani,0,0,0,0
2,4,Hindi,233255,Kuth tik n h,0,0,0,0
3,5,Telugu,219308,Ekkada ap valaki kallu guddi ah?😠sarigga chuda...,0,0,0,0
4,6,Hindi,269812,pagal khi ke gadhe me dha aata h bachho se kuc...,0,0,0,0


## Compute frequencies of labels and languages of raw data

In [7]:
def tabulify(df: pd.DataFrame, col_name: str) -> None:
    """Tabulate the frequencies with percentages and display the total"""
    freqs_dict = df[col_name].value_counts(normalize=False).to_dict()
    fracs_dict = df[col_name].value_counts(normalize=True).to_dict()
    names = [k for k, v in freqs_dict.items()]
    freqs = [freqs_dict[k] for k in names]
    fracs = [fracs_dict[k] for k in names]
    percentages = [f'{100.0 * frac:0.1f}%' for frac in fracs]
    print(tabulate(
        {col_name: names + ['---Total---'], 'frequency': freqs + [sum(freqs_dict.values())],
         'percentage': percentages + ['100.0%']},
        headers='keys',
    ))

In [8]:
print(f'\n\n*** TRAIN DEV LABEL FREQUENCIES ***\n')
tabulify(df=train_dev_df, col_name='label')



*** TRAIN DEV LABEL FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ label       │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ 0           │      352386 │ 53.0%        │
├─────────────┼─────────────┼──────────────┤
│ 1           │      312656 │ 47.0%        │
├─────────────┼─────────────┼──────────────┤
│ ---Total--- │      665042 │ 100.0%       │
╘═════════════╧═════════════╧══════════════╛


In [9]:
print(f'\n\n*** TRAIN DEV LANGUAGE FREQUENCIES ***\n')
tabulify(df=train_dev_df, col_name='language')



*** TRAIN DEV LANGUAGE FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ language    │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ Hindi       │      307180 │ 46.2%        │
├─────────────┼─────────────┼──────────────┤
│ Telugu      │       97012 │ 14.6%        │
├─────────────┼─────────────┼──────────────┤
│ Marathi     │       72044 │ 10.8%        │
├─────────────┼─────────────┼──────────────┤
│ Tamil       │       69497 │ 10.5%        │
├─────────────┼─────────────┼──────────────┤
│ Malayalam   │       40965 │ 6.2%         │
├─────────────┼─────────────┼──────────────┤
│ Bengali     │       22835 │ 3.4%         │
├─────────────┼─────────────┼──────────────┤
│ Kannada     │       13943 │ 2.1%         │
├─────────────┼─────────────┼──────────────┤
│ Odia        │       10974 │ 1.7%         │
├─────────────┼─────────────┼──────────────┤
│ Gujarati    │        8828 │ 1.3%         │
├─────────────┼─────────────┼──────────────┤
│ Haryanvi   

In [10]:
train_dev_df['language_and_label'] = train_dev_df.progress_apply(lambda x: (x['language'], x['label']), axis=1)
print(f'\n\n*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n')
tabulify(df=train_dev_df, col_name='language_and_label')

100%|██████████| 665042/665042 [00:09<00:00, 68177.01it/s]



*** TRAIN DEV LANGUAGE-LABEL PAIR FREQUENCIES ***

╒══════════════════════╤═════════════╤══════════════╕
│ language_and_label   │   frequency │ percentage   │
╞══════════════════════╪═════════════╪══════════════╡
│ ('Hindi', 1)         │      153747 │ 23.1%        │
├──────────────────────┼─────────────┼──────────────┤
│ ('Hindi', 0)         │      153433 │ 23.1%        │
├──────────────────────┼─────────────┼──────────────┤
│ ('Telugu', 1)        │       48551 │ 7.3%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Telugu', 0)        │       48461 │ 7.3%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Marathi', 0)       │       44677 │ 6.7%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Tamil', 0)         │       34792 │ 5.2%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Tamil', 1)         │       34705 │ 5.2%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Malayalam', 0)     │     




In [11]:
print(f'\n\n*** TEST LANGUAGE FREQUENCIES ***\n')
tabulify(df=test_df, col_name='language')



*** TEST LANGUAGE FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ language    │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ Hindi       │       34313 │ 46.2%        │
├─────────────┼─────────────┼──────────────┤
│ Telugu      │       10877 │ 14.6%        │
├─────────────┼─────────────┼──────────────┤
│ Marathi     │        8057 │ 10.9%        │
├─────────────┼─────────────┼──────────────┤
│ Tamil       │        7864 │ 10.6%        │
├─────────────┼─────────────┼──────────────┤
│ Malayalam   │        4465 │ 6.0%         │
├─────────────┼─────────────┼──────────────┤
│ Bengali     │        2534 │ 3.4%         │
├─────────────┼─────────────┼──────────────┤
│ Kannada     │        1592 │ 2.1%         │
├─────────────┼─────────────┼──────────────┤
│ Odia        │        1131 │ 1.5%         │
├─────────────┼─────────────┼──────────────┤
│ Haryanvi    │        1025 │ 1.4%         │
├─────────────┼─────────────┼──────────────┤
│ Gujarati    │   

## Prepared data

### Load prepared data

In [12]:
PREPARED_TRAIN_PATH = os.path.join('../data/prepared/train.csv')
PREPARED_DEV_PATH = os.path.join('../data/prepared/dev.csv')
PREPARED_TEST_PATH = os.path.join('../data/prepared/test.csv')
assert os.path.isfile(PREPARED_TRAIN_PATH)
assert os.path.isfile(PREPARED_DEV_PATH)
assert os.path.isfile(PREPARED_TEST_PATH)

In [13]:
prepared_train_df = pd.read_csv(PREPARED_TRAIN_PATH, encoding='utf-8')
prepared_dev_df = pd.read_csv(PREPARED_DEV_PATH, encoding='utf-8')
prepared_test_df = pd.read_csv(PREPARED_TEST_PATH, encoding='utf-8')
assert test_df.equals(prepared_test_df)

### Compute frequencies of labels and languages of prepared data

In [14]:

print(f'\n\n*** TRAIN LABEL FREQUENCIES ***\n')
tabulify(df=prepared_train_df, col_name='label')

print(f'\n\n*** DEV LABEL FREQUENCIES ***\n')
tabulify(df=prepared_dev_df, col_name='label')



*** TRAIN LABEL FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ label       │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ 0           │      334765 │ 53.0%        │
├─────────────┼─────────────┼──────────────┤
│ 1           │      297024 │ 47.0%        │
├─────────────┼─────────────┼──────────────┤
│ ---Total--- │      631789 │ 100.0%       │
╘═════════════╧═════════════╧══════════════╛


*** DEV LABEL FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ label       │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ 0           │       17621 │ 53.0%        │
├─────────────┼─────────────┼──────────────┤
│ 1           │       15632 │ 47.0%        │
├─────────────┼─────────────┼──────────────┤
│ ---Total--- │       33253 │ 100.0%       │
╘═════════════╧═════════════╧══════════════╛


In [15]:
print(f'\n\n*** TRAIN LANGUAGE FREQUENCIES ***\n')
tabulify(df=prepared_train_df, col_name='language')

print(f'\n\n*** DEV LANGUAGE FREQUENCIES ***\n')
tabulify(df=prepared_dev_df, col_name='language')



*** TRAIN LANGUAGE FREQUENCIES ***

╒═════════════╤═════════════╤══════════════╕
│ language    │   frequency │ percentage   │
╞═════════════╪═════════════╪══════════════╡
│ Hindi       │      291820 │ 46.2%        │
├─────────────┼─────────────┼──────────────┤
│ Telugu      │       92161 │ 14.6%        │
├─────────────┼─────────────┼──────────────┤
│ Marathi     │       68442 │ 10.8%        │
├─────────────┼─────────────┼──────────────┤
│ Tamil       │       66022 │ 10.5%        │
├─────────────┼─────────────┼──────────────┤
│ Malayalam   │       38916 │ 6.2%         │
├─────────────┼─────────────┼──────────────┤
│ Bengali     │       21694 │ 3.4%         │
├─────────────┼─────────────┼──────────────┤
│ Kannada     │       13246 │ 2.1%         │
├─────────────┼─────────────┼──────────────┤
│ Odia        │       10425 │ 1.7%         │
├─────────────┼─────────────┼──────────────┤
│ Gujarati    │        8387 │ 1.3%         │
├─────────────┼─────────────┼──────────────┤
│ Haryanvi    │  

In [16]:
print(f'\n\n*** TRAIN LANGUAGE-LABEL PAIR FREQUENCIES ***\n')
tabulify(df=prepared_train_df, col_name='language_and_label')

print(f'\n\n*** DEV LANGUAGE-LABEL PAIR FREQUENCIES ***\n')
tabulify(df=prepared_dev_df, col_name='language_and_label')



*** TRAIN LANGUAGE-LABEL PAIR FREQUENCIES ***

╒══════════════════════╤═════════════╤══════════════╕
│ language_and_label   │   frequency │ percentage   │
╞══════════════════════╪═════════════╪══════════════╡
│ ('Hindi', 1)         │      146059 │ 23.1%        │
├──────────────────────┼─────────────┼──────────────┤
│ ('Hindi', 0)         │      145761 │ 23.1%        │
├──────────────────────┼─────────────┼──────────────┤
│ ('Telugu', 1)        │       46123 │ 7.3%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Telugu', 0)        │       46038 │ 7.3%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Marathi', 0)       │       42443 │ 6.7%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Tamil', 0)         │       33052 │ 5.2%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Tamil', 1)         │       32970 │ 5.2%         │
├──────────────────────┼─────────────┼──────────────┤
│ ('Malayalam', 0)     │       30