# Exploratory Data Analysis (EDA)

## Setup

In [None]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

## Read data

In [None]:
# Load the CSV into a DataFrame; the path is relative to the working directory.
train_csv_path = 'data/train_0512.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first 10 rows of the loaded frame.
df.head(n=10)

In [None]:
# Number of rows recorded for each `conv_id`.
conv_id_counts = df.value_counts('conv_id')

# Display only the conversation ids that occur exactly once.
occurs_once = conv_id_counts.eq(1)
conv_id_counts.loc[occurs_once]

In [None]:
# Utterances belonging to one specific conversation id, as an array.
target_conv_mask = df['conv_id'] == 'hit:2444_conv:4888'
df.loc[target_conv_mask, 'utterance'].values

## Preprocessing
- [x] replace "\_comma\_" with ","
- [x] remove punctuation
- [x] convert to lower case
- [x] concatenate each conversation (grouped by `conv_id`), joining its utterances with the "[SEP]" token
- [x] prepend "[CLS]"
- [x] append the sentiment text label
- [ ] correct spelling

In [None]:
# replace "_comma_" with "," and lower-case both text columns
for text_col in ('prompt', 'utterance'):
    with_commas = df[text_col].str.replace('_comma_', ',')
    df.loc[:, text_col] = with_commas.str.lower()

In [None]:
# remove punctuations
import re, string

# Matches any single ASCII punctuation character from `string.punctuation`.
# The characters are passed through `re.escape` so that `]`, `\`, `^` and `-`
# stay literal inside the character class instead of depending on their
# position in the string. The pattern is compiled once, not on every call.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')


def remove_punctuations(text: str) -> str:
    """Replace every ASCII punctuation character in `text` with one space.

    Each punctuation character is substituted individually, so consecutive
    punctuation marks produce consecutive spaces; whitespace is not collapsed.

    Args:
        text: The string to clean.

    Returns:
        A string of the same length as `text` with punctuation replaced by spaces.
    """
    return _PUNCTUATION_RE.sub(' ', text)
    
# Strip punctuation from both text columns, element by element.
for text_col in ('prompt', 'utterance'):
    df.loc[:, text_col] = df[text_col].apply(remove_punctuations)

In [None]:
# Sentiment names listed in id order: the position in this tuple is the label id.
sent_names = (
    'sad', 'trusting', 'terrified', 'caring',
    'disappointed', 'faithful', 'joyful', 'jealous',
    'disgusted', 'surprised', 'ashamed', 'afraid',
    'impressed', 'sentimental', 'devastated', 'excited',
    'anticipating', 'annoyed', 'anxious', 'furious',
    'content', 'lonely', 'angry', 'confident',
    'apprehensive', 'guilty', 'embarrassed', 'grateful',
    'hopeful', 'proud', 'prepared', 'nostalgic',
)

# Sentiment name -> integer label id.
sent_id = {name: idx for idx, name in enumerate(sent_names)}

# Integer label id -> sentiment name (inverse of `sent_id`).
id_sent = dict(enumerate(sent_names))

In [None]:
# concatenate conversations and append sentiment text label
#
# Builds `df_concat` with one row per `conv_id`:
#   - every column of `df` except `id` and `utterance_idx`, taken from the
#     conversation's first row (lowest `utterance_idx`),
#   - `prompt` wrapped as "[CLS] <prompt> [SEP]",
#   - `conv` (replaces `utterance`): "[CLS] " followed by all utterances of the
#     conversation in `utterance_idx` order, joined with " [SEP] ",
#   - `sent`: the sentiment name looked up from the numeric `label`.
#
# Rows are collected in a list and the frame is built once at the end.
# `DataFrame.append` was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
# NOTE: column dtypes are now inferred from the data instead of all being
# `object`, as they were when appending to an empty frame.
concat_columns = [
    'conv' if col == 'utterance' else col
    for col in df.columns
    if col not in ('id', 'utterance_idx')
] + ['sent']

conv_id_groups = df.groupby(['conv_id'])

conv_records = []
for _, indices in tqdm(conv_id_groups.groups.items()):
    # `sort_values` already returns a new frame, so no explicit copy is needed.
    conv_rows = df.loc[indices].sort_values(['utterance_idx'])
    record = conv_rows.iloc[0].drop(['id', 'utterance_idx']).to_dict()
    record.pop('utterance')
    record['prompt'] = '[CLS] ' + record['prompt'] + ' [SEP]'
    record['conv'] = '[CLS] ' + ' [SEP] '.join(conv_rows['utterance'].values)
    record['sent'] = id_sent[record['label']]
    conv_records.append(record)

# Passing `columns` fixes the column order and yields an empty frame with the
# right columns when there are no conversations.
df_concat = pd.DataFrame(conv_records, columns=concat_columns)

## Analysis
- [x] count the samples per label and check for class imbalance
- [ ] check the most frequent words for each label
- [ ] check the similarity between labels (e.g., by counting shared words)
- [ ] data augmentation

In [None]:
# Number of `df_concat` rows per sentiment label, sorted ascending by count.
label_counts = df_concat.value_counts(subset=['sent'])
sent_tiers = (
    pd.DataFrame(label_counts, columns=['count'])
    .sort_values('count')
    .reset_index()
)

# Bar chart of the counts, one bar per label, shaded by count.
px.bar(
    sent_tiers,
    x='count',
    y='sent',
    color='count',
    title='Label Counts',
    width=800,
    height=800,
    color_continuous_scale=px.colors.sequential.Blues,
)

The classes are fairly balanced overall, except that the `surprised` label has far more samples than any other.  
We may need data augmentation for every label other than `surprised`.  

In [None]:
# tokenization
# Split every prompt and every concatenated conversation on whitespace.
prompt_tokens = [prompt.split() for prompt in df_concat['prompt'].values]
conv_tokens = [conv.split() for conv in df_concat['conv'].values]

# Collect the distinct tokens of both corpora. `set.update` grows the set in
# place; rebinding `vocab = vocab.union(...)` would copy the whole vocabulary
# on every iteration.
vocab = set()
for corpus in (prompt_tokens, conv_tokens):
    for tokens in corpus:
        vocab.update(tokens)

In [None]:
# summary
# Vocabulary size, then token-count statistics for each tokenized corpus.
seqlens_prompt = np.array(list(map(len, prompt_tokens)))
print(f'# of vocab: {len(vocab)}')
print(
    '=== Tokenized Prompt Corpus ===',
    f'max sequence length: {seqlens_prompt.max()}',
    f'min sequence length: {seqlens_prompt.min()}',
    f'avg sequence length: {seqlens_prompt.mean()}\n',
    sep='\n',
)

seqlens_conv = np.array(list(map(len, conv_tokens)))
print(
    '=== Tokenized Conversation Corpus ===',
    f'max sequence length: {seqlens_conv.max()}',
    f'min sequence length: {seqlens_conv.min()}',
    f'avg sequence length: {seqlens_conv.mean()}',
    sep='\n',
)

In [None]:
# Positions of the conversations that have at least 512 whitespace tokens.
np.flatnonzero(seqlens_conv >= 512)

In [None]:
# Side-by-side histograms of token counts for the prompt and conversation corpora.
palette = iter([plt.cm.Accent(i) for i in range(10)])
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

panels = (
    (axes[0], prompt_tokens, 'Prompt Corpus Sequences Length Histogram'),
    (axes[1], conv_tokens, 'Conversation Corpus Sequences Length Histogram'),
)
for ax, corpus, title in panels:
    # Both panels share the same length range; each takes the next palette colour.
    ax.hist([len(tokens) for tokens in corpus], range=(1, 513), color=next(palette))
    ax.set_title(title, fontsize=16)

plt.show()

## Proposal
- Use BERT to infer the `prompt` and `utterance` representations, then concatenate the two representations.
- Add a `LayerNorm` layer that takes the concatenated result as input.
- Use a `Linear` layer for classification.
- Maybe we can use `SAM` to smooth the loss landscape.