# Exploratory Data Analysis (EDA)

## Setup

In [1]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

## Read data

In [2]:
# Load the training split into `df`.
# NOTE(review): the output of a later cell shows one `utterance` value holding
# several raw CSV records, so some rows look mis-parsed — confirm the quoting of
# the source file.
TRAIN_CSV_PATH = 'data/train_0512.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,id,conv_id,utterance_idx,prompt,utterance,label
0,0,hit:0_conv:1,1,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...,13
1,1,hit:0_conv:1,2,I remember going to the fireworks with my best...,Was this a friend you were in love with_comma_...,13
2,2,hit:0_conv:1,3,I remember going to the fireworks with my best...,This was a best friend. I miss her.,13
3,3,hit:0_conv:1,4,I remember going to the fireworks with my best...,Where has she gone?,13
4,4,hit:0_conv:1,5,I remember going to the fireworks with my best...,We no longer talk.,13
5,5,hit:0_conv:1,6,I remember going to the fireworks with my best...,Oh was this something that happened because of...,13
6,6,hit:1_conv:2,1,i used to scare for darkness,it feels like hitting to blank wall when i se...,11
7,7,hit:1_conv:2,2,i used to scare for darkness,Oh ya? I don't really see how,11
8,8,hit:1_conv:2,3,i used to scare for darkness,dont you feel so.. its a wonder,11
9,9,hit:1_conv:2,4,i used to scare for darkness,I do actually hit blank walls a lot of times b...,11


In [10]:
# Number of rows per conversation id, then the ids that occur in exactly one row.
conv_id_counts = df.value_counts('conv_id')
conv_id_counts.loc[conv_id_counts.eq(1)].index.tolist()

['hit:7600_conv:15200',
 'hit:818_conv:1637',
 'hit:3397_conv:6794',
 'hit:11439_conv:22879',
 'hit:10015_conv:20030',
 'hit:869_conv:1738',
 'hit:9078_conv:18156',
 'hit:8075_conv:16150',
 'hit:5493_conv:10987',
 'hit:362_conv:724',
 'hit:4672_conv:9345',
 'hit:194_conv:388',
 'hit:5802_conv:11605',
 'hit:5289_conv:10578',
 'hit:1068_conv:2136',
 'hit:9991_conv:19982',
 'hit:10381_conv:20763',
 'hit:9989_conv:19979',
 'hit:9486_conv:18972',
 'hit:3617_conv:7235',
 'hit:11677_conv:23355',
 'hit:5806_conv:11613',
 'hit:1412_conv:2825',
 'hit:10424_conv:20849',
 'hit:5996_conv:11992',
 'hit:832_conv:1665',
 'hit:830_conv:1661',
 'hit:3484_conv:6969',
 'hit:7288_conv:14577',
 'hit:3196_conv:6393',
 'hit:8812_conv:17625',
 'hit:7971_conv:15943',
 'hit:8754_conv:17508',
 'hit:1412_conv:2824',
 'hit:11626_conv:23253',
 'hit:7846_conv:15692',
 'hit:10475_conv:20951',
 'hit:11693_conv:23386',
 'hit:5678_conv:11356',
 'hit:5680_conv:11360',
 'hit:11100_conv:22200',
 'hit:10567_conv:21134',
 'hi

In [13]:
# Raw `utterance` value(s) stored under one specific conversation id.
df.loc[df['conv_id'] == 'hit:2444_conv:4888', 'utterance'].values

array(['i was happy when Lebron signed with the lakers,5|5|5_4|5|5,\nhit:2444_conv:4888,2,joyful,i was happy when Lebron signed with the lakers,390,Oh yeah! So was I. That was so awesome! ,5|5|5_4|5|5,\nhit:2444_conv:4888,3,joyful,i was happy when Lebron signed with the lakers,238,yea i hope they take down the warriors,5|5|5_4|5|5,\nhit:2444_conv:4888,4,joyful,i was happy when Lebron signed with the lakers,390,Woo hoo! That would be so awesome! ,5|5|5_4|5|5,\nhit:2444_conv:4889,1,anxious,I was on my way to the eye doctor yesterday and thought I left on time. I wound up being 15 minutes late despite frantically driving like a madman. ,390,I get so frantic when I am late for appointments. I had an eye doctor appointment and was 15 minutes late. ,4|5|5_5|5|5,\nhit:2444_conv:4889,2,anxious,I was on my way to the eye doctor yesterday and thought I left on time. I wound up being 15 minutes late despite frantically driving like a madman. ,238,what ended up happening,4|5|5_5|5|5,\nhit:2444_con

## Preprocessing
- [x] replace "\_comma\_" with ","
- [x] remove punctuation
- [x] convert to lower case
- [x] concatenate the whole conversation grouped by `conv_id`, joined with the token "[SEP]"
- [x] prepend "[CLS]"
- [x] append the sentiment text label
- [ ] correct spelling

In [None]:
# Replace the literal "_comma_" placeholder with "," and lower-case the text,
# for both text columns.
for text_col in ('prompt', 'utterance'):
    df.loc[:, text_col] = df[text_col].str.replace('_comma_', ',', regex=False).str.lower()

In [None]:
# remove punctuations
import re, string

# Compiled once at definition time. `re.escape` is required because
# `string.punctuation` contains `\`, `]`, `^` and `-`, which are special inside a
# character class: unescaped, the sequence `\]` is read as an escaped `]`, so the
# backslash itself would never be matched.
_PUNC_FILTER = re.compile(f'[{re.escape(string.punctuation)}]')


def remove_punctuations(text: str) -> str:
    """Replace every ASCII punctuation character in `text` with a single space.

    The set of characters is `string.punctuation`. Each punctuation character is
    replaced individually, so consecutive punctuation yields consecutive spaces.

    Args:
        text: the string to clean.

    Returns:
        A string of the same length as `text` with punctuation replaced by spaces.
    """
    return _PUNC_FILTER.sub(' ', text)
    
# Strip punctuation from both text columns, writing the result back into `df`.
for text_col in ('prompt', 'utterance'):
    df.loc[:, text_col] = df[text_col].apply(remove_punctuations)

In [None]:
# Emotion name -> integer class id. The names are listed in id order, so the
# position of a name in the tuple is its id (0..31).
sent_id = {
    name: idx
    for idx, name in enumerate((
        'sad', 'trusting', 'terrified', 'caring', 'disappointed',
        'faithful', 'joyful', 'jealous', 'disgusted', 'surprised',
        'ashamed', 'afraid', 'impressed', 'sentimental', 'devastated',
        'excited', 'anticipating', 'annoyed', 'anxious', 'furious',
        'content', 'lonely', 'angry', 'confident', 'apprehensive',
        'guilty', 'embarrassed', 'grateful', 'hopeful', 'proud',
        'prepared', 'nostalgic',
    ))
}

# Reverse lookup: integer class id -> emotion name.
id_sent = dict(zip(sent_id.values(), sent_id.keys()))

In [None]:
# concatenate conversations and append sentiment text label
#
# Produces `df_concat` with one row per `conv_id`:
#   - `prompt`: the prompt of the conversation's first utterance, wrapped as
#     "[CLS] <prompt> [SEP]"
#   - `conv`:   "[CLS] " followed by all utterances (ordered by `utterance_idx`)
#     joined with " [SEP] "
#   - `sent`:   text label looked up from `label` via `id_sent`
# Remaining columns are taken from the conversation's first utterance row.
#
# Rows are collected in a list and the frame is built once at the end:
# `DataFrame.append` no longer exists in pandas 2.x and copied the whole frame on
# every call.

# Output schema: df's columns without the per-utterance ones, `utterance` renamed
# to `conv`, plus `sent`. Passed explicitly so the schema holds for empty input.
concat_columns = [
    'conv' if col == 'utterance' else col
    for col in df.columns
    if col not in ('id', 'utterance_idx')
] + ['sent']

conv_id_groups = df.groupby('conv_id')

conv_records = []
for _, conv_rows in tqdm(conv_id_groups):
    conv_rows = conv_rows.sort_values('utterance_idx')
    conv_row = (
        conv_rows.iloc[0]
        .drop(['id', 'utterance_idx'])
        .rename({'utterance': 'conv'})
        .to_dict()
    )
    conv_row['prompt'] = '[CLS] ' + conv_row['prompt'] + ' [SEP]'
    conv_row['conv'] = '[CLS] ' + ' [SEP] '.join(conv_rows['utterance'].values)
    conv_row['sent'] = id_sent[conv_row['label']]
    conv_records.append(conv_row)

df_concat = pd.DataFrame(conv_records, columns=concat_columns)

## Analysis
- [x] count the samples per label and check for class imbalance
- [ ] check the words with high frequencies for each label
- [ ] check the label similarities (maybe by common words counts)
- [ ] data augmentation

In [None]:
# Number of concatenated conversations per text label, sorted ascending by count.
label_counts = df_concat.value_counts(subset=['sent'])
sent_tiers = pd.DataFrame(label_counts, columns=['count'])
sent_tiers = sent_tiers.sort_values('count').reset_index()

# Horizontal bar chart: one bar per label, coloured by its count.
px.bar(
    sent_tiers,
    x='count',
    y='sent',
    color='count',
    title='Label Counts',
    width=800,
    height=800,
    color_continuous_scale=px.colors.sequential.Blues,
)

The classes are not severely imbalanced, but the `surprised` label has far more samples than any other.  
We may need data augmentation for the labels other than `surprised`.  

In [None]:
# tokenization
# Whitespace-split every prompt and every concatenated conversation; `vocab` is the
# set of distinct tokens seen across both corpora.
prompt_tokens = [prompt.split() for prompt in df_concat['prompt'].values]
conv_tokens = [conv.split() for conv in df_concat['conv'].values]

# `set.update` grows the vocabulary in place, whereas `vocab.union(...)` builds a
# new copy of the whole set on every call.
vocab = set()
for tokens in prompt_tokens:
    vocab.update(tokens)
for tokens in conv_tokens:
    vocab.update(tokens)

In [None]:
# summary
def _print_seqlen_stats(header, seqlens):
    """Print `header` followed by the max, min and mean of the array `seqlens`."""
    print(header)
    print(f'max sequence length: {seqlens.max()}')
    print(f'min sequence length: {seqlens.min()}')
    print(f'avg sequence length: {seqlens.mean()}')


# Token counts per prompt / per concatenated conversation.
seqlens_prompt = np.array(list(map(len, prompt_tokens)))
seqlens_conv = np.array(list(map(len, conv_tokens)))

print(f'# of vocab: {len(vocab)}')
_print_seqlen_stats('=== Tokenized Prompt Corpus ===', seqlens_prompt)
print()  # blank line between the two reports
_print_seqlen_stats('=== Tokenized Conversation Corpus ===', seqlens_conv)

In [None]:
# Positions of the conversations whose token count is 512 or more.
np.flatnonzero(seqlens_conv >= 512)

In [None]:
# Side-by-side histograms of token counts for the prompt corpus (left) and the
# conversation corpus (right), both binned over the range 1..513.
fig, (ax_prompt, ax_conv) = plt.subplots(1, 2, figsize=(20, 8))
palette = iter([plt.cm.Accent(i) for i in range(10)])

ax_prompt.hist(list(map(len, prompt_tokens)), range=(1, 513), color=next(palette))
ax_prompt.set_title('Prompt Corpus Sequences Length Histogram', fontsize=16)

ax_conv.hist(list(map(len, conv_tokens)), range=(1, 513), color=next(palette))
ax_conv.set_title('Conversation Corpus Sequences Length Histogram', fontsize=16)

plt.show()

## Proposal
- Use BERT to infer `prompt` & `utterance` representations, then concatenate the two.
- Add a `LayerNorm` layer to receive the concatenated result.
- Use `Linear` layer to do classification.
- Maybe we can use `SAM` to smooth the loss landscape