# Exploratory Data Analysis (EDA)

## Setup

In [1]:
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

## Read data

In [2]:
# Load the train / validation / test splits, in that order, from their CSV files.
traindf, validdf, testdf = map(
    pd.read_csv,
    ('data/fixed_train.csv', 'data/fixed_valid.csv', 'data/fixed_test.csv'),
)

In [3]:
# Preview the raw training frame; as the cell's last expression it is rendered as the cell output.
traindf

Unnamed: 0,conv_id,utterance_idx,prompt,utterance,label
0,hit:0_conv:1,1,I remember going to the fireworks with my best...,I remember going to see the fireworks with my ...,13
1,hit:0_conv:1,2,I remember going to the fireworks with my best...,Was this a friend you were in love with_comma_...,13
2,hit:0_conv:1,3,I remember going to the fireworks with my best...,This was a best friend. I miss her.,13
3,hit:0_conv:1,4,I remember going to the fireworks with my best...,Where has she gone?,13
4,hit:0_conv:1,5,I remember going to the fireworks with my best...,We no longer talk.,13
...,...,...,...,...,...
84164,hit:12424_conv:24848,5,I found some pictures of my grandma in the att...,Yeah reminds me of the good old days. I miss ...,13
84165,hit:12424_conv:24849,1,I woke up this morning to my wife telling me s...,I woke up this morning to my wife telling me s...,9
84166,hit:12424_conv:24849,2,I woke up this morning to my wife telling me s...,Oh hey that's awesome! That is awesome right?,9
84167,hit:12424_conv:24849,3,I woke up this morning to my wife telling me s...,It is soooo awesome. We have been wanting a b...,9


## Preprocessing
- [x] replace "\_comma\_" with ","
- [x] remove punctuations
- [x] to lower case
- [x] concatenate each conversation's utterances (grouped by `conv_id`), joined with the "[SEP]" token
- [x] prepend "[CLS]"
- [x] append sentiment text label
- [ ] correct spelling

In [4]:
# replace "_comma_" with ","
def replace_comma(df):
    """Replace the ``_comma_`` placeholder with ``,`` in the text columns.

    The ``prompt`` and ``utterance`` columns of ``df`` are overwritten in
    place; nothing is returned.
    """
    for column in ('prompt', 'utterance'):
        restored = df[column].str.replace('_comma_', ',')
        df.loc[:, column] = restored

# Apply the placeholder replacement to every split (train, validation, test).
for _frame in (traindf, validdf, testdf):
    replace_comma(_frame)

In [5]:
# remove punctuations
import re, string

# Compiled once at definition time instead of on every call.
# `re.escape` is required: inside a character class the raw sequence `\]` from
# `string.punctuation` is read as an escaped `]`, so without escaping a literal
# backslash would never be matched.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')


def remove_punctuations(text: str):
    """Return ``text`` with every ASCII punctuation character replaced by a space.

    "Punctuation" means the characters in ``string.punctuation``. Each
    punctuation character becomes exactly one space, so the string length is
    unchanged and runs of spaces are not collapsed.
    """
    return _PUNCTUATION_RE.sub(' ', text)
    
# Strip punctuation from both text columns of every split, in the order
# train -> validation -> test and prompt -> utterance.
for _frame in (traindf, validdf, testdf):
    for _column in ('prompt', 'utterance'):
        _frame.loc[:, _column] = _frame[_column].apply(remove_punctuations)

In [6]:
# convert to lower case
def lowercase(df):
    """Lower-case the ``prompt`` and ``utterance`` columns of ``df`` in place.

    Nothing is returned; the frame passed in is modified.
    """
    for column in ('prompt', 'utterance'):
        lowered = df[column].str.lower()
        df.loc[:, column] = lowered
    
# Lower-case the text columns of every split (train, validation, test).
for _frame in (traindf, validdf, testdf):
    lowercase(_frame)

In [7]:
# append sentiment text label
# Sentiment names listed in label-id order: the position in this tuple is the
# integer label used in the `label` column.
_SENTIMENT_NAMES = (
    'sad', 'trusting', 'terrified', 'caring',
    'disappointed', 'faithful', 'joyful', 'jealous',
    'disgusted', 'surprised', 'ashamed', 'afraid',
    'impressed', 'sentimental', 'devastated', 'excited',
    'anticipating', 'annoyed', 'anxious', 'furious',
    'content', 'lonely', 'angry', 'confident',
    'apprehensive', 'guilty', 'embarrassed', 'grateful',
    'hopeful', 'proud', 'prepared', 'nostalgic',
)

# name -> id and id -> name lookups
sent_id = {name: idx for idx, name in enumerate(_SENTIMENT_NAMES)}
id_sent = dict(enumerate(_SENTIMENT_NAMES))


def append_sent(df):
    """Add a ``sent`` column holding the sentiment name of each row's ``label``.

    ``df`` is modified in place. A label that is not a key of ``id_sent``
    yields ``None`` (the result of ``dict.get``).
    """
    df['sent'] = df['label'].map(id_sent.get)

# Only the train and validation frames receive the `sent` column here.
for _frame in (traindf, validdf):
    append_sent(_frame)

In [8]:
# concatenate conversations
def concate_conv(df):
    """Collapse every conversation of ``df`` into a single row.

    Rows are grouped by ``conv_id``. Within a group the utterances are ordered
    by ``utterance_idx`` and joined with ``' [SEP] '`` into a new ``conv``
    column that takes the place of ``utterance``. Every other column keeps the
    value found on the conversation's first utterance row, and
    ``utterance_idx`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``conv_id``, ``utterance_idx`` and ``utterance`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per ``conv_id`` (in sorted ``conv_id`` order)
        and a fresh 0..n-1 index. ``df`` itself is not modified.
    """
    # Output layout: the input columns, minus `utterance_idx`, with `utterance`
    # renamed to `conv` at the same position.
    out_columns = ['conv' if col == 'utterance' else col
                   for col in df.columns if col != 'utterance_idx']
    carried_columns = [col for col in out_columns if col != 'conv']

    # Collect plain records and build the frame once. Appending row by row
    # copies the whole frame on every iteration, and `DataFrame.append` no
    # longer exists in pandas >= 2.0.
    records = []
    for _, conv_rows in tqdm(df.groupby('conv_id')):
        conv_rows = conv_rows.sort_values(['utterance_idx'])
        first_row = conv_rows.iloc[0]
        record = {col: first_row[col] for col in carried_columns}
        # NOTE: no "[CLS]" token is prepended here; only "[SEP]" separators
        # are inserted between utterances.
        record['conv'] = ' [SEP] '.join(conv_rows['utterance'].values)
        records.append(record)

    return pd.DataFrame(records, columns=out_columns)

# One row per conversation for each split (train, validation, test).
_traindf, _validdf, _testdf = (
    concate_conv(frame) for frame in (traindf, validdf, testdf)
)

  0%|          | 0/19533 [00:00<?, ?it/s]

  0%|          | 0/2770 [00:00<?, ?it/s]

  0%|          | 0/2547 [00:00<?, ?it/s]

In [10]:
# Persist the conversation-level frames as UTF-8 CSV files without the index column.
for _frame, _path in (
    (_traindf, 'data/new_train.csv'),
    (_validdf, 'data/new_valid.csv'),
    (_testdf, 'data/new_test.csv'),
):
    _frame.to_csv(_path, index=False, encoding='utf8')

## Analysis
### Training data
- [x] count each label, check classes imbalance
- [x] check the words with high frequencies for each label
- [x] check the label similarities (maybe by common words counts)

### Validation data
- [ ] count each label, check classes imbalance
- [ ] check the words with high frequencies for each label
- [ ] check the label similarities (maybe by common words counts)
- [ ] check OOV

### Testing data
- [ ] check OOV

In [None]:
# Number of training conversations per sentiment, sorted ascending by count.
label_counts = _traindf.value_counts(subset=['sent'])
sent_tiers = (
    pd.DataFrame(label_counts, columns=['count'])
    .sort_values('count')
    .reset_index()
)

# Horizontal bar chart: one bar per sentiment, shaded by its count.
px.bar(
    sent_tiers,
    x='count',
    y='sent',
    color='count',
    labels={'sent': 'sentiment'},
    title='Label Counts',
    width=800,
    height=800,
    color_continuous_scale=px.colors.sequential.Blues,
)

💡 **The classes are not severely imbalanced, but samples labeled `surprised` far outnumber those of any other label. We may need data augmentation for the labels other than `surprised`.**

In [None]:
# tokenization
# Whitespace-tokenize every prompt and every concatenated conversation.
prompt_tokens = [prompt.split() for prompt in _traindf['prompt'].values]
conv_tokens = [conv.split() for conv in _traindf['conv'].values]

# The vocabulary is the set of distinct tokens over both corpora. It is
# updated in place: rebuilding it with `vocab.union(...)` for every row would
# copy the whole set on each iteration.
vocab = set()
for token_lists in (prompt_tokens, conv_tokens):
    for tokens in token_lists:
        vocab.update(tokens)

In [None]:
# summary
# Token count of every prompt / conversation.
seqlens_prompt = np.array(list(map(len, prompt_tokens)))
seqlens_conv = np.array(list(map(len, conv_tokens)))

print(f'# of vocab: {len(vocab)}')

# Same three statistics for both corpora; the prompt section ends with an
# extra blank line that separates it from the conversation section.
for header, lengths, trailer in (
    ('=== Tokenized Prompt Corpus ===', seqlens_prompt, '\n'),
    ('=== Tokenized Conversation Corpus ===', seqlens_conv, ''),
):
    print(header)
    print(f'max sequence length: {lengths.max()}')
    print(f'min sequence length: {lengths.min()}')
    print(f'avg sequence length: {lengths.mean()}{trailer}')

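💡 **No text is longer than 512 tokens, the maximum sequence length that BERT can receive. (These are whitespace-token counts; BERT's WordPiece tokenizer can produce more tokens for the same text.)**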

In [None]:
# Two side-by-side histograms of sequence lengths (in whitespace tokens).
palette = iter([plt.cm.Accent(i) for i in range(10)])
fig, (ax_prompt, ax_conv) = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: prompt lengths.
ax_prompt.hist(list(map(len, prompt_tokens)), range=(1, 111), color=next(palette))
ax_prompt.set_title('Prompt Corpus Sequences Length Histogram', fontsize=16)

# Right panel: concatenated-conversation lengths.
ax_conv.hist(list(map(len, conv_tokens)), range=(1, 393), color=next(palette))
ax_conv.set_title('Conversation Corpus Sequences Length Histogram', fontsize=16)

plt.show()

In [None]:
# Conversations whose label id is 2 (the 'terrified' entry of `sent_id`).
_traindf.loc[_traindf['label'] == 2, 'conv']

In [None]:
# count word frequencies for each label
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stpw = stopwords.words('english')

# Tokens excluded from the counts: English stop words plus the special
# markers. Kept as a set so the per-token membership test is O(1) rather
# than a scan of the stop-word list.
ignored_tokens = set(stpw) | {'[CLS]', '[SEP]'}

# sent_freq_dict: sentiment name -> {token: occurrences across that label's conversations}
sent_freq_dict = dict()
for label, sent in id_sent.items():
    freq_dict = dict()
    for text in _traindf[_traindf['label'] == label]['conv'].values:
        for token in text.split():
            if token not in ignored_tokens:
                freq_dict[token] = freq_dict.get(token, 0) + 1
    sent_freq_dict[sent] = freq_dict

In [None]:
# lets see how the words are distributed in each label
# sent_top20words: sentiment name -> its 20 most frequent tokens, most frequent first.
sent_top20words = dict()
for sent in id_sent.values():
    word_counts = sent_freq_dict[sent]
    # Stable sort on the count, so tied words keep their first-seen order.
    top20_words = sorted(word_counts, key=word_counts.get, reverse=True)[:20]
    sent_top20words[sent] = top20_words
    print(f'{sent}:\n{top20_words}\n')

In [None]:
# simply check the similarities between each two labels by top20 words cooccurrance
# or we can enhance this analysis by tf-idf score
# NOTE(review): indexing by range(n_labels) assumes the labels present in the
# training data are exactly 0..n_labels-1 -- confirm.
n_labels = _traindf['label'].nunique()

# Build each label's top-word set once instead of inside the double loop.
top_word_sets = [set(sent_top20words[id_sent[i]]) for i in range(n_labels)]

# sim_matrix[i, j] = number of top-20 words shared by labels i and j.
sim_matrix = np.empty(shape=(n_labels, n_labels))
for i in range(n_labels):
    for j in range(n_labels):
        if i == j:
            # The two same labels get full score 20. Without the `else`, this
            # value was always overwritten by the intersection size below.
            sim_matrix[i, j] = 20
        else:
            sim_matrix[i, j] = len(top_word_sets[i].intersection(top_word_sets[j]))

In [None]:
# Heat-map of the label similarity matrix, with sentiment names on both axes.
tick_positions = list(id_sent.keys())
tick_names = list(id_sent.values())

fig, ax = plt.subplots(figsize=(16, 16))
heatmap = ax.matshow(sim_matrix)
fig.colorbar(heatmap, fraction=0.044)

ax.set_xticks(tick_positions, tick_names, rotation=-90, fontsize=12)
ax.set_yticks(tick_positions, tick_names, fontsize=12)
ax.set_title('Sentiment Similarity Matrix', fontdict=dict(fontsize=24))
plt.show()

💡 **Indeed, some labels are similar, e.g. anxious & apprehensive, afraid & terrified.**