# Dataset Explorer

In [1]:
import pickle as pkl
import numpy as np
from tqdm import tqdm
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import itertools

import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
import pandas as pd

In [13]:
# Load the pickled train/test splits of MOSI and MOSEI, in that order.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
loaded_splits = []
for split_path in (
    "/data1/multimodal/MOSI/train.pkl",
    "/data1/multimodal/MOSI/test.pkl",
    "/data1/multimodal/MOSEI/train.pkl",
    "/data1/multimodal/MOSEI/test.pkl",
):
    with open(split_path, 'rb') as f:
        loaded_splits.append(pkl.load(f))

mosi_train, mosi_test, mosei_train, mosei_test = loaded_splits

In [19]:
def observe(data):
    """Browse a dataset split interactively and plot its label distribution.

    Three things happen, in order:
      1. an ipywidgets dropdown is registered that prints the fields of the
         selected sample,
      2. the number of samples is printed,
      3. a two-panel bar chart is shown: counts per polarity
         ('neg' / 'neutral' / 'pos') and counts per rounded score.

    Args:
        data: Sized, indexable collection of samples. Each sample is unpacked as
            ``((words, visual, acoustic, actual_words, vlen, alen), label, segment)``
            and the sentiment score is read from ``label[0][0]``.
    """
    rows = []
    for sample in data:
        # MOSI and MOSEI sentiment labels are located in the first column of
        # the sentiment matrix.
        score = sample[1][0][0]

        # Clamp the score to the [-3, 3] range.
        if score > 3.:
            score = 3
        elif score < -3.:
            score = -3

        # Polarity bucket; everything that is neither zero nor positive is 'neg'.
        if score == 0.:
            polarity = 'neutral'
        elif score > 0.:
            polarity = 'pos'
        else:
            polarity = 'neg'

        rows.append([score, polarity, round(score), sample[2]])

    label_df = pd.DataFrame(rows, columns=['values', 'binary', '7class', 'segment'])

    @interact
    def get_mosi_sample(idx = range(len(data))):
        (words, visual, acoustic, actual_words, vlen, alen), label, segment = data[idx]
        print("ACTUAL WORDS", actual_words)
        print("TEXT length", len(words))
        print("VISUAL", visual.shape)
        print("ACOUSTIC", acoustic.shape)
        print("LABEL", label)
        print("SEGMENT", segment)

    # NOTE(review): the message says "TRAIN" for every split, test splits included.
    print("TRAIN SIZE", len(data))

    order = [-3, -2, -1, 0, 1, 2, 3]

    # Count samples per class; any fully populated column works as the count.
    df_binary = label_df.groupby('binary').count().reset_index()
    df_7class = label_df.groupby('7class').count().reset_index()

    fig1 = px.bar(df_binary, x='binary', y='segment')
    fig2 = px.bar(df_7class, x='7class', y='segment')

    # Move the traces of both express figures into one side-by-side figure.
    # add_trace(row=, col=) replaces the deprecated append_trace.
    this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Binary", "7-class"))
    for col, source_fig in enumerate((fig1, fig2), start=1):
        for trace in source_fig.data:
            this_figure.add_trace(trace, row=1, col=col)

    this_figure.update_xaxes(categoryorder='array', categoryarray=order)
    this_figure.update_layout(height=500, width=1200, title_text="Ground-truth distribution")
    this_figure.show()

In [15]:
def distplot(data):
    """Show a plotly figure-factory distplot of the raw sentiment scores.

    Args:
        data: Iterable of samples; the score is read from ``sample[1][0][0]``
            and the segment id from ``sample[2]``.
    """
    records = [[sample[1][0][0], sample[2]] for sample in data]
    label_df = pd.DataFrame(records, columns=['values', 'segment'])

    # A single group holding every score, unclamped.
    fig = ff.create_distplot([label_df['values']], ['distplot'])
    fig.layout.update({'height': 500, 'width': 800})
    fig.show()

In [20]:
# MOSI train data: interactive sample browser plus label distribution chart.
observe(mosi_train)

interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…

TRAIN SIZE 1283


In [6]:
# Distribution plot of the raw sentiment scores in the MOSI train split.
distplot(mosi_train)

In [7]:
# MOSI test data: interactive sample browser plus label distribution chart.
# Note that observe() prints the size under the label "TRAIN SIZE" for every split.
observe(mosi_test)

interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…

TRAIN SIZE 686


In [8]:
# Distribution plot of the raw sentiment scores in the MOSI test split.
distplot(mosi_test)

In [9]:
# MOSEI train data: interactive sample browser plus label distribution chart.
observe(mosei_train)

interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…

TRAIN SIZE 16315


In [10]:
# Distribution plot of the raw sentiment scores in the MOSEI train split.
distplot(mosei_train)

In [11]:
# MOSEI test data: interactive sample browser plus label distribution chart.
# Note that observe() prints the size under the label "TRAIN SIZE" for every split.
observe(mosei_test)

interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…

TRAIN SIZE 4654


In [12]:
# Distribution plot of the raw sentiment scores in the MOSEI test split.
distplot(mosei_test)

# Data Exploration - noalign

In [17]:
# Load the MOSI train/test pickles stored under the MMIM directory.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
noalign_splits = []
for split_path in (
    "/data1/multimodal/MMIM/MOSI/train.pkl",
    "/data1/multimodal/MMIM/MOSI/test.pkl",
):
    with open(split_path, 'rb') as f:
        noalign_splits.append(pkl.load(f))

mosi_train_noalign, mosi_test_noalign = noalign_splits

In [22]:
# TODO: MIM is designed not to pass text embeddings. --> Implement passing a text embedding for each sentence.
observe(mosi_test_noalign)

interactive(children=(Dropdown(description='idx', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…

TRAIN SIZE 686


# MOSI

## TFN

### 1. Ground-truth Distribution

In [42]:
# Load the TFN result dictionary.
# Keys per the original note: segment, labels, labels_2, labels_7, preds, preds_2, preds_7.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/TFN.pkl", "rb") as handle:
    tfn = pkl.load(handle)

In [43]:
# Number of entries stored under 'segment' in the TFN results.
print(len(tfn['segment']))

686


In [44]:
# Ground-truth label distribution from the TFN results:
# binary labels on the left, 7-class labels on the right.
d = {'binary': tfn['labels_2'], '7_class': tfn['labels_7']}
df = pd.DataFrame(data=d)
order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df, x='binary')
fig2 = px.bar(df, x='7_class')

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# Move the traces of both express figures into one side-by-side figure.
# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Binary", "7-class"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_layout(height=500, width=1200, title_text="Ground-truth distribution")
this_figure.show()

### 2. Fusion result distribution

In [45]:
# Compare the 7-class prediction distributions of two TFN result files,
# shown under the subplot titles "TFN-40d" and "TFN-16d".
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/TFN_16d.pkl", "rb") as handle:
    tfn_16d = pkl.load(handle)

df1 = pd.DataFrame(data=tfn)
df2 = pd.DataFrame(data=tfn_16d)
order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df1, x="preds_7")
fig2 = px.bar(df2, x="preds_7")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("TFN-40d", "TFN-16d"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_layout(height=600, width=1200, title_text="TFN frozen dimension")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,300])
this_figure.show()

In [46]:
# Ground-truth 7-class labels (left) next to the 7-class predictions stored in
# results/TFN_fusion.pkl (right).
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/TFN_fusion.pkl", "rb") as handle:
    tfn_origin = pkl.load(handle)

df1 = pd.DataFrame(data=tfn)
df2 = pd.DataFrame(data=tfn_origin)
order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df1, x="labels_7")
fig2 = px.bar(df2, x="preds_7")
# fig3 (predictions from `tfn`) is built but not placed: the grid has only two columns.
fig3 = px.bar(df1, x="preds_7")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)
fig3_traces = list(fig3.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Ground-truth", "TFN"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_layout(title_text="Fusion result distribution")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,200])
this_figure.show()

### 3. Unimodal result distribution

In [47]:
# 7-class prediction distribution per modality (text / audio / video) and for the fusion.
#
# The previous version appended the seven class names to every list inside `tfn`
# so that empty classes would still show on the axis. That mutated the shared
# result dict, inflated every count by one and grew again on each re-run.
# Here the counts are computed without touching `tfn`, and missing classes are
# filled with zero so every class still appears.
order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

df = pd.DataFrame(data=tfn)

count_frames = {}
for column in ("text_7", "audio_7", "video_7", "preds_7"):
    counts = df[column].value_counts()
    # Keep any value that is not one of the seven known classes after them.
    extras = [value for value in counts.index if value not in order]
    counts = counts.reindex(order + extras, fill_value=0)
    count_frames[column] = counts.rename_axis(column).reset_index(name="count")

fig1 = px.bar(count_frames["text_7"], x="text_7", y="count")
fig2 = px.bar(count_frames["audio_7"], x="audio_7", y="count")
fig3 = px.bar(count_frames["video_7"], x="video_7", y="count")
fig4 = px.bar(count_frames["preds_7"], x="preds_7", y="count")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)
fig3_traces = list(fig3.data)
fig4_traces = list(fig4.data)

# Place the four panels on a 2x2 grid; add_trace replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=2, cols=2, subplot_titles=("Text-only", "Acoustic-only", "Visual-only", "TFN-fusion"))
panel_positions = ((1, 1), (1, 2), (2, 1), (2, 2))
panel_traces = (fig1_traces, fig2_traces, fig3_traces, fig4_traces)
for (row, col), cell_traces in zip(panel_positions, panel_traces):
    for trace in cell_traces:
        this_figure.add_trace(trace, row=row, col=col)

this_figure.update_layout(height=800, width=1200, title_text="Unimodal results distribution")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,500])
this_figure.show()

## MMIM

In [4]:
import pickle as pkl

# Load the MMIM result dictionary for MOSI.
# Keys per the original note: segment, labels, labels_2, labels_7, preds, preds_2, preds_7.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/MMIM_mosi.pkl", "rb") as handle:
    mmim = pkl.load(handle)

In [10]:
# Number of entries stored under 'segment' in the MMIM (MOSI) results.
print(len(mmim['segment']))

686


In [5]:
import plotly.express as px
import plotly.subplots as sp
import pandas as pd

# Ground-truth vs. MMIM-predicted 7-class distribution on MOSI.
df = pd.DataFrame(data=mmim)
# Per-class sample counts; the 'segment' column is used as the count.
df_gold = df.groupby('labels_7').count().reset_index()
df_pred = df.groupby('preds_7').count().reset_index()

order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df_gold, x="labels_7", y="segment")
fig2 = px.bar(df_pred, x="preds_7", y="segment")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Ground-truth", "MMIM"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_layout(title_text="7-Classification Result")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,200])
this_figure.show()

In [13]:
# Pull out the MMIM result entries used by the per-class error analysis.
segment_list, preds, labels_7 = (mmim[field] for field in ("segment", "preds", "labels_7"))

In [29]:
# Absolute prediction error grouped by ground-truth 7-class label.
# NOTE(review): the error is measured against the nominal value of the class
# (-3 ... 3), not against a real-valued label - confirm this is intended.
class_targets = {
    'very negative': -3.0,
    'negative': -2.0,
    'weakly negative': -1.0,
    'Neutral': None,  # compared against zero, i.e. plain abs(pred)
    'weakly positive': 1.0,
    'positive': 2.0,
    'very positive': 3.0,
}
errors_by_class = {class_name: [] for class_name in class_targets}

for idx, _ in enumerate(segment_list):
    gold_class = labels_7[idx]
    if gold_class not in class_targets:
        continue
    target = class_targets[gold_class]
    if target is None:
        deviation = abs(preds[idx])
    else:
        deviation = abs(preds[idx] - target)
    # Each prediction is indexable; its first element is the scalar value.
    errors_by_class[gold_class].append(deviation[0])

error_VN = errors_by_class['very negative']
error_N = errors_by_class['negative']
error_SN = errors_by_class['weakly negative']
error_Neu = errors_by_class['Neutral']
error_SP = errors_by_class['weakly positive']
error_P = errors_by_class['positive']
error_VP = errors_by_class['very positive']

In [32]:
import plotly.graph_objects as go
import numpy as np

# One box per ground-truth class: (errors, legend name, marker colour).
box_specs = (
    (error_VN, "very negative", "#1984c5"),
    (error_N, "negative", "#22a7f0"),
    (error_SN, "weakly negative", "#63bff0"),
    (error_Neu, "Neutral", "#63bff0"),
    (error_SP, "weakly positive", "#63bff0"),
    (error_P, "positive", "#22a7f0"),
    (error_VP, "very positive", "#1984c5"),
)

fig = go.Figure()
for class_errors, class_name, color in box_specs:
    fig.add_trace(go.Box(y=class_errors, name=class_name, marker_color=color))

# Applied before the scatter trace is added, so only the boxes are updated.
fig.update_traces(boxpoints='all', jitter=0)

# Overlay the mean error of each class; `order` must already hold the class names.
mean_errors = [np.mean(class_errors) for class_errors, _, _ in box_specs]
fig.add_trace(
    go.Scatter(x=order, y=mean_errors, mode='markers', name="Mean Absolute Error", marker_color="red")
)
fig.update_layout(yaxis_title="Absolute Error", title_text="MIM Sentiment Intensity Prediction Error Distribution")
fig.show()

# MOSEI

## TFN

### 1. Ground-truth

In [48]:
# Load the TFN result dictionary for MOSEI.
# Keys per the original note: segment, labels, labels_2, labels_7, preds, preds_2,
# preds_7, preds_text, text_2, text_7, preds_video, ...
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/TFN_mosei.pkl", "rb") as handle:
    tfn_mosei = pkl.load(handle)

In [49]:
# Number of entries stored under 'segment' in the TFN (MOSEI) results.
print(len(tfn_mosei['segment']))

4654


In [50]:
import plotly.express as px
import plotly.subplots as sp
import pandas as pd

# Ground-truth label distribution on MOSEI: binary (left) and 7-class (right).
df = pd.DataFrame(data=tfn_mosei)
# Per-class sample counts; the 'segment' column is used as the count.
df_2 = df.groupby('labels_2').count().reset_index()
df_7 = df.groupby('labels_7').count().reset_index()

order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df_2, x='labels_2', y='segment')
fig2 = px.bar(df_7, x='labels_7', y='segment')

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Binary", "7-class"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_layout(height=500, width=1200, title_text="Ground-truth distribution")
this_figure.update_yaxes(range=[0,2500])
this_figure.show()

### 2. Fusion result distribution

In [51]:
# Ground-truth vs. TFN-predicted 7-class distribution on MOSEI.
df = pd.DataFrame(data=tfn_mosei)
# Per-class sample counts; the 'segment' column is used as the count.
df_gold = df.groupby('labels_7').count().reset_index()
df_pred = df.groupby('preds_7').count().reset_index()

order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df_gold, x="labels_7", y="segment")
fig2 = px.bar(df_pred, x="preds_7", y="segment")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Ground-truth", "TFN-frozen"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_layout(height=800, width=1200, title_text="Fusion result distribution")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,4500])
this_figure.show()

### 3. Unimodal result distribution

In [52]:
# 7-class prediction distribution per modality (text / audio / video) and for
# the fusion on MOSEI. Counts come from a groupby, so the result dict is not modified.
# `order` is defined here so the cell does not rely on an earlier cell having run.
order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

df = pd.DataFrame(data=tfn_mosei)
# Per-class sample counts; the 'segment' column is used as the count.
df_t = df.groupby('text_7').count().reset_index()
df_a = df.groupby('audio_7').count().reset_index()
df_v = df.groupby('video_7').count().reset_index()
df_f = df.groupby('preds_7').count().reset_index()

fig1 = px.bar(df_t, x="text_7", y="segment")
fig2 = px.bar(df_a, x="audio_7", y="segment")
fig3 = px.bar(df_v, x="video_7", y="segment")
fig4 = px.bar(df_f, x="preds_7", y="segment")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)
fig3_traces = list(fig3.data)
fig4_traces = list(fig4.data)

# Place the four panels on a 2x2 grid; add_trace replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=2, cols=2, subplot_titles=("Text-only", "Acoustic-only", "Visual-only", "TFN-fusion"))
panel_positions = ((1, 1), (1, 2), (2, 1), (2, 2))
panel_traces = (fig1_traces, fig2_traces, fig3_traces, fig4_traces)
for (row, col), cell_traces in zip(panel_positions, panel_traces):
    for trace in cell_traces:
        this_figure.add_trace(trace, row=row, col=col)

this_figure.update_layout(height=800, width=1200, title_text="Unimodal results distribution")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.show()

## MMIM

In [57]:
# Load the MMIM result dictionary for MOSEI.
# Keys per the original note: segment, labels, labels_2, labels_7, preds, preds_2, preds_7.
# NOTE(review): pickle can execute arbitrary code while loading; only open trusted files.
with open("results/MMIM_mosei.pkl", "rb") as handle:
    mmim_mosei = pkl.load(handle)

In [58]:
# Number of entries stored under 'segment' in the MMIM (MOSEI) results.
print(len(mmim_mosei['segment']))

4654


In [61]:
import plotly.express as px
import plotly.subplots as sp
import pandas as pd

# Ground-truth vs. MMIM-predicted 7-class distribution on MOSEI.
df = pd.DataFrame(data=mmim_mosei)
# Per-class sample counts; the 'segment' column is used as the count.
df_gold = df.groupby('labels_7').count().reset_index()
df_pred = df.groupby('preds_7').count().reset_index()

order = ['very negative', 'negative', 'weakly negative', 'Neutral', 'weakly positive', 'positive', 'very positive']

fig1 = px.bar(df_gold, x="labels_7", y="segment")
fig2 = px.bar(df_pred, x="preds_7", y="segment")

fig1_traces = list(fig1.data)
fig2_traces = list(fig2.data)

# add_trace(row=, col=) replaces the deprecated append_trace.
this_figure = sp.make_subplots(rows=1, cols=2, subplot_titles=("Ground-truth", "MMIM"))
for col, col_traces in enumerate((fig1_traces, fig2_traces), start=1):
    for trace in col_traces:
        this_figure.add_trace(trace, row=1, col=col)

this_figure.update_layout(title_text="7-Classification Result")
this_figure.update_xaxes(categoryorder='array', categoryarray=order)
this_figure.update_yaxes(range=[0,2500])
this_figure.show()