In [1]:
import json
from collections import Counter
import pandas as pd

In [2]:
# Read File

file_name = "MaSaC_train_erc.json"
with open(file_name, 'r') as file:
        dataset = json.load(file)

In [3]:
# Number of episodes in dataset

N = len(dataset)
print(N)

343


In [4]:
# Sample Episode

dataset[0]

{'episode': 'roseshantstobeadopted.wmv',
 'speakers': ['maya',
  'indu',
  'rosesh',
  'indu',
  'maya',
  'indu',
  'rosesh',
  'indu',
  'sahil',
  'rosesh',
  'maya',
  'indu',
  'sahil',
  'monish',
  'maya',
  'indu',
  'maya',
  'indu',
  'indu',
  'maya',
  'rosesh',
  'maya'],
 'utterances': ['kya kya bhar ke rakha hai indravadan ne is ghar mein indravadan please tum sari bekar ki chijen bahar kyon nahin fekte?',
  'ok, chalo rosesh chalo bahar',
  'momma! hath chhodiye dad!',
  'dekho maya ya na bol raha hai! fir tum hi kahogi faltu chijen jama karte ho fekte nahin ho, chalo',
  'indravadan, tum kitne salo se ye joke maar rahe ho. koi hota hai aaj tak?',
  'kaikai pe joke maroge to manthra thodi na hasegi',
  'very funny',
  'maya, monisha aur sahil aa rahe hain, dekhna yahi jo un per istemal karta hun aur dekhna vah kitna haste hain',
  'hi guys',
  'hi',
  'hi darling. achcha indravadan, please ya kachra bahar fenko!',
  'ok maya, chalo rosesh tumhen bahar fenk au',
  'mom! 

In [5]:
# Reading the data

episodes_list = []
speakers_list = []
utterances_list = []
emotions_list = []

for i in range(len(dataset)):
        episodes_list.append(dataset[i]['episode'])
        speakers_list.append(dataset[i]['speakers'])
        utterances_list.append(dataset[i]['utterances'])
        emotions_list.append(dataset[i]['emotions'])  
    

In [6]:
c = Counter()
for i in range(N):
    c.update(speakers_list[i])
    
print(len(list(c.keys())))

112


In [7]:
# Speaker Data Analysis

# Data Processing
speakers_data = [None] * N

for i in range(N):
    c = Counter()
    c.update(speakers_list[i])
    speakers_data[i] = c.items()

# Finding Average
instance_count = 0
total_count = 0
for i in range(N):
    for person, utt_count in speakers_data[i]:
        total_count += utt_count
        instance_count += 1

avg_utt_per_person = total_count / instance_count
print("Average Number of Utterances one person performs in an episode is {:.2f}".format( avg_utt_per_person))

# Speaker Count Data Analysis
person_count = 0
for i in range(N):
    for person, utt_count in speakers_data[i]:
        person_count += 1
    
print("Average Number of Speakers in an episode is {:.2f}".format(person_count / N))

Average Number of Utterances one person performs in an episode is 6.88
Average Number of Speakers in an episode is 3.61


In [8]:
total_count

8506

In [9]:
# Utterance Count Data Analysis
utterances_count = [None] * N

for i in range(N):
    utterances_count[i] = len(utterances_list[i])
   
sum = 0
for i in range(N):
    sum += utterances_count[i]
    
print("Average Number of Utterances in an episode is {:.2f}".format(sum / N))

Average Number of Utterances in an episode is 24.80


In [10]:
# Emotion Count Data Analysis
emotions_data = [None] * N

for i in range(N):
    c = Counter()
    c.update(emotions_list[i])
    emotions_data[i] = c.items()
    
c = Counter()
for i in range(N):
    c.update(emotions_list[i])

for count, label in sorted( ((v,k) for k,v in c.items()), reverse=True):
    print(f"Utterances with label {label} is {count}")

Utterances with label neutral is 3909
Utterances with label joy is 1596
Utterances with label anger is 819
Utterances with label sadness is 558
Utterances with label contempt is 542
Utterances with label fear is 514
Utterances with label surprise is 441
Utterances with label disgust is 127


In [11]:
# Emotion Flip Analysis
labels = ["neutral", "anger", "surprise", "fear", "joy", "contempt", "sadness", "disgust"]

# Defining the variables
is_flip = [None] * N
flip_from_emotion = [None] * N
flip_from_index = [None] * N
flip_to_emotion = [None] * N
flip_to_index = [None] * N
for i in range(N):
    k = len(utterances_list[i])
    is_flip[i] = [0] * k
    flip_from_emotion[i] = [None] * k
    flip_from_index[i] = [None] * k
    flip_to_emotion[i] = [None] * k
    flip_to_index[i] = [None] * k

# Filling the variables
for i in range(N):
    speakers = set(speakers_list[i])
    state_dict = {}
    for sp in speakers:
        state_dict[sp] = (None, None)   # (ix, emotion)
    
    for j in range(len(utterances_list[i])):
        sp = speakers_list[i][j]
        sp_emotion = emotions_list[i][j]
        if state_dict[sp][0]==None:
            state_dict[sp] = (j, sp_emotion)
        else:
            if state_dict[sp][1]!=sp_emotion:
                is_flip[i][j] = 1
                flip_from_emotion[i][j] = state_dict[sp][1]
                flip_from_index[i][j] = state_dict[sp][0]
                flip_to_emotion[i][j] = sp_emotion
                flip_to_index[i][j] = j
            
            state_dict[sp] = (j, sp_emotion)          
            
# Data Analysis
flip_pair_dict = {}
for label1 in labels:
    for label2 in labels:
        flip_pair_dict[(label1, label2)] = [0, 0]
        
for i in range(N):
    for j in range(len(utterances_list[i])):
        if is_flip[i][j]==1:
            flip_pair_dict[(flip_from_emotion[i][j], flip_to_emotion[i][j])][0] += 1
            flip_pair_dict[(flip_from_emotion[i][j], flip_to_emotion[i][j])][1] += flip_to_index[i][j] - flip_from_index[i][j]

# Formatting
from decimal import Decimal, getcontext
getcontext().prec = 3

# Displaying Results
count_result = {}
for label1 in labels:
    count_result[label1] = [0] * len(labels)

for i, label1 in enumerate(labels):
    for j, label2 in enumerate(labels):
        count_result[label1][j] = flip_pair_dict[(label1, label2)][0]
        
dist_result = {}
for label1 in labels:
    dist_result[label1] = [0] * len(labels)

for i, label1 in enumerate(labels):
    for j, label2 in enumerate(labels):
        if flip_pair_dict[(label1, label2)][0]!=0:
            dist_result[label1][j] = Decimal(flip_pair_dict[(label1, label2)][1]) / Decimal(flip_pair_dict[(label1, label2)][0])
        else:
            dist_result[label1][j] = 0

print("Count Statistics")       
print("Row Labels are emotion FLIP FROM EMOTION and Column Labels are FLIP TO EMOTION")
df_count = pd.DataFrame(count_result, pd.Index(labels))
print(df_count)
print("\n\n")
print("Distance between utterances of the flip")
print("Row Labels are emotion FLIP FROM EMOTION and Column Labels are FLIP TO EMOTION")
df_dist = pd.DataFrame(dist_result, pd.Index(labels))
print(df_dist)

Count Statistics
Row Labels are emotion FLIP FROM EMOTION and Column Labels are FLIP TO EMOTION
          neutral  anger  surprise  fear  joy  contempt  sadness  disgust
neutral         0    247       154   164  554       210      148       48
anger         260      0        40    38   48        54       44       18
surprise      193     30         0    27   58        25       17        3
fear          170     34        22     0   68        17       33        8
joy           568     46        55    41    0        63       47        6
contempt      202     64        32    17   64         0       21       14
sadness       164     34        20    32   51        16        0        7
disgust        38     21         9     6    4        13        6        0



Distance between utterances of the flip
Row Labels are emotion FLIP FROM EMOTION and Column Labels are FLIP TO EMOTION
         neutral anger surprise  fear   joy contempt sadness disgust
neutral        0  3.14     3.90  2.96  3.09    