In [82]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay

from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from transformers import AdamWeightDecay
from datasets import Dataset, DatasetDict

print(tf.__version__)
print(tf.config.list_physical_devices())

# startegy for training on multiple gpus
mirrored_strategy = tf.distribute.MirroredStrategy()


2.11.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [83]:
path = "/kaggle/input/edos-1m/"

dataset = pd.read_csv(path + "EDOS 1M.csv")
dataset = dataset.head(10000)
classes = dataset["eb+_emot"].unique()


In [89]:
#preparing the new dataset containing utterances pairs

df = dataset.merge(dataset, on='dialogue_id', how='inner') #self join
#creating auxialiary attributes
df['is_first'] = (df['turn_x'] == 1) & (df['turn_y'] == 1)
df['is_last'] = (df['turn_x'] == df['turn_y']) & (df['turn_y'] == df.groupby('dialogue_id')['turn_y'].transform(max))
#keep only first/last utterances and all the consecutive pairs               
df = df[df['is_first'] | df['is_last'] | (df['turn_x'] == df['turn_y'] - 1)]
#display(df,10) 

#df_preceding will be used to predict the last utterance, given the previous context, if it exists
df_preceding = df[df['is_last'] == 0]
df_preceding = df_preceding[['dialogue_id','turn_x','uttr_x','turn_y','uttr_y','eb+_emot_y','is_first']].rename(columns={'eb+_emot_y': 'label'})
#df_following will be used to predict the first utterance, given the following context, if it exists
df_following = df[df['is_first'] == 0]
df_following = df_following[['dialogue_id','turn_x','uttr_x','turn_y','uttr_y','eb+_emot_x','is_last']].rename(columns={'eb+_emot_x': 'label'})

display(df_preceding,10)
display(df_following,10)

Unnamed: 0,dialogue_id,turn_x,uttr_x,turn_y,uttr_y,label,is_first
0,97,1,You moron ! What fool washes diapers by the we...,1,You moron ! What fool washes diapers by the we...,angry,True
1,97,1,You moron ! What fool washes diapers by the we...,2,You useless fool !,furious,False
4,99,1,How dare you sleep !,1,How dare you sleep !,furious,True
5,99,1,How dare you sleep !,2,Up ! Go and clean the house .,prepared,False
8,100,1,Clean the kitchen .,1,Clean the kitchen .,prepared,True
...,...,...,...,...,...,...,...
27758,213856,2,I love the water . I miss the water . When tha...,3,"Diamond rose "" ? I thought it was "" jinx .",surprised,False
27764,213856,3,"Diamond rose "" ? I thought it was "" jinx .",4,"Anyway , you were saying ?",questioning,False
27770,213856,4,"Anyway , you were saying ?",5,"Oh , I-I grew up on the water , imagining I 'd...",nostalgic,False
27776,214055,1,Maybe now you can tell us what 's going on . T...,1,Maybe now you can tell us what 's going on . T...,hopeful,True


10

Unnamed: 0,dialogue_id,turn_x,uttr_x,turn_y,uttr_y,label,is_first
0,97,1,You moron ! What fool washes diapers by the we...,1,You moron ! What fool washes diapers by the we...,angry,True
1,97,1,You moron ! What fool washes diapers by the we...,2,You useless fool !,furious,False
4,99,1,How dare you sleep !,1,How dare you sleep !,furious,True
5,99,1,How dare you sleep !,2,Up ! Go and clean the house .,prepared,False
8,100,1,Clean the kitchen .,1,Clean the kitchen .,prepared,True
...,...,...,...,...,...,...,...
27758,213856,2,I love the water . I miss the water . When tha...,3,"Diamond rose "" ? I thought it was "" jinx .",surprised,False
27764,213856,3,"Diamond rose "" ? I thought it was "" jinx .",4,"Anyway , you were saying ?",questioning,False
27770,213856,4,"Anyway , you were saying ?",5,"Oh , I-I grew up on the water , imagining I 'd...",nostalgic,False
27776,214055,1,Maybe now you can tell us what 's going on . T...,1,Maybe now you can tell us what 's going on . T...,hopeful,True


10

Unnamed: 0,dialogue_id,turn_x,uttr_x,turn_y,uttr_y,label,is_last
1,97,1,You moron ! What fool washes diapers by the we...,2,You useless fool !,angry,False
3,97,2,You useless fool !,2,You useless fool !,furious,True
5,99,1,How dare you sleep !,2,Up ! Go and clean the house .,furious,False
7,99,2,Up ! Go and clean the house .,2,Up ! Go and clean the house .,prepared,True
9,100,1,Clean the kitchen .,2,"I cleaned the kitchen , ma 'am .",prepared,False
...,...,...,...,...,...,...,...
27764,213856,3,"Diamond rose "" ? I thought it was "" jinx .",4,"Anyway , you were saying ?",surprised,False
27770,213856,4,"Anyway , you were saying ?",5,"Oh , I-I grew up on the water , imagining I 'd...",questioning,False
27775,213856,5,"Oh , I-I grew up on the water , imagining I 'd...",5,"Oh , I-I grew up on the water , imagining I 'd...",nostalgic,True
27777,214055,1,Maybe now you can tell us what 's going on . T...,2,273 ) } – What contagion ? – My wife didn 't g...,hopeful,False


10

In [92]:
df_choice = 0 #0 for preceding, 1 for following
# train, validation and test split
if df_choice == 0:
    df = df_preceding
    train_X, valid_X, train_y, valid_y = train_test_split(df[['dialogue_id','uttr_x','uttr_y','is_first']], df['label'], test_size=0.15, stratify= None, shuffle=False)
else:
    df = df_following
    train_X, valid_X, train_y, valid_y = train_test_split(df[['dialogue_id','uttr_x','uttr_y','is_last']], df['label'], test_size=0.15, stratify= None, shuffle=False)

classes = df['label'].unique()
print(len(classes))

display(valid_X,10)
print("train size: ", len(train_X))
print("validation size: ", len(valid_X))

41


Unnamed: 0,dialogue_id,uttr_x,uttr_y,is_first
23494,184152,Can I help you ? I have to visit Thomas Burton...,Can I help you ? I have to visit Thomas Burton...,True
23495,184152,Can I help you ? I have to visit Thomas Burton...,"Hello , I 'm dr . Rockwell . Your uncle is my ...",False
23498,184297,The poor fellow thinks he was in love . A wret...,The poor fellow thinks he was in love . A wret...,True
23499,184297,The poor fellow thinks he was in love . A wret...,"And who is the object of his desire A young , ...",False
23502,184328,Nice to be back in Italy .,Nice to be back in Italy .,True
...,...,...,...,...
27758,213856,I love the water . I miss the water . When tha...,"Diamond rose "" ? I thought it was "" jinx .",False
27764,213856,"Diamond rose "" ? I thought it was "" jinx .","Anyway , you were saying ?",False
27770,213856,"Anyway , you were saying ?","Oh , I-I grew up on the water , imagining I 'd...",False
27776,214055,Maybe now you can tell us what 's going on . T...,Maybe now you can tell us what 's going on . T...,True


10

train size:  8500
validation size:  1500


In [93]:
# model metadata
model_name = "distilbert-base-uncased"
# map expected ids to their labels and viceversa
id2label = dict(zip(range(len(classes)), classes))
label2id = dict(zip(classes, range(len(classes))))
id2label


{0: 'angry',
 1: 'furious',
 2: 'prepared',
 3: 'acknowledging',
 4: 'trusting',
 5: 'confident',
 6: 'hopeful',
 7: 'caring',
 8: 'sentimental',
 9: 'anticipating',
 10: 'wishing',
 11: 'surprised',
 12: 'ashamed',
 13: 'questioning',
 14: 'sad',
 15: 'nostalgic',
 16: 'devastated',
 17: 'terrified',
 18: 'embarrassed',
 19: 'lonely',
 20: 'encouraging',
 21: 'suggesting',
 22: 'content',
 23: 'afraid',
 24: 'impressed',
 25: 'agreeing',
 26: 'apprehensive',
 27: 'proud',
 28: 'annoyed',
 29: 'anxious',
 30: 'grateful',
 31: 'excited',
 32: 'neutral',
 33: 'faithful',
 34: 'guilty',
 35: 'consoling',
 36: 'disgusted',
 37: 'disappointed',
 38: 'jealous',
 39: 'joyful',
 40: 'sympathizing'}

In [98]:
# building the datasets
if df_choice == 0:
    flag = "is_first"
else:
    flag= "is_last"
train_data = Dataset.from_pandas(pd.DataFrame({"text_1": train_X['uttr_x'],"text_2": train_X['uttr_y'], flag: train_X[flag], "label": np.argmax(pd.get_dummies(train_y).to_numpy(), axis=1)}), preserve_index=False)
valid_data = Dataset.from_pandas(pd.DataFrame({"text_1": valid_X['uttr_x'],"text_2": valid_X['uttr_y'], flag: valid_X[flag], "label": np.argmax(pd.get_dummies(valid_y).to_numpy(), axis=1)}), preserve_index=False)

# shuffling is performed at the previous operation -> we need to redefine valid_y
valid_y = valid_data['label']

data = DatasetDict()
data['train'] = train_data
data['validation'] = valid_data

print(data['train'][0])
data

{'text_1': 'You moron ! What fool washes diapers by the well !', 'text_2': 'You moron ! What fool washes diapers by the well !', 'is_first': True, 'label': 3}


DatasetDict({
    train: Dataset({
        features: ['text_1', 'text_2', 'is_first', 'label'],
        num_rows: 8500
    })
    validation: Dataset({
        features: ['text_1', 'text_2', 'is_first', 'label'],
        num_rows: 1500
    })
})

In [101]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize the data
def preprocess_function(examples):
    if df_choice == 0:
        if examples["is_first"]:
            return tokenizer(["",examples["text_2"]], truncation=True)
        else:
            return tokenizer([examples["text_1"],examples["text_2"]], truncation=True)
    else:
        if examples["is_last"]:
            return tokenizer([examples["text_1"],""], truncation=True)
        else:
            return tokenizer([examples["text_1"],examples["text_2"]], truncation=True)

tokenized_data = data.map(preprocess_function, batched=True)

print(tokenized_data["train"][0])
tokenized_data

  0%|          | 0/9 [00:00<?, ?ba/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [88]:
#testing the tokenizer
batch_sentences = [
    "UTTR_X",
    "UTTR_Y",
]

encoded_dict = tokenizer([batch_sentences])
#decoded = tokenizer.decode(encoded_dict["input_ids"])
#print(decoded)
print(encoded_dict)


{'input_ids': [[101, 21183, 16344, 1035, 1060, 102, 21183, 16344, 1035, 1061, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
