In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import shutil
from IPython.display import FileLink

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification
from transformers import AdamWeightDecay
from datasets import Dataset, DatasetDict

print(tf.__version__)
print(tf.config.list_physical_devices())

# startegy for training on multiple gpus
mirrored_strategy = tf.distribute.MirroredStrategy()




2.11.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
path = "/kaggle/input/edos-1m/"

#dataset_percentage = 0.4
dataset = pd.read_csv(path + "EDOS 1M.csv")
#dataset = dataset.head(int(len(dataset)*dataset_percentage))

#dataset = dataset.head(10000)
#classes = dataset["eb+_emot"].unique()


In [4]:
#preparing the new dataset containing utterances pairs

df = dataset.merge(dataset, on='dialogue_id', how='inner') #self join
#creating auxialiary attributes
df['is_first'] = (df['turn_x'] == 1) & (df['turn_y'] == 1)
df['is_last'] = (df['turn_x'] == df['turn_y']) & (df['turn_y'] == df.groupby('dialogue_id')['turn_y'].transform(max))
#keep only first/last utterances and all the consecutive pairs               
df = df[df['is_first'] | df['is_last'] | (df['turn_x'] == df['turn_y'] - 1)]
#display(df,10) 

#df_preceding will be used to predict the last utterance, given the previous context, if it exists
df_preceding = df[df['is_last'] == 0]
df_preceding = df_preceding[['dialogue_id','turn_x','uttr_x','turn_y','uttr_y','eb+_emot_y','is_first']].rename(columns={'eb+_emot_y': 'label'})
#df_following will be used to predict the first utterance, given the following context, if it exists
df_following = df[df['is_first'] == 0]
df_following = df_following[['dialogue_id','turn_x','uttr_x','turn_y','uttr_y','eb+_emot_x','is_last']].rename(columns={'eb+_emot_x': 'label'})
#replace utterances with empty strings for first and last samples of each conversation
df_preceding.loc[df_preceding['is_first'] == 1, 'uttr_x'] = ""
df_following.loc[df_following['is_last'] == 1, 'uttr_y'] = ""

display(df_preceding,10)
display(df_following,10)

Unnamed: 0,dialogue_id,turn_x,uttr_x,turn_y,uttr_y,label,is_first
0,97,1,,1,You moron ! What fool washes diapers by the we...,angry,True
1,97,1,You moron ! What fool washes diapers by the we...,2,You useless fool !,furious,False
4,99,1,,1,How dare you sleep !,furious,True
5,99,1,How dare you sleep !,2,Up ! Go and clean the house .,prepared,False
8,100,1,,1,Clean the kitchen .,prepared,True
...,...,...,...,...,...,...,...
9480212,8820621,2,"Well then , great .",3,This is so stupid . How can I be upset over so...,furious,False
9480218,8820621,3,This is so stupid . How can I be upset over so...,4,It 's negative ?,acknowledging,False
9480224,8820621,4,It 's negative ?,5,"No , it 's positive .",agreeing,False
9480230,8820637,1,,1,Thank God !,grateful,True


10

Unnamed: 0,dialogue_id,turn_x,uttr_x,turn_y,uttr_y,label,is_last
1,97,1,You moron ! What fool washes diapers by the we...,2,You useless fool !,angry,False
3,97,2,You useless fool !,2,,furious,True
5,99,1,How dare you sleep !,2,Up ! Go and clean the house .,furious,False
7,99,2,Up ! Go and clean the house .,2,,prepared,True
9,100,1,Clean the kitchen .,2,"I cleaned the kitchen , ma 'am .",prepared,False
...,...,...,...,...,...,...,...
9480218,8820621,3,This is so stupid . How can I be upset over so...,4,It 's negative ?,furious,False
9480224,8820621,4,It 's negative ?,5,"No , it 's positive .",acknowledging,False
9480229,8820621,5,"No , it 's positive .",5,,agreeing,True
9480231,8820637,1,Thank God !,2,So tell me one of your moves .,grateful,False


10

In [6]:
df_choice = 0 #0 for preceding, 1 for following
dataset_percentage = 0.3

# train, validation and test split
if df_choice == 0:
    df = df_preceding
    samples = int(len(df)*dataset_percentage)
    df = df.sample(n=samples).sort_index() #sort to ensure there is not information leakage between train and test
    train_X, valid_X, train_y, valid_y = train_test_split(df[['dialogue_id','uttr_x','uttr_y','is_first']], df['label'], test_size=0.15, stratify= None, shuffle=False)
else:
    df = df_following
    samples = int(len(df)*dataset_percentage)
    df = df.sample(n=samples).sort_index() #sort to ensure there is not information leakage between train and test
    train_X, valid_X, train_y, valid_y = train_test_split(df[['dialogue_id','uttr_x','uttr_y','is_last']], df['label'], test_size=0.15, stratify= None, shuffle=False)

classes = df['label'].unique()
print(len(classes))

display(train_X,10)
print("train size: ", len(train_X))
print("validation size: ", len(valid_X))

41


Unnamed: 0,dialogue_id,uttr_x,uttr_y,is_first
5,99,How dare you sleep !,Up ! Go and clean the house .,False
13,124,Your name means to trust . It also means truth...,You were given a great name . You 'd better li...,False
21,217,Your medicine . Open your mouth .,Granny ...,False
25,217,Granny ...,Don 't spill it .,False
34,265,Brandon said he 's headed down to Galway .,"Craking , Galway , it 's the same thing . He '...",False
...,...,...,...,...
8010632,1330823,This is really good . Now I 'm the one yelling...,"That is amazing , Beca .",False
8010651,1331422,Uriel . What 's up ?,Any problems with the procedure ? No . It 's t...,False
8010658,1331422,Any problems with the procedure ? No . It 's t...,It is absolutely necessary for me to stick to ...,False
8010686,1331519,"Suppose it doesn 't work , Lord ?","Suppose I rebuke a demon , and it refuses to l...",False


10

train size:  721502
validation size:  127325


In [None]:
# model metadata
model_name = "distilbert-base-uncased"
# map expected ids to their labels and viceversa
id2label = dict(zip(range(len(classes)), classes))
label2id = dict(zip(classes, range(len(classes))))
id2label


In [None]:
# building the datasets
if df_choice == 0:
    flag = "is_first"
else:
    flag= "is_last"
train_data = Dataset.from_pandas(pd.DataFrame({"text_1": train_X['uttr_x'],"text_2": train_X['uttr_y'], flag: train_X[flag], "label": np.argmax(pd.get_dummies(train_y).to_numpy(), axis=1)}), preserve_index=False)
valid_data = Dataset.from_pandas(pd.DataFrame({"text_1": valid_X['uttr_x'],"text_2": valid_X['uttr_y'], flag: valid_X[flag], "label": np.argmax(pd.get_dummies(valid_y).to_numpy(), axis=1)}), preserve_index=False)

# shuffling is performed at the previous operation -> we need to redefine valid_y
valid_y = valid_data['label']

data = DatasetDict()
data['train'] = train_data
data['validation'] = valid_data

print(data['train'][0])
data

In [None]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# tokenize the data
def preprocess_function(examples):
    return tokenizer(examples["text_1"],examples["text_2"], truncation=True)

cols = data["train"].column_names
tokenized_data = data.map(preprocess_function, batched=True, remove_columns=["text_1","text_2",flag])
#okkk
print(tokenized_data["train"][0])
tokenized_data

In [None]:
# metadata
batch_size = 16
num_epochs = 10
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
print("Total train steps: " + str(total_train_steps))
print("Batches per epoch: " + str(batches_per_epoch))

#optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
only_fine_tune = True

# convert datasets to a suitable format for tensorflow
tf_train_dataset = tokenized_data["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_dataset = tokenized_data["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator
)

num_train_steps = len(tf_train_dataset) * num_epochs
print("Number of training steps: " + str(num_train_steps))

lr_scheduler = PolynomialDecay(initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps)

optimizer = Adam(learning_rate=lr_scheduler)

# create callback to save model at the end of each epoch
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="./models/model_checkpoint",
    save_weights_only=False,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True
)
    
# apply parallel computation on kaggle
#with mirrored_strategy.scope():
    # load the model
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(classes), id2label=id2label, label2id=label2id)

# leave only the classification layer trainable
if only_fine_tune:
    for i in range(1):
        model.layers[i].trainable = False

model.compile(
    optimizer = optimizer,
    loss = SparseCategoricalCrossentropy(from_logits=True),
    metrics = ["accuracy"]
)

model.summary()

In [None]:
# training
history = model.fit(
          x=tf_train_dataset,
          validation_data=tf_validation_dataset,
          epochs=num_epochs,                        
          callbacks = [save_model_callback]
)

# save the model
model.save_pretrained("./models/model1")


In [None]:
# validate the model -> accuracy should correspond to final val_accuracy
bert_y = np.argmax(model.predict(tf_validation_dataset)["logits"], axis=1)

print('Results for BERT-based classifier:')
print(classification_report(valid_y, bert_y, target_names=classes))


In [None]:
model.save('bert_model')
shutil.make_archive('bert_model', 'zip', 'bert_model')
FileLink(r'bert_model.zip')

In [None]:
# Get the top 3 predictions for each sample in the validation dataset
bert_y = model.predict(tf_validation_dataset)["logits"]
bert_y_top_3 = np.argsort(bert_y, axis=1)[:, -3:]

# Get the ground truth labels for the validation dataset
validation_labels = np.asarray(valid_y)

# Compute top 3 accuracy
top_3_accuracy = np.mean(np.any(bert_y_top_3 == validation_labels.reshape(validation_labels.shape[0], 1), axis=1))
print("Top-3 accuracy: "+str(top_3_accuracy))