In [1]:
%load_ext autoreload
%autoreload 2

# Datasets

## Empathetic Dialogue Data

In [17]:
import pandas as pd
import re

import torch
from transformers import WEIGHTS_NAME, AutoConfig, AutoModelForCausalLM, AutoTokenizer

tokenizer_name = "microsoft/DialoGPT-medium"
model_name = "microsoft/DialoGPT-medium"


tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_name
)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [18]:
tokenizer.eos_token

'<|endoftext|>'

In [19]:
tokenizer.pad_token = "<|pad|>"

In [20]:
tokenizer.pad_token_id

50256

In [21]:
dataset_type = "test"

data_filename = f"../../data/empatheticdialogues/{dataset_type}.csv"

In [22]:
df = pd.read_csv(data_filename, on_bad_lines="skip", delimiter=",", lineterminator="\n")

In [23]:
df.shape

(5701, 8)

In [24]:
df.iloc[112]

conv_id                                           hit:275_conv:551
utterance_idx                                                    3
context                                                    jealous
prompt           Once_comma_ I had plans to meet up with an old...
speaker_idx                                                    117
utterance        Well_comma_ right after she gave me a big hug_...
selfeval                                               4|5|5_5|5|5
tags                                                           NaN
Name: 112, dtype: object

In [25]:
df["context"].value_counts()

surprised       291
grateful        221
proud           221
sentimental     205
excited         202
annoyed         198
sad             195
disgusted       190
disappointed    188
joyful          187
jealous         183
angry           181
embarrassed     179
impressed       178
caring          177
hopeful         175
nostalgic       173
prepared        173
content         172
anxious         171
lonely          171
confident       170
anticipating    166
afraid          164
apprehensive    158
terrified       155
furious         153
guilty          149
devastated      148
trusting        147
ashamed         143
faithful        117
Name: context, dtype: int64

In [26]:
context = sorted(list(df["context"].value_counts().keys()))

In [27]:
context

['afraid',
 'angry',
 'annoyed',
 'anticipating',
 'anxious',
 'apprehensive',
 'ashamed',
 'caring',
 'confident',
 'content',
 'devastated',
 'disappointed',
 'disgusted',
 'embarrassed',
 'excited',
 'faithful',
 'furious',
 'grateful',
 'guilty',
 'hopeful',
 'impressed',
 'jealous',
 'joyful',
 'lonely',
 'nostalgic',
 'prepared',
 'proud',
 'sad',
 'sentimental',
 'surprised',
 'terrified',
 'trusting']

In [28]:
df[["context", "speaker_idx", "utterance"]]

Unnamed: 0,context,speaker_idx,utterance
0,guilty,0,Yeah about 10 years ago I had a horrifying exp...
1,guilty,0,No I wasn't hit. It turned out they were drunk...
2,guilty,0,I don't know I was new to driving and hadn't e...
3,caring,45,Well_comma_ can you tell me about your experie...
4,caring,45,Oh my goodness_comma_ that's very scary! I hop...
...,...,...,...
5696,grateful,437,Glad you think so as well!
5697,disgusted,481,I saw a huge cockroach outside my house today!
5698,disgusted,481,Not yet since it's the weekend. We live in Tex...
5699,anxious,481,I have a big test on Monday_comma_ I am so ner...


In [29]:
# Check the lengths of each utterance
df["tokenized_len"] = df["utterance"].apply(lambda x: len(tokenizer.encode(x) + [tokenizer.eos_token_id]))

Token indices sequence length is longer than the specified maximum sequence length for this model (1842 > 1024). Running this sequence through the model will result in indexing errors


In [30]:
tokenizer.max_len_single_sentence

1024

In [31]:
model.config

GPT2Config {
  "_name_or_path": "microsoft/DialoGPT-medium",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1024,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 24,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "transformers_version": "4.19.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [32]:
# Filter out anything that exceed the model max length tokenizer
exceeds_length = df[df["tokenized_len"] > tokenizer.max_len_single_sentence]
print(exceeds_length.shape)
exceeds_length["utterance"]

(4, 9)


1111    My grandpa is coming to visit!,5|5|5_5|5|5,\nh...
1113    I ate a gas station burrito for lunch. It was ...
1144    I am waiting to receive my final grade for a s...
4103    I am by myself most of the time in a isolated ...
Name: utterance, dtype: object

In [33]:
df[df["tokenized_len"] <= tokenizer.max_len_single_sentence].shape

(5697, 9)

In [34]:
# Get rows that do not exceed the length
df = df[~df["conv_id"].isin(exceeds_length["conv_id"])].reset_index(drop=True)

In [35]:
df[["conv_id", "utterance_idx", "speaker_idx", "utterance"]]

Unnamed: 0,conv_id,utterance_idx,speaker_idx,utterance
0,hit:0_conv:0,1,0,Yeah about 10 years ago I had a horrifying exp...
1,hit:0_conv:0,3,0,No I wasn't hit. It turned out they were drunk...
2,hit:0_conv:0,5,0,I don't know I was new to driving and hadn't e...
3,hit:34_conv:69,1,45,Well_comma_ can you tell me about your experie...
4,hit:34_conv:69,3,45,Oh my goodness_comma_ that's very scary! I hop...
...,...,...,...,...
5690,hit:12413_conv:24826,3,437,Glad you think so as well!
5691,hit:12416_conv:24832,1,481,I saw a huge cockroach outside my house today!
5692,hit:12416_conv:24832,3,481,Not yet since it's the weekend. We live in Tex...
5693,hit:12423_conv:24847,1,481,I have a big test on Monday_comma_ I am so ner...


In [36]:
# Prepend the context (e.g. sentimental, sad, etc.) to the utterance
# df["text"] = df["context"] + " " + df["speaker_idx"].apply(lambda s: f"<speaker{str(s % 2)}>") + df["utterance"] + df["speaker_idx"].apply(lambda s: f"</speaker{str(s % 2)}>")
#df["text"] = df["context"] + " " + df["utterance"].apply(lambda x: x.replace("_comma_", ","))
# df["text"] = df["context"] + " " + df["utterance"] + df["speaker_idx"].apply(lambda s: f"<s{str(s % 2)}>")
#df.apply(lambda x: x.prompt in x.utterance, axis=1)
df["text"] = df["context"] + " " + df["utterance"]

In [37]:
df["text"][0]

'guilty Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.'

In [38]:
print(df.iloc[0]["prompt"])
print(df.iloc[0]["utterance"])

I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones.
Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.


In [39]:
df.iloc[0]["prompt"]

"I felt guilty when I was driving home one night and a person tried to fly into my lane_comma_ and didn't see me. I honked and they swerved back into their lane_comma_ slammed on their brakes_comma_ and hit the water cones."

In [40]:
turn_token = tokenizer.eos_token
concat_df = df.groupby(["conv_id", "context"])["text"].apply(turn_token.join).reset_index()

In [41]:
text = concat_df.iloc[0].text
text

"guilty Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.<|endoftext|>guilty No I wasn't hit. It turned out they were drunk. I felt guilty but realized it was his fault.<|endoftext|>guilty I don't know I was new to driving and hadn't experienced anything like that. I felt like my horn made him swerve into the water barrels."

In [42]:
texts = text.split(turn_token)
texts

['guilty Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.',
 "guilty No I wasn't hit. It turned out they were drunk. I felt guilty but realized it was his fault.",
 "guilty I don't know I was new to driving and hadn't experienced anything like that. I felt like my horn made him swerve into the water barrels."]

In [43]:
# Get the count of utterances in each row
utterance_counts = concat_df["text"].apply(lambda r: len(r.split(turn_token)))

In [44]:
# Add column with utterance counts
concat_df["count"] = utterance_counts

In [45]:
# Get the most common utterance count
concat_df["count"].value_counts()

2    1984
3     483
4      69
1       2
Name: count, dtype: int64

In [46]:
# Get utterance count that's most common
num_turns = concat_df["count"].value_counts().idxmax()
print(num_turns)

2


In [47]:
labels = sorted(concat_df["context"].unique())
print(len(labels))
labels

32


['afraid',
 'angry',
 'annoyed',
 'anticipating',
 'anxious',
 'apprehensive',
 'ashamed',
 'caring',
 'confident',
 'content',
 'devastated',
 'disappointed',
 'disgusted',
 'embarrassed',
 'excited',
 'faithful',
 'furious',
 'grateful',
 'guilty',
 'hopeful',
 'impressed',
 'jealous',
 'joyful',
 'lonely',
 'nostalgic',
 'prepared',
 'proud',
 'sad',
 'sentimental',
 'surprised',
 'terrified',
 'trusting']

In [48]:
# Explode text into individual rows
preprocessed_df = concat_df["text"].apply(lambda r: [f"<s{i % 2}>{s}" for i, s in enumerate(r.split(turn_token))]).explode().reset_index()
preprocessed_df

Unnamed: 0,index,text
0,0,<s0>guilty Yeah about 10 years ago I had a hor...
1,0,<s1>guilty No I wasn't hit. It turned out they...
2,0,<s0>guilty I don't know I was new to driving a...
3,1,<s0>surprised I got something nice the other d...
4,1,<s1>surprised It was_comma_ and he does that t...
...,...,...
5690,2536,<s0>ashamed I borrowed a book from the library...
5691,2536,<s1>ashamed Yes it did_comma_ unfortunately
5692,2537,<s0>hopeful my husband lost a job but i'm hopi...
5693,2537,<s1>hopeful thank you so much!


In [49]:
preprocessed_df.iloc[112]

index                                                   51
text     <s1>trusting My friend freaked out but I was p...
Name: 112, dtype: object

In [50]:
df.iloc[112]

conv_id                                           hit:275_conv:551
utterance_idx                                                    3
context                                                    jealous
prompt           Once_comma_ I had plans to meet up with an old...
speaker_idx                                                    117
utterance        Well_comma_ right after she gave me a big hug_...
selfeval                                               4|5|5_5|5|5
tags                                                           NaN
tokenized_len                                                   26
text             jealous Well_comma_ right after she gave me a ...
Name: 112, dtype: object

In [51]:
# get speaker
def get_speaker(text):
    results = re.findall(r"^<s\d>", text)
    return results
    
    
def clean_text(text):
    text = re.sub(r"^<s\d>", "", text)
    text = re.sub(r"{turn_token}$", "", text)
    return re.sub(r"_comma_", ", ", text)

In [52]:
get_speaker(preprocessed_df["text"].iloc[0])

['<s0>']

In [53]:
preprocessed_df["speaker"] = preprocessed_df["text"].apply(lambda t: get_speaker(t)[0])

In [54]:
preprocessed_df

Unnamed: 0,index,text,speaker
0,0,<s0>guilty Yeah about 10 years ago I had a hor...,<s0>
1,0,<s1>guilty No I wasn't hit. It turned out they...,<s1>
2,0,<s0>guilty I don't know I was new to driving a...,<s0>
3,1,<s0>surprised I got something nice the other d...,<s0>
4,1,<s1>surprised It was_comma_ and he does that t...,<s1>
...,...,...,...
5690,2536,<s0>ashamed I borrowed a book from the library...,<s0>
5691,2536,<s1>ashamed Yes it did_comma_ unfortunately,<s1>
5692,2537,<s0>hopeful my husband lost a job but i'm hopi...,<s0>
5693,2537,<s1>hopeful thank you so much!,<s1>


In [55]:
preprocessed_df["text"] = preprocessed_df["text"].apply(lambda t: clean_text(t))

In [56]:
preprocessed_df.iloc[0].text

'guilty Yeah about 10 years ago I had a horrifying experience. It was 100% their fault but they hit the water barrels and survived. They had no injuries but they almost ran me off the road.'

In [57]:
preprocessed_df

Unnamed: 0,index,text,speaker
0,0,guilty Yeah about 10 years ago I had a horrify...,<s0>
1,0,guilty No I wasn't hit. It turned out they wer...,<s1>
2,0,guilty I don't know I was new to driving and h...,<s0>
3,1,"surprised I got something nice the other day, ...",<s0>
4,1,"surprised It was, and he does that type of th...",<s1>
...,...,...,...
5690,2536,ashamed I borrowed a book from the library the...,<s0>
5691,2536,"ashamed Yes it did, unfortunately",<s1>
5692,2537,hopeful my husband lost a job but i'm hoping h...,<s0>
5693,2537,hopeful thank you so much!,<s1>


In [58]:
preprocessed_df.drop(["index"], axis=1, inplace=True)

In [59]:
preprocessed_df.iloc[112]

text       trusting My friend freaked out but I was prett...
speaker                                                 <s1>
Name: 112, dtype: object

In [60]:
preprocessed_df.to_csv(f"../../data/empathetic_dialogue_processed_{dataset_type}.csv")

### Prepare data

In [63]:
from src.utils import prepare_data

In [64]:
preprocessed_df

Unnamed: 0,text,speaker
0,guilty Yeah about 10 years ago I had a horrify...,<s0>
1,guilty No I wasn't hit. It turned out they wer...,<s1>
2,guilty I don't know I was new to driving and h...,<s0>
3,"surprised I got something nice the other day, ...",<s0>
4,"surprised It was, and he does that type of th...",<s1>
...,...,...
5690,ashamed I borrowed a book from the library the...,<s0>
5691,"ashamed Yes it did, unfortunately",<s1>
5692,hopeful my husband lost a job but i'm hoping h...,<s0>
5693,hopeful thank you so much!,<s1>


In [65]:
preprocessed_df[preprocessed_df["speaker"] == "<s1>"]

Unnamed: 0,text,speaker
1,guilty No I wasn't hit. It turned out they wer...,<s1>
4,"surprised It was, and he does that type of th...",<s1>
6,trusting Maybe A move out of my state would be...,<s1>
8,"faithful My girlfriend, How about you? do you...",<s1>
11,trusting Surprisingly it's been a painless pro...,<s1>
...,...,...
5685,"annoyed No, she is fine. I had to find someth...",<s1>
5687,annoyed I would have totally flipped the circu...,<s1>
5689,disgusted It probably will.,<s1>
5691,"ashamed Yes it did, unfortunately",<s1>


In [70]:
preprocessed_df["text"] = preprocessed_df["text"].apply(lambda t: t.replace("_comma_", ","))

In [73]:
prepared_df = prepare_data(data=preprocessed_df, filter_by="speaker==<s1>", text_key="text", num_history=7, test_size=None)

In [74]:
prepared_df.sample()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
1008,afraid Well I knew he was following us and so ...,afraid Me and my mom were at this grocery stor...,"guilty Yes, I know thats what I was thinking....",guilty Once in college this guy I was just fri...,prepared Yes she kind of is. I dont think she ...,prepared We are going to eat tonight. Then tom...,prepared My wife has a big birthday coming up ...,caring I hope this feeling increases


In [75]:
prepared_df.to_csv(f"ed_train__{dataset_type}.csv")

In [76]:
from src.dataset import *

## Counsel Chat Data

In [12]:
counselchat_data = pd.read_csv("../../data/counsel_chat/counselchat-data.csv")

In [13]:
counselchat_data.shape

(1482, 9)

In [14]:
counselchat_data.head(5)

Unnamed: 0,questionID,questionTitle,questionText,questionUrl,topics,therapistName,therapistUrl,answerText,upvotes
0,5566fab2a64752d71ec3ca69,Escalating disagreements between mother and wife,My wife and mother are having tense disagreeme...,https://counselchat.com/questions/escalating-d...,Family Conflict,"Kristi King-Morgan, LMSW",https://counselchat.com/therapists/kristi-king...,<p>What you are describing is something psycho...,0
1,5566f94fa64752d71ec3ca64,I'm addicted to smoking. How can I stop?,"I'm planning to have baby, so I have to quit s...",https://counselchat.com/questions/i-m-addicted...,"Substance Abuse,Addiction",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi. Good for you in planning ahead to do wh...,0
2,5567d26887a1cc0c3f3d8f46,Keeping secrets from my family,"I have secrets in my mind, and I don't know wh...",https://counselchat.com/questions/keeping-secr...,Family Conflict,Jeevna Bajaj,https://counselchat.com/therapists/jeevna-bajaj,<p>It sounds like keeping the secrets has beco...,0
3,556bed15c969ba5861709df5,The Underlying Causes of Being Possessive,I am extremely possessive in my relationships ...,https://counselchat.com/questions/the-underlyi...,"Behavioral Change,Social Relationships",Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>Hi there. It's great you are able to realiz...,0
4,556ba115c969ba5861709de6,Can I control anxiety without medication?,I had a head injury a few years ago and my min...,https://counselchat.com/questions/can-i-contro...,Anxiety,Rebecca Duellman,https://counselchat.com/therapists/rebecca-due...,<p>You didn't say what or how many medications...,0


In [16]:
counselchat_data[["questionText", "answerText"]]

Unnamed: 0,questionText,answerText
0,My wife and mother are having tense disagreeme...,<p>What you are describing is something psycho...
1,"I'm planning to have baby, so I have to quit s...",<p>Hi. Good for you in planning ahead to do wh...
2,"I have secrets in my mind, and I don't know wh...",<p>It sounds like keeping the secrets has beco...
3,I am extremely possessive in my relationships ...,<p>Hi there. It's great you are able to realiz...
4,I had a head injury a few years ago and my min...,<p>You didn't say what or how many medications...
...,...,...
1477,My grandson's step-mother sends him to school ...,<p>Absolutely not!&nbsp;</p><p>It is never in ...
1478,My boyfriend is in recovery from drug addictio...,<p>I'm sorry you have tension between you and ...
1479,The birth mother attempted suicide several tim...,"<p>The true answer is, ""no one can really say ..."
1480,I think adult life is making him depressed and...,<p>How do you help yourself to believe you req...


In [24]:
answer_text = counselchat_data[["questionText", "answerText"]].iloc[0]["answerText"]
answer_text

"<p>What you are describing is something psychologists have termed &#34;triangulation&#34; which is what happens when one family member will not talk to the one they have a problem with and goes to a third member of the family to complain instead. You have been &#34;triangulated&#34; by your wife and mother.</p><p>This is often seen in families. It's seen everywhere. How many times have you had a problem with someone but you didn't go to them to tell them, you went to someone else to complain? It is usually difficult for a person to confront another, especially in relationships where there is a power differential. For example, I bet it's easier to complain to a coworker about your boss rather than go to the boss with your complaint.</p><p>I'm not saying triangulation is always a bad thing. Sometimes a third party mediator is needed to help solve problems between two people who disagree. That's what therapists do every day! Sometimes just getting someone else's perspective can help you 

In [25]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(answer_text, "html.parser")

In [27]:
soup.find_all("p")

[<p>What you are describing is something psychologists have termed "triangulation" which is what happens when one family member will not talk to the one they have a problem with and goes to a third member of the family to complain instead. You have been "triangulated" by your wife and mother.</p>,
 <p>This is often seen in families. It's seen everywhere. How many times have you had a problem with someone but you didn't go to them to tell them, you went to someone else to complain? It is usually difficult for a person to confront another, especially in relationships where there is a power differential. For example, I bet it's easier to complain to a coworker about your boss rather than go to the boss with your complaint.</p>,
 <p>I'm not saying triangulation is always a bad thing. Sometimes a third party mediator is needed to help solve problems between two people who disagree. That's what therapists do every day! Sometimes just getting someone else's perspective can help you see the is

In [17]:
import json

In [260]:
with open("../../data/counsel_chat/counsel_chat_250-tokens_full.json") as fp:
    cc_data = json.load(fp)

In [261]:
cc_data.keys()

dict_keys(['train', 'valid'])

In [262]:
print("training_examples: {num_train}\nvalid_examples: {num_val}".format(num_train=len(cc_data["train"]), num_val=len(cc_data["valid"])))

training_examples: 1839
valid_examples: 173


In [263]:
data["train"][:10]

[{'personality': [''],
  'utterances': [{'history': ["can i change my feeling of being worthless to everyone ? i ' m going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i ' m worthless and how i shouldn ' t be here . i ' ve never tried or contemplated suicide . i ' ve always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ?"],
    'candidates': ['maybe lower your expectations for a bit',
     'if you are whole - heartedly committed to moving past the sexual and romantic parts of your relationship and just having a friendship than refraining from all the touching would be a good place to start',
     'very often , one person wants to deal with the conflict right away or shortly thereafter and the other person wants to wait',
     '" my best guess is that your boyfriend is triggered by some previous relationship , either romantic or in childhood',
     'can he do th

In [59]:
data["train"][100]["utterances"][0]["candidates"]

['marriage is a people growing process ; you mature into one another and to achieve fulfillment your continued development will be required',
 "is it possible what you ' re feeling is from being reminded of great distress you lived through when you were at a similar age as the kids around whom you feel this sudden and great paranoia and fear",
 'if it is possible , see if he would be willing to go to a counselor with you',
 'it is common with some types of seizures to have an altered state of reality during or shortly after the seizure , so that could also be related',
 'allow yourself some time to reflect on how you feel toward your bf',
 'i am wishing my very best to you',
 'since you are aware of your tendencies and how the interfere with your life , try to become aware of when you do these habits',
 'you know , it does sounds like you have imagined a motherly comfort figure for yourself',
 'at present , the american disability association ( ada ) only allows protection and guiltine

In [27]:
data["train"][100]["utterances"][0]["history"]

["my apartment manager won ' t let me keep an emotional support dog i have been diagnosed with general anxiety and depression by my family doctor . they wrote a prescription for me to have an emotional support dog , i have the paper work , and i gave it to my apartment manager . they said i can ' t keep the esd because i ' m not disabled . what do you suggest i do ?"]

In [61]:
# https://huggingface.co/j-hartmann/emotion-english-distilroberta-base
from transformers import pipeline
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)
classifier("I love this!")

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/294 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

[[{'label': 'anger', 'score': 0.004419783595949411},
  {'label': 'disgust', 'score': 0.0016119900392368436},
  {'label': 'fear', 'score': 0.0004138521908316761},
  {'label': 'joy', 'score': 0.9771687984466553},
  {'label': 'neutral', 'score': 0.005764586851000786},
  {'label': 'sadness', 'score': 0.002092392183840275},
  {'label': 'surprise', 'score': 0.008528684265911579}]]

In [62]:
from datasets import load_dataset

In [63]:
dataset = load_dataset("empathetic_dialogues")

Downloading builder script:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/750 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset empathetic_dialogues/default (download: 26.72 MiB, generated: 23.97 MiB, post-processed: Unknown size, total: 50.69 MiB) to /home/jovyan/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf...


Downloading data:   0%|          | 0.00/28.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/76673 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/12030 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10943 [00:00<?, ? examples/s]

Dataset empathetic_dialogues downloaded and prepared to /home/jovyan/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [64]:
dataset

DatasetDict({
    train: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 76673
    })
    validation: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 12030
    })
    test: Dataset({
        features: ['conv_id', 'utterance_idx', 'context', 'prompt', 'speaker_idx', 'utterance', 'selfeval', 'tags'],
        num_rows: 10943
    })
})

In [67]:
dataset["train"][0]

{'conv_id': 'hit:0_conv:1',
 'utterance_idx': 1,
 'context': 'sentimental',
 'prompt': 'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.',
 'speaker_idx': 1,
 'utterance': 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.',
 'selfeval': '5|5|5_2|2|5',
 'tags': ''}

In [68]:
df

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
3,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
4,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,
...,...,...,...,...,...,...,...,...
76663,hit:12424_conv:24848,5,sentimental,I found some pictures of my grandma in the att...,389,Yeah reminds me of the good old days. I miss ...,5|5|5_5|5|5,
76664,hit:12424_conv:24849,1,surprised,I woke up this morning to my wife telling me s...,294,I woke up this morning to my wife telling me s...,5|5|5_5|5|5,
76665,hit:12424_conv:24849,2,surprised,I woke up this morning to my wife telling me s...,389,Oh hey that's awesome! That is awesome right?,5|5|5_5|5|5,
76666,hit:12424_conv:24849,3,surprised,I woke up this morning to my wife telling me s...,294,It is soooo awesome. We have been wanting a b...,5|5|5_5|5|5,


# Training Emotion Classifier

In [70]:
classification = df[["context", "utterance"]]

In [71]:
classification

Unnamed: 0,context,utterance
0,sentimental,I remember going to see the fireworks with my ...
1,sentimental,Was this a friend you were in love with_comma_...
2,sentimental,This was a best friend. I miss her.
3,sentimental,Where has she gone?
4,sentimental,We no longer talk.
...,...,...
76663,sentimental,Yeah reminds me of the good old days. I miss ...
76664,surprised,I woke up this morning to my wife telling me s...
76665,surprised,Oh hey that's awesome! That is awesome right?
76666,surprised,It is soooo awesome. We have been wanting a b...


In [73]:
classification.rename(columns={"context": "label", "utterance": "text"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classification.rename(columns={"context": "label", "utterance": "text"}, inplace=True)


In [90]:
classification["text"] = classification["text"].apply(lambda i: i.replace("_comma_", ","))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  classification["text"] = classification["text"].apply(lambda i: i.replace("_comma_", ","))


In [171]:
classification

Unnamed: 0,label,text
0,sentimental,I remember going to see the fireworks with my ...
1,sentimental,"Was this a friend you were in love with, or ju..."
2,sentimental,This was a best friend. I miss her.
3,sentimental,Where has she gone?
4,sentimental,We no longer talk.
...,...,...
76663,sentimental,Yeah reminds me of the good old days. I miss ...
76664,surprised,I woke up this morning to my wife telling me s...
76665,surprised,Oh hey that's awesome! That is awesome right?
76666,surprised,It is soooo awesome. We have been wanting a b...


In [173]:
labels_map

[('afraid', 0),
 ('angry', 1),
 ('annoyed', 2),
 ('anticipating', 3),
 ('anxious', 4),
 ('apprehensive', 5),
 ('ashamed', 6),
 ('caring', 7),
 ('confident', 8),
 ('content', 9),
 ('devastated', 10),
 ('disappointed', 11),
 ('disgusted', 12),
 ('embarrassed', 13),
 ('excited', 14),
 ('faithful', 15),
 ('furious', 16),
 ('grateful', 17),
 ('guilty', 18),
 ('hopeful', 19),
 ('impressed', 20),
 ('jealous', 21),
 ('joyful', 22),
 ('lonely', 23),
 ('nostalgic', 24),
 ('prepared', 25),
 ('proud', 26),
 ('sad', 27),
 ('sentimental', 28),
 ('surprised', 29),
 ('terrified', 30),
 ('trusting', 31)]

In [112]:
from sklearn.model_selection import train_test_split

In [166]:
labels = classification[["label"]].sort_values("label").drop_duplicates().reset_index(drop=True)

In [174]:
labels_map = dict(list(zip(labels.label, labels.index)))

In [175]:
labels_map

{'afraid': 0,
 'angry': 1,
 'annoyed': 2,
 'anticipating': 3,
 'anxious': 4,
 'apprehensive': 5,
 'ashamed': 6,
 'caring': 7,
 'confident': 8,
 'content': 9,
 'devastated': 10,
 'disappointed': 11,
 'disgusted': 12,
 'embarrassed': 13,
 'excited': 14,
 'faithful': 15,
 'furious': 16,
 'grateful': 17,
 'guilty': 18,
 'hopeful': 19,
 'impressed': 20,
 'jealous': 21,
 'joyful': 22,
 'lonely': 23,
 'nostalgic': 24,
 'prepared': 25,
 'proud': 26,
 'sad': 27,
 'sentimental': 28,
 'surprised': 29,
 'terrified': 30,
 'trusting': 31}

In [181]:
train_df, val_df = train_test_split(classification, test_size=0.2)
train_df["label"].replace(labels_map, inplace=True)
val_df["label"].replace(labels_map, inplace=True)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

In [182]:
train_dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 61334
})

In [183]:
dataset = DatasetDict({"train": train_dataset, "test": val_dataset})

In [184]:
dataset["train"][0]

{'label': 19,
 'text': 'Wow, that was very generous and courageous for her! How are they now?'}

In [185]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=num_labels)

loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_

In [186]:
from transformers import TrainingArguments

output_dir = "../../models/distilroberta-finetuned"

training_args = TrainingArguments(output_dir=output_dir)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [103]:
! ls ../../models

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
archives  hopperbot-medium


In [187]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

In [188]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [189]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir=output_dir, evaluation_strategy="epoch")

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [193]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataset = tokenized_datasets["train"].shuffle(seed=42)
eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
#small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/distilroberta-base/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/42d6b7c87cbac84fcdf35aa69504a5ccfca878fcee2a1a9b9ff7a3d1297f9094.aa95727ac70adfa1aaf5c88bea30a4f5e50869c68e68bce96ef1ec41b5facf46
Model config RobertaConfig {
  "_name_or_path": "distilroberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_siz

  0%|          | 0/62 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

In [194]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [195]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 61334
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 23001
Trainer is attempting to log a value of "{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16', 17: 'LABEL_17', 18: 'LABEL_18', 19: 'LABEL_19', 20: 'LABEL_20', 21: 'LABEL_21', 22: 'LABEL_22', 23: 'LABEL_23', 24: 'LABEL_24', 25: 'LABEL_25', 26: 'LABEL_26', 27: 'LABEL_27', 28: 'LABEL_28', 2

Epoch,Training Loss,Validation Loss,Accuracy
1,2.5204,2.507685,0.284205
2,2.2878,2.438132,0.308791
3,1.9551,2.479506,0.319421


Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-500/config.json


Attempted to log scalar metric loss:
3.1307
Attempted to log scalar metric learning_rate:
4.891309073518543e-05
Attempted to log scalar metric epoch:
0.07


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-1000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-1000/config.json


Attempted to log scalar metric loss:
2.8821
Attempted to log scalar metric learning_rate:
4.7826181470370854e-05
Attempted to log scalar metric epoch:
0.13


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-1500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-1500/config.json


Attempted to log scalar metric loss:
2.7879
Attempted to log scalar metric learning_rate:
4.673927220555629e-05
Attempted to log scalar metric epoch:
0.2


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-2000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-2000/config.json


Attempted to log scalar metric loss:
2.7616
Attempted to log scalar metric learning_rate:
4.565236294074171e-05
Attempted to log scalar metric epoch:
0.26


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-2500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-2500/config.json


Attempted to log scalar metric loss:
2.7263
Attempted to log scalar metric learning_rate:
4.456545367592714e-05
Attempted to log scalar metric epoch:
0.33


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-3000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-3000/config.json


Attempted to log scalar metric loss:
2.7273
Attempted to log scalar metric learning_rate:
4.3478544411112565e-05
Attempted to log scalar metric epoch:
0.39


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-3500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-3500/config.json


Attempted to log scalar metric loss:
2.6745
Attempted to log scalar metric learning_rate:
4.2391635146297984e-05
Attempted to log scalar metric epoch:
0.46


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-4000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-4000/config.json


Attempted to log scalar metric loss:
2.6662
Attempted to log scalar metric learning_rate:
4.1304725881483417e-05
Attempted to log scalar metric epoch:
0.52


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-4500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-4500/config.json


Attempted to log scalar metric loss:
2.6314
Attempted to log scalar metric learning_rate:
4.021781661666884e-05
Attempted to log scalar metric epoch:
0.59


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-4500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-5000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-5000/config.json


Attempted to log scalar metric loss:
2.6284
Attempted to log scalar metric learning_rate:
3.913090735185427e-05
Attempted to log scalar metric epoch:
0.65


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-5500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-5500/config.json


Attempted to log scalar metric loss:
2.6162
Attempted to log scalar metric learning_rate:
3.8043998087039694e-05
Attempted to log scalar metric epoch:
0.72


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-5500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-6000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-6000/config.json


Attempted to log scalar metric loss:
2.6259
Attempted to log scalar metric learning_rate:
3.695708882222512e-05
Attempted to log scalar metric epoch:
0.78


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-6500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-6500/config.json


Attempted to log scalar metric loss:
2.6059
Attempted to log scalar metric learning_rate:
3.5870179557410546e-05
Attempted to log scalar metric epoch:
0.85


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-6500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-7000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-7000/config.json


Attempted to log scalar metric loss:
2.5412
Attempted to log scalar metric learning_rate:
3.478327029259598e-05
Attempted to log scalar metric epoch:
0.91


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-7000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-7500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-7500/config.json


Attempted to log scalar metric loss:
2.5204
Attempted to log scalar metric learning_rate:
3.3696361027781405e-05
Attempted to log scalar metric epoch:
0.98


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-7500/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15334
  Batch size = 8


Attempted to log scalar metric eval_loss:
2.5076847076416016
Attempted to log scalar metric eval_accuracy:
0.2842050345637146
Attempted to log scalar metric eval_runtime:
90.0022
Attempted to log scalar metric eval_samples_per_second:
170.374
Attempted to log scalar metric eval_steps_per_second:
21.299
Attempted to log scalar metric epoch:
1.0


Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-8000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-8000/config.json


Attempted to log scalar metric loss:
2.4298
Attempted to log scalar metric learning_rate:
3.260945176296683e-05
Attempted to log scalar metric epoch:
1.04


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-8000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-8500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-8500/config.json


Attempted to log scalar metric loss:
2.3316
Attempted to log scalar metric learning_rate:
3.152254249815226e-05
Attempted to log scalar metric epoch:
1.11


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-8500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-9000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-9000/config.json


Attempted to log scalar metric loss:
2.3244
Attempted to log scalar metric learning_rate:
3.0435633233337686e-05
Attempted to log scalar metric epoch:
1.17


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-9000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-9500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-9500/config.json


Attempted to log scalar metric loss:
2.3098
Attempted to log scalar metric learning_rate:
2.9348723968523105e-05
Attempted to log scalar metric epoch:
1.24


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-9500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-10000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-10000/config.json


Attempted to log scalar metric loss:
2.3056
Attempted to log scalar metric learning_rate:
2.8261814703708534e-05
Attempted to log scalar metric epoch:
1.3


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-10500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-10500/config.json


Attempted to log scalar metric loss:
2.3398
Attempted to log scalar metric learning_rate:
2.717490543889396e-05
Attempted to log scalar metric epoch:
1.37


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-10500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-11000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-11000/config.json


Attempted to log scalar metric loss:
2.3433
Attempted to log scalar metric learning_rate:
2.6087996174079386e-05
Attempted to log scalar metric epoch:
1.43


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-11000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-11500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-11500/config.json


Attempted to log scalar metric loss:
2.2889
Attempted to log scalar metric learning_rate:
2.5001086909264816e-05
Attempted to log scalar metric epoch:
1.5


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-11500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-12000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-12000/config.json


Attempted to log scalar metric loss:
2.3036
Attempted to log scalar metric learning_rate:
2.391417764445024e-05
Attempted to log scalar metric epoch:
1.57


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-12000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-12500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-12500/config.json


Attempted to log scalar metric loss:
2.3061
Attempted to log scalar metric learning_rate:
2.2827268379635667e-05
Attempted to log scalar metric epoch:
1.63


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-12500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-13000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-13000/config.json


Attempted to log scalar metric loss:
2.2936
Attempted to log scalar metric learning_rate:
2.1740359114821097e-05
Attempted to log scalar metric epoch:
1.7


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-13000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-13500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-13500/config.json


Attempted to log scalar metric loss:
2.3285
Attempted to log scalar metric learning_rate:
2.0653449850006523e-05
Attempted to log scalar metric epoch:
1.76


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-13500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-14000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-14000/config.json


Attempted to log scalar metric loss:
2.3153
Attempted to log scalar metric learning_rate:
1.9566540585191952e-05
Attempted to log scalar metric epoch:
1.83


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-14000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-14500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-14500/config.json


Attempted to log scalar metric loss:
2.2559
Attempted to log scalar metric learning_rate:
1.8479631320377375e-05
Attempted to log scalar metric epoch:
1.89


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-14500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-15000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-15000/config.json


Attempted to log scalar metric loss:
2.2878
Attempted to log scalar metric learning_rate:
1.73927220555628e-05
Attempted to log scalar metric epoch:
1.96


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-15000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15334
  Batch size = 8


Attempted to log scalar metric eval_loss:
2.4381320476531982
Attempted to log scalar metric eval_accuracy:
0.30879092213382026
Attempted to log scalar metric eval_runtime:
89.9796
Attempted to log scalar metric eval_samples_per_second:
170.416
Attempted to log scalar metric eval_steps_per_second:
21.305
Attempted to log scalar metric epoch:
2.0


Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-15500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-15500/config.json


Attempted to log scalar metric loss:
2.1943
Attempted to log scalar metric learning_rate:
1.630581279074823e-05
Attempted to log scalar metric epoch:
2.02


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-15500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-16000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-16000/config.json


Attempted to log scalar metric loss:
2.0036
Attempted to log scalar metric learning_rate:
1.5218903525933656e-05
Attempted to log scalar metric epoch:
2.09


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-16000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-16500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-16500/config.json


Attempted to log scalar metric loss:
1.9668
Attempted to log scalar metric learning_rate:
1.4131994261119083e-05
Attempted to log scalar metric epoch:
2.15


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-16500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-17000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-17000/config.json


Attempted to log scalar metric loss:
2.0054
Attempted to log scalar metric learning_rate:
1.3045084996304511e-05
Attempted to log scalar metric epoch:
2.22


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-17000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-17500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-17500/config.json


Attempted to log scalar metric loss:
2.0004
Attempted to log scalar metric learning_rate:
1.1958175731489935e-05
Attempted to log scalar metric epoch:
2.28


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-17500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-18000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-18000/config.json


Attempted to log scalar metric loss:
1.9661
Attempted to log scalar metric learning_rate:
1.0871266466675363e-05
Attempted to log scalar metric epoch:
2.35


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-18000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-18500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-18500/config.json


Attempted to log scalar metric loss:
1.9995
Attempted to log scalar metric learning_rate:
9.784357201860789e-06
Attempted to log scalar metric epoch:
2.41


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-18500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-19000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-19000/config.json


Attempted to log scalar metric loss:
1.9819
Attempted to log scalar metric learning_rate:
8.697447937046216e-06
Attempted to log scalar metric epoch:
2.48


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-19000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-19500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-19500/config.json


Attempted to log scalar metric loss:
1.986
Attempted to log scalar metric learning_rate:
7.610538672231643e-06
Attempted to log scalar metric epoch:
2.54


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-19500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-20000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-20000/config.json


Attempted to log scalar metric loss:
1.9451
Attempted to log scalar metric learning_rate:
6.523629407417069e-06
Attempted to log scalar metric epoch:
2.61


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-20500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-20500/config.json


Attempted to log scalar metric loss:
1.9905
Attempted to log scalar metric learning_rate:
5.436720142602496e-06
Attempted to log scalar metric epoch:
2.67


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-20500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-21000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-21000/config.json


Attempted to log scalar metric loss:
2.0059
Attempted to log scalar metric learning_rate:
4.3498108777879226e-06
Attempted to log scalar metric epoch:
2.74


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-21000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-21500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-21500/config.json


Attempted to log scalar metric loss:
1.9795
Attempted to log scalar metric learning_rate:
3.2629016129733493e-06
Attempted to log scalar metric epoch:
2.8


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-21500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-22000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-22000/config.json


Attempted to log scalar metric loss:
1.9223
Attempted to log scalar metric learning_rate:
2.1759923481587757e-06
Attempted to log scalar metric epoch:
2.87


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-22000/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-22500
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-22500/config.json


Attempted to log scalar metric loss:
1.9829
Attempted to log scalar metric learning_rate:
1.0890830833442024e-06
Attempted to log scalar metric epoch:
2.93


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-22500/pytorch_model.bin
Saving model checkpoint to ../../models/distilroberta-finetuned/checkpoint-23000
Configuration saved in ../../models/distilroberta-finetuned/checkpoint-23000/config.json


Attempted to log scalar metric loss:
1.9551
Attempted to log scalar metric learning_rate:
2.1738185296291465e-09
Attempted to log scalar metric epoch:
3.0


Model weights saved in ../../models/distilroberta-finetuned/checkpoint-23000/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 15334
  Batch size = 8


Attempted to log scalar metric eval_loss:
2.4795055389404297
Attempted to log scalar metric eval_accuracy:
0.3194208947437068
Attempted to log scalar metric eval_runtime:
89.9825
Attempted to log scalar metric eval_samples_per_second:
170.411
Attempted to log scalar metric eval_steps_per_second:
21.304
Attempted to log scalar metric epoch:
3.0




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
3591.6709
Attempted to log scalar metric train_samples_per_second:
51.23
Attempted to log scalar metric train_steps_per_second:
6.404
Attempted to log scalar metric total_flos:
2.438730670060339e+16
Attempted to log scalar metric train_loss:
2.3298980236924174
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=23001, training_loss=2.3298980236924174, metrics={'train_runtime': 3591.6709, 'train_samples_per_second': 51.23, 'train_steps_per_second': 6.404, 'total_flos': 2.438730670060339e+16, 'train_loss': 2.3298980236924174, 'epoch': 3.0})

In [199]:
trainer.save_model(output_dir)

Saving model checkpoint to ../../models/distilroberta-finetuned
Configuration saved in ../../models/distilroberta-finetuned/config.json
Model weights saved in ../../models/distilroberta-finetuned/pytorch_model.bin


In [204]:
tokenizer.save_pretrained(output_dir)
tokenizer.save_vocabulary(output_dir)

tokenizer config file saved in ../../models/distilroberta-finetuned/tokenizer_config.json
Special tokens file saved in ../../models/distilroberta-finetuned/special_tokens_map.json


('../../models/distilroberta-finetuned/vocab.json',
 '../../models/distilroberta-finetuned/merges.txt')

In [211]:
id2label = dict((v,k) for k,v in labels_map.items())

In [212]:
from transformers import pipeline
classifier = pipeline("text-classification", model=output_dir, return_all_scores=True)

loading configuration file ../../models/distilroberta-finetuned/config.json
Model config RobertaConfig {
  "_name_or_path": "../../models/distilroberta-finetuned",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL

In [215]:
classifier.model.config.id2label = id2label
classifier.model.config.label2id = labels_map

In [216]:
classifier.model.config

RobertaConfig {
  "_name_or_path": "../../models/distilroberta-finetuned",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "afraid",
    "1": "angry",
    "2": "annoyed",
    "3": "anticipating",
    "4": "anxious",
    "5": "apprehensive",
    "6": "ashamed",
    "7": "caring",
    "8": "confident",
    "9": "content",
    "10": "devastated",
    "11": "disappointed",
    "12": "disgusted",
    "13": "embarrassed",
    "14": "excited",
    "15": "faithful",
    "16": "furious",
    "17": "grateful",
    "18": "guilty",
    "19": "hopeful",
    "20": "impressed",
    "21": "jealous",
    "22": "joyful",
    "23": "lonely",
    "24": "nostalgic",
    "25": "prepared",
    "26": "proud",
    "27": "sad",
    "28": "sentimental",
    "29": "surprised",
    "30": "

In [220]:
classifier("I'm still grieving the passing of my father. Sometimes I think, I'll never get over it")

[[{'label': 'afraid', 'score': 0.001271251356229186},
  {'label': 'angry', 'score': 0.002481884090229869},
  {'label': 'annoyed', 'score': 0.0008107623434625566},
  {'label': 'anticipating', 'score': 0.00017449459119234234},
  {'label': 'anxious', 'score': 0.0008662308100610971},
  {'label': 'apprehensive', 'score': 0.00039876039954833686},
  {'label': 'ashamed', 'score': 0.00147321040276438},
  {'label': 'caring', 'score': 0.0030949898064136505},
  {'label': 'confident', 'score': 0.00012000662536593154},
  {'label': 'content', 'score': 0.00021913884847890586},
  {'label': 'devastated', 'score': 0.40299463272094727},
  {'label': 'disappointed', 'score': 0.005933469161391258},
  {'label': 'disgusted', 'score': 0.0012640400091186166},
  {'label': 'embarrassed', 'score': 0.0018726286944001913},
  {'label': 'excited', 'score': 0.00021567723888438195},
  {'label': 'faithful', 'score': 0.0010265909368172288},
  {'label': 'furious', 'score': 0.0019771941006183624},
  {'label': 'grateful', 'sc

In [221]:
classifier("Life is meaningless.")

[[{'label': 'afraid', 'score': 0.006920903921127319},
  {'label': 'angry', 'score': 0.016934223473072052},
  {'label': 'annoyed', 'score': 0.00985314417630434},
  {'label': 'anticipating', 'score': 0.001343411160632968},
  {'label': 'anxious', 'score': 0.004305040463805199},
  {'label': 'apprehensive', 'score': 0.004895302467048168},
  {'label': 'ashamed', 'score': 0.004045652225613594},
  {'label': 'caring', 'score': 0.004559752531349659},
  {'label': 'confident', 'score': 0.0012360070832073689},
  {'label': 'content', 'score': 0.008918428793549538},
  {'label': 'devastated', 'score': 0.11213884502649307},
  {'label': 'disappointed', 'score': 0.030838578939437866},
  {'label': 'disgusted', 'score': 0.00690632127225399},
  {'label': 'embarrassed', 'score': 0.005127838812768459},
  {'label': 'excited', 'score': 0.0015857134712859988},
  {'label': 'faithful', 'score': 0.013053747825324535},
  {'label': 'furious', 'score': 0.01415031123906374},
  {'label': 'grateful', 'score': 0.004609598

In [226]:
classifier(["Congratulations on your wedding!", "Who am I kidding? I'm never gonna get that.", "Going to that festival was my favorite childhood activity."])

[[{'label': 'afraid', 'score': 0.0009797802194952965},
  {'label': 'angry', 'score': 0.0005866821738891304},
  {'label': 'annoyed', 'score': 0.0006858629640191793},
  {'label': 'anticipating', 'score': 0.0569567009806633},
  {'label': 'anxious', 'score': 0.018783459439873695},
  {'label': 'apprehensive', 'score': 0.002131709596142173},
  {'label': 'ashamed', 'score': 0.001870174310170114},
  {'label': 'caring', 'score': 0.009022172540426254},
  {'label': 'confident', 'score': 0.036349739879369736},
  {'label': 'content', 'score': 0.06657946854829788},
  {'label': 'devastated', 'score': 0.003845076309517026},
  {'label': 'disappointed', 'score': 0.0017840979853644967},
  {'label': 'disgusted', 'score': 0.0005492960917763412},
  {'label': 'embarrassed', 'score': 0.0008198667201213539},
  {'label': 'excited', 'score': 0.1879899799823761},
  {'label': 'faithful', 'score': 0.03524446487426758},
  {'label': 'furious', 'score': 0.00036482090945355594},
  {'label': 'grateful', 'score': 0.01934

In [228]:
results = classifier("I'm still grieving the passing of my father. Sometimes I think, I'll never get over it")

In [230]:
results[0]

[{'label': 'afraid', 'score': 0.001271251356229186},
 {'label': 'angry', 'score': 0.002481884090229869},
 {'label': 'annoyed', 'score': 0.0008107623434625566},
 {'label': 'anticipating', 'score': 0.00017449459119234234},
 {'label': 'anxious', 'score': 0.0008662308100610971},
 {'label': 'apprehensive', 'score': 0.00039876039954833686},
 {'label': 'ashamed', 'score': 0.00147321040276438},
 {'label': 'caring', 'score': 0.0030949898064136505},
 {'label': 'confident', 'score': 0.00012000662536593154},
 {'label': 'content', 'score': 0.00021913884847890586},
 {'label': 'devastated', 'score': 0.40299463272094727},
 {'label': 'disappointed', 'score': 0.005933469161391258},
 {'label': 'disgusted', 'score': 0.0012640400091186166},
 {'label': 'embarrassed', 'score': 0.0018726286944001913},
 {'label': 'excited', 'score': 0.00021567723888438195},
 {'label': 'faithful', 'score': 0.0010265909368172288},
 {'label': 'furious', 'score': 0.0019771941006183624},
 {'label': 'grateful', 'score': 0.0006073678

In [233]:
max_item = max(results[0], key=lambda x:x["score"])

In [234]:
max_item

{'label': 'sad', 'score': 0.5276297926902771}

In [567]:
result = results[0]

In [576]:
result.sort(key=lambda item: item.get("score"), reverse=True)

In [578]:
result[:3]

[{'label': 'sad', 'score': 0.5276297926902771},
 {'label': 'devastated', 'score': 0.40299463272094727},
 {'label': 'sentimental', 'score': 0.024283558130264282}]

In [235]:
print(output_dir)

../../models/distilroberta-finetuned


In [245]:
from transformers import pipeline

label2id = {'afraid': 0, 'angry': 1, 'annoyed': 2, 'anticipating': 3, 'anxious': 4, 'apprehensive': 5, 'ashamed': 6, 'caring': 7, 'confident': 8, 'content': 9, 'devastated': 10, 'disappointed': 11, 'disgusted': 12, 'embarrassed': 13, 'excited': 14, 'faithful': 15, 'furious': 16, 'grateful': 17, 'guilty': 18, 'hopeful': 19, 'impressed': 20, 'jealous': 21, 'joyful': 22, 'lonely': 23, 'nostalgic': 24, 'prepared': 25, 'proud': 26, 'sad': 27, 'sentimental': 28, 'surprised': 29, 'terrified': 30, 'trusting': 31}
id2label = dict((v,k) for k,v in label2id.items())

classifier = pipeline("text-classification", model=output_dir, return_all_scores=True)
classifier.model.config.id2label = id2label
classifier.model.config.label2id = label2id

def classify_emotion(text):
    results = classifier(text)
    max_score = max(results[0], key=lambda x:x["score"])
    return max_score["label"]


loading configuration file ../../models/distilroberta-finetuned/config.json
Model config RobertaConfig {
  "_name_or_path": "../../models/distilroberta-finetuned",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL

In [250]:
classify_emotion("I wonder whether I'll decide one day that I'm tired of people and simply run off to the woods to live a life of solitude.")

'lonely'

In [251]:
text = """
These are the thoughts that come to mind when I reflect on my performance: I am incompetent. I am worthless. I am not worth saving. I should be dead. I should have died years ago. I should  make it easier for everyone and just die now. 
"""

classify_emotion(text)

'ashamed'

In [238]:
print(labels_map)

{'afraid': 0, 'angry': 1, 'annoyed': 2, 'anticipating': 3, 'anxious': 4, 'apprehensive': 5, 'ashamed': 6, 'caring': 7, 'confident': 8, 'content': 9, 'devastated': 10, 'disappointed': 11, 'disgusted': 12, 'embarrassed': 13, 'excited': 14, 'faithful': 15, 'furious': 16, 'grateful': 17, 'guilty': 18, 'hopeful': 19, 'impressed': 20, 'jealous': 21, 'joyful': 22, 'lonely': 23, 'nostalgic': 24, 'prepared': 25, 'proud': 26, 'sad': 27, 'sentimental': 28, 'surprised': 29, 'terrified': 30, 'trusting': 31}


In [239]:
print(id2label)


{0: 'afraid', 1: 'angry', 2: 'annoyed', 3: 'anticipating', 4: 'anxious', 5: 'apprehensive', 6: 'ashamed', 7: 'caring', 8: 'confident', 9: 'content', 10: 'devastated', 11: 'disappointed', 12: 'disgusted', 13: 'embarrassed', 14: 'excited', 15: 'faithful', 16: 'furious', 17: 'grateful', 18: 'guilty', 19: 'hopeful', 20: 'impressed', 21: 'jealous', 22: 'joyful', 23: 'lonely', 24: 'nostalgic', 25: 'prepared', 26: 'proud', 27: 'sad', 28: 'sentimental', 29: 'surprised', 30: 'terrified', 31: 'trusting'}


In [563]:
help(json.dump)

Help on function dump in module json:

dump(obj, fp, *, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)
    Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
    ``.write()``-supporting file-like object).
    
    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
    instead of raising a ``TypeError``.
    
    If ``ensure_ascii`` is false, then the strings written to ``fp`` can
    contain non-ASCII characters if they appear in strings contained in
    ``obj``. Otherwise, all such characters are escaped in JSON strings.
    
    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``RecursionError`` (or worse).
    
    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    seriali

In [565]:
with open("../../models/distilroberta-finetuned__exported/label2id.json", "w+") as f:
    json.dump(label2id, f)
    
with open("../../models/distilroberta-finetuned__exported/id2label.json", "w+") as f:
    json.dump(id2label, f)
    

In [252]:
from src.utils import export_model, make_tarfile

In [256]:
help(export_model)

Help on function export_model in module src.utils:

export_model(model_path, output_path)



In [259]:
export_model(output_dir)

loading configuration file ../../models/distilroberta-finetuned/config.json
Model config RobertaConfig {
  "_name_or_path": "../../models/distilroberta-finetuned",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL

Saved to ../../models/distilroberta-finetuned__exported


In [269]:
classify_emotion("The exhaustion of the soul can sometimes manifest itself as the exhaustion of the body.")

'anxious'

In [270]:
classify_emotion("I arrive home everyday feeling all my energy spent. There are days when I would much rather retire to my bed than eat dinner or read a book.")

'content'

In [271]:
classify_emotion("I have lost my sense of purpose. That may be why these dark feelings have remained. ")

'lonely'

In [272]:
classify_emotion("I can’t be honest with anyone, because nobody seems to validate what I’m going through. ")

'faithful'

In [273]:
classify_emotion("I’m absolutely miserable. I’m always made the villain here, rather than the one trying to get through her life. ")

'sad'

In [274]:
classify_emotion("Does anyone realize I’m actively suicidal? Probably not. I’ve not really indicated that I’ve been feeling suicidal.")

'apprehensive'

In [275]:
classify_emotion("Will I lie to try to get out of there without being put in the hospital?")

'guilty'

In [580]:
with open("test.json", "w+") as f:
    json.dump(id2label, f)

In [582]:
id2label[0]

'afraid'

# Test the Data

In [903]:
import pandas as pd

In [909]:
test_df = pd.read_csv("../../data/empatheticdialogues/valid.csv", on_bad_lines="skip")

In [911]:
test_df

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:3_conv:6,1,terrified,Today_comma_as i was leaving for work in the m...,6,Today_comma_as i was leaving for work in the m...,4|5|5_5|5|5,
1,hit:3_conv:6,3,terrified,Today_comma_as i was leaving for work in the m...,6,Yeah_comma_i'm doing alright now_comma_ but wi...,4|5|5_5|5|5,
2,hit:3_conv:6,5,terrified,Today_comma_as i was leaving for work in the m...,6,The car was badly damaged_comma_i veered outsi...,4|5|5_5|5|5,
3,hit:4_conv:9,1,surprised,I was walking through my hallway a few week ag...,8,A few weeks ago_comma_ I was walking through m...,5|5|5_3|5|5,
4,hit:4_conv:9,3,surprised,I was walking through my hallway a few week ag...,8,I may have let out a scream that will have him...,5|5|5_3|5|5,
...,...,...,...,...,...,...,...,...
6313,hit:12350_conv:24701,5,prepared,When I went into labor with my son_comma_ it w...,791,In the end_comma_ yes--growing too big is actu...,5|5|5_5|5|5,
6314,hit:12361_conv:24722,1,prepared,One time I studied all night for my final exam!,547,One time I studied all night for my final exam!,4|4|5_4|4|3,
6315,hit:12361_conv:24722,3,prepared,One time I studied all night for my final exam!,547,It was for Organic Chemistry,4|4|5_4|4|3,
6316,hit:12392_conv:24785,1,furious,One of my coworkers has been arguing with his ...,791,One of my coworkers has been arguing with his ...,4|5|5_5|5|5,


In [915]:
df[["utterance"]].iloc[:]

Unnamed: 0,utterance
0,I remember going to see the fireworks with my ...
1,Was this a friend you were in love with_comma_...
2,This was a best friend. I miss her.
3,Where has she gone?
4,We no longer talk.
5,Oh was this something that happened because of...
6,it feels like hitting to blank wall when i se...
7,Oh ya? I don't really see how
8,dont you feel so.. its a wonder
9,I do actually hit blank walls a lot of times b...


In [917]:
len("I just don't feel confident about anything".split(" "))

7