In [25]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
from datasets import load_dataset, concatenate_datasets
from transformers.pipelines.pt_utils import KeyDataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import  AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, TextClassificationPipeline, pipeline

In [16]:
def get_label(output):
    highest_score = max(output[0], key=lambda x: x['score'])
    return highest_score['label'], highest_score["score"]
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

### Prepare Datasets

In [2]:
directory_path = 'datasets/manual_annotations'

combined_df = pd.DataFrame()

for filename in os.listdir(directory_path):
    if filename.endswith('.txt'):
        print(filename)
        file_path = os.path.join(directory_path, filename)
        df = pd.DataFrame()
        df["text"] = pd.read_csv(file_path, sep="\t")
        df["label"] = os.path.splitext(filename)[0]
        #all_data.append(df)
        
        combined_df = pd.concat([combined_df,df], ignore_index=True)

combined_df.head()

consoling.txt
expressing_relief.txt
expressing_care_concern.txt
sympathizing.txt
sharing_opinions.txt
acknowledging.txt
advising.txt
suggesting.txt
agreeing.txt
sharing_experience.txt
encouraging.txt
appreciating.txt
disapproving.txt
wishing.txt
questioning.txt


Unnamed: 0,text,label
0,"Aw, cheer up, friend. It's rough now, but it w...",consoling
1,Hopefully he is ok,consoling
2,hopefully they can get him some meds and he ge...,consoling
3,Hopefully you are not to mad because of it. Fl...,consoling
4,Hopefully it wont cost too much.,consoling


In [3]:
combined_df = combined_df[["label", "text"]].dropna()

In [4]:
combined_df["label"].value_counts()

questioning                125
acknowledging              115
agreeing                    48
consoling                   39
sympathizing                26
encouraging                 26
wishing                     21
sharing_opinions            19
suggesting                  19
sharing_experience          18
advising                    12
expressing_care_concern     10
expressing_relief            6
disapproving                 4
appreciating                 3
Name: label, dtype: int64

In [5]:
combined_df.replace({"agreeing":32, "acknowledging": 33,
                     "encouraging": 34, "consoling": 35,
                     "sympathizing": 36, "suggesting": 37,
                     "questioning": 38, "wishing": 39,
                     "neutral":40}, inplace=True)
combined_df.head()

Unnamed: 0,label,text
0,35,"Aw, cheer up, friend. It's rough now, but it w..."
1,35,Hopefully he is ok
2,35,hopefully they can get him some meds and he ge...
3,35,Hopefully you are not to mad because of it. Fl...
4,35,Hopefully it wont cost too much.


In [6]:
combined_df = combined_df[combined_df["label"].isin([32,33,34,35,36,37,38,39,40])]

In [7]:
label_range = set(range(32, 41))

file_path = 'datasets/lexically_extended_intent_data/train/'
for filename in os.listdir(file_path):
    if filename.endswith('.txt'):
        with open(file_path+filename, 'r') as file:
            for line in file:
                label, text = line.strip().split('<SEP>')
                label = int(label)

                if label in label_range:
                    new_row = pd.DataFrame({'label': [label], 'text': [text]})
                    combined_df = pd.concat([combined_df, new_row], ignore_index=True)
combined_df.label.value_counts()

38    17336
33     7821
40     2928
36     1840
32     1817
35     1287
37     1146
34     1082
39      774
Name: label, dtype: int64

In [8]:
combined_df.sample(frac=1)
combined_df.to_csv("EmpIntent_train.csv")

In [9]:
val_df = pd.DataFrame()

label_range = set(range(32, 41))

file_path = 'datasets/lexically_extended_intent_data/validation/'
for filename in os.listdir(file_path):
    if filename.endswith('.txt'):
        with open(file_path+filename, 'r') as file:
            for line in file:
                label, text = line.strip().split('<SEP>')
                label = int(label)

                if label in label_range:
                    new_row = pd.DataFrame({'label': [label], 'text': [text]})
                    val_df = pd.concat([val_df, new_row], ignore_index=True)
val_df.label.value_counts()

38    2388
33    1182
40     430
36     271
32     269
35     190
34     169
37     153
39     127
Name: label, dtype: int64

In [10]:
val_df.sample(frac=1)
val_df.to_csv("EmpIntent_validation.csv")

In [11]:
test_df = pd.DataFrame()

label_range = set(range(32, 41))

file_path = 'datasets/lexically_extended_intent_data/validation/'
for filename in os.listdir(file_path):
    if filename.endswith('.txt'):
        with open(file_path+filename, 'r') as file:
            for line in file:
                label, text = line.strip().split('<SEP>')
                label = int(label)

                if label in label_range:
                    new_row = pd.DataFrame({'label': [label], 'text': [text]})
                    test_df = pd.concat([test_df, new_row], ignore_index=True)
test_df.label.value_counts()

38    2388
33    1182
40     430
36     271
32     269
35     190
34     169
37     153
39     127
Name: label, dtype: int64

In [12]:
test_df.sample(frac=1)
test_df.to_csv("EmpIntent_test.csv")

In [16]:
train = pd.read_csv("EmpIntent_train.csv", index_col="Unnamed: 0").sample(frac=1)
valid = pd.read_csv("EmpIntent_validation.csv", index_col="Unnamed: 0").sample(frac=1)
test = pd.read_csv("EmpIntent_test.csv", index_col="Unnamed: 0").sample(frac=1)

In [17]:
train.replace({32:1, 33:2, 34:3, 35:4, 36:5, 37:6, 38:7, 39:8, 40:0}, inplace=True)
valid.replace({32:1, 33:2, 34:3, 35:4, 36:5, 37:6, 38:7, 39:8, 40:0}, inplace=True)
test.replace({32:1, 33:2, 34:3, 35:4, 36:5, 37:6, 38:7, 39:8, 40:0}, inplace=True)

In [19]:
train.to_csv("EmpIntent_train.csv")
valid.to_csv("EmpIntent_validation.csv")
test.to_csv("EmpIntent_test.csv")

### Train Bert Classifier

#### Loading and tokenizing data

In [2]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
train = load_dataset('csv', data_files="EmpIntent_train.csv")

valid = load_dataset('csv', data_files="EmpIntent_validation.csv")

test = load_dataset('csv', data_files="EmpIntent_test.csv")

Found cached dataset csv (/home/jovyan/.cache/huggingface/datasets/csv/default-a6b6de325c33cbfb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (/home/jovyan/.cache/huggingface/datasets/csv/default-cd8c89afda16d5f5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

Found cached dataset csv (/home/jovyan/.cache/huggingface/datasets/csv/default-bcd8e1782386e58e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
train_tokenized = train.map(tokenize_function, batched=True)
valid_tokenized = valid.map(tokenize_function, batched=True)
test_tokenized = test.map(tokenize_function, batched=True)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-a6b6de325c33cbfb/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-ae9e21808954c959.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-cd8c89afda16d5f5/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-7263e1cee1fc7f9f.arrow
Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/csv/default-bcd8e1782386e58e/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-00dec803a6f603b7.arrow


In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

#### training and saving classifier

In [8]:
label_num = 9

training_args = TrainingArguments("EmpIntent", evaluation_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=label_num, ignore_mismatched_sizes=True)
trainer = Trainer(model=model, args=training_args, train_dataset=train_tokenized["train"], eval_dataset=valid_tokenized["train"], compute_metrics=compute_metrics)
trainer.train()
trainer.evaluate()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1724,0.170894,0.948059,0.886649,0.878232,0.898751
2,0.1425,0.121315,0.95308,0.888271,0.882423,0.905975
3,0.0919,0.105532,0.956169,0.896577,0.896187,0.902145


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
trainer.save_model("EmpatheticClassifier/bert_model")
tokenizer.save_pretrained("EmpatheticClassifier/bert_model")

('EmpatheticClassifier/bert_model/tokenizer_config.json',
 'EmpatheticClassifier/bert_model/special_tokens_map.json',
 'EmpatheticClassifier/bert_model/vocab.txt',
 'EmpatheticClassifier/bert_model/added_tokens.json',
 'EmpatheticClassifier/bert_model/tokenizer.json')

#### Evaluating on Test Set

In [11]:
pipe = pipeline("text-classification", model="EmpatheticClassifier/bert_model",
                tokenizer="EmpatheticClassifier/bert_model", device=0)

In [33]:
preds = []
for out in tqdm(pipe(KeyDataset(test["train"], "text"))):
    label = out["label"]
    number = ''.join(filter(str.isdigit, label))
    if number:
        preds.append(int(number))

  0%|          | 0/5179 [00:00<?, ?it/s]

In [34]:
preds[:5]

[7, 7, 6, 7, 2]

In [35]:
precision_recall_fscore_support(test["train"]["label"], preds, average='macro')

(0.896187111647092, 0.9021451019157549, 0.8965772870217843, None)

### Classifying Agent Dialogues

#### Defining prediction pipeline and loading data

In [5]:
tokenizer = AutoTokenizer.from_pretrained("EmpatheticClassifier/bert_model")
model = AutoModelForSequenceClassification.from_pretrained("EmpatheticClassifier/bert_model", local_files_only=True).cuda()
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k = None, device=0)

In [8]:
agent_dials = pd.read_csv("datasets/input_output_translated.csv")
agent_dials.head()

Unnamed: 0.1,Unnamed: 0,id,code,prompt,seeker_post,response_post
0,0,0,R_-,Der Therapeut sagt:,I'm not at the finish yet and just at the mome...,It's okay that you doubt. It's perfectly norma...
1,1,1,R_-,Die Therapeutin sagt:,I'm not at the finish yet and just at the mome...,I can help you achieve your goals. We can crea...
2,2,2,R_-,Der erfreute Therapeut sagt:,I'm not at the finish yet and just at the mome...,"This attitude is very important, because I don..."
3,3,3,R_-,Der traurige Therapeut sagt:,I'm not at the finish yet and just at the mome...,"Yes, it may be that things aren't going so wel..."
4,4,4,R_-,Der verängstigte Therapeut sagt:,I'm not at the finish yet and just at the mome...,"You can do this."" The patient says:"


In [14]:
pred_label = []
pred_score = []

for index, row in agent_dials.iterrows():
    out = pipe.predict(row["response_post"])
    label, score = get_label(out)
    pred_label.append(label)
    pred_score.append(score)
    
agent_dials["label"] = pred_label
agent_dials["score"] = pred_score

agent_dials.to_csv("classified_agent_dials")



In [15]:
agent_dials.head()

Unnamed: 0.1,Unnamed: 0,id,code,prompt,seeker_post,response_post,label,score
0,0,0,R_-,Der Therapeut sagt:,I'm not at the finish yet and just at the mome...,It's okay that you doubt. It's perfectly norma...,LABEL_1,0.970899
1,1,1,R_-,Die Therapeutin sagt:,I'm not at the finish yet and just at the mome...,I can help you achieve your goals. We can crea...,LABEL_6,0.636805
2,2,2,R_-,Der erfreute Therapeut sagt:,I'm not at the finish yet and just at the mome...,"This attitude is very important, because I don...",LABEL_0,0.995945
3,3,3,R_-,Der traurige Therapeut sagt:,I'm not at the finish yet and just at the mome...,"Yes, it may be that things aren't going so wel...",LABEL_0,0.999413
4,4,4,R_-,Der verängstigte Therapeut sagt:,I'm not at the finish yet and just at the mome...,"You can do this."" The patient says:",LABEL_1,0.338438


In [19]:
agent_dials.replace({"LABEL_1":"agreeing", "LABEL_2":"acknowledging",
                     "LABEL_3":"encouraging", "LABEL_4":"consoling",
                     "LABEL_5":"sympathizing", "LABEL_6":"suggesting",
                     "LABEL_7":"questioning", "LABEL_8": "wishing",
                     "LABEL_0":"neutral"}, inplace=True)

In [20]:
agent_dials.head()

Unnamed: 0.1,Unnamed: 0,id,code,prompt,seeker_post,response_post,label,score
0,0,0,R_-,Der Therapeut sagt:,I'm not at the finish yet and just at the mome...,It's okay that you doubt. It's perfectly norma...,agreeing,0.970899
1,1,1,R_-,Die Therapeutin sagt:,I'm not at the finish yet and just at the mome...,I can help you achieve your goals. We can crea...,suggesting,0.636805
2,2,2,R_-,Der erfreute Therapeut sagt:,I'm not at the finish yet and just at the mome...,"This attitude is very important, because I don...",neutral,0.995945
3,3,3,R_-,Der traurige Therapeut sagt:,I'm not at the finish yet and just at the mome...,"Yes, it may be that things aren't going so wel...",neutral,0.999413
4,4,4,R_-,Der verängstigte Therapeut sagt:,I'm not at the finish yet and just at the mome...,"You can do this."" The patient says:",agreeing,0.338438


In [21]:
agent_dials.to_csv("datasets/classified_agent_dials.csv")