In [1]:
import json
import os
import random
import re
from pprint import pprint

import numpy as np
import pandas as pd

In [2]:
# CSV export hosted in the nbertagnolli/counsel-chat GitHub repository.
COUNSEL_CHAT_CSV_URL = (
    "https://raw.githubusercontent.com/nbertagnolli/counsel-chat/master/data/20200325_counsel_chat.csv"
)
chat_data = pd.read_csv(COUNSEL_CHAT_CSV_URL)

In [3]:
# Display the raw table to see which columns are available.
chat_data

Unnamed: 0.1,Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views,split
0,0,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Sherry Katz, LCSWCouples and Family Therapist,...",https://counselchat.com/therapists/sherry-katz...,"If everyone thinks you're worthless, then mayb...",1,2899,train
1,1,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Robin Landwehr, DBH, LPCC, NCCMental Health in...",https://counselchat.com/therapists/robin-landw...,"Hello, and thank you for your question and see...",1,3514,train
2,2,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Lee KingI use an integrative approach to treat...,https://counselchat.com/therapists/lee-king,First thing I'd suggest is getting the sleep y...,0,5,train
3,3,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,"Shauntai Davis-YearginPersonalized, private on...",https://counselchat.com/therapists/shauntai-da...,Therapy is essential for those that are feelin...,0,31,train
4,4,0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,https://counselchat.com/questions/can-i-change...,depression,Jordan WhiteLicensed Social Worker at Oak Root...,https://counselchat.com/therapists/jordan-white,I first want to let you know that you are not ...,0,620,train
...,...,...,...,...,...,...,...,...,...,...,...,...
2124,2124,20,What happens in a counseling session?,"After first meeting the client, what is the pr...",https://counselchat.com/questions/what-happens...,counseling-fundamentals,Victoria HaagFind direction for your life...,https://counselchat.com/therapists/victoria-haag,There are probably no two therapists alike bec...,0,45,train
2125,2125,20,What happens in a counseling session?,"After first meeting the client, what is the pr...",https://counselchat.com/questions/what-happens...,counseling-fundamentals,Allison VelezIs your relationship worth it?,https://counselchat.com/therapists/allison-velez,"Each counselor may have a different process, s...",0,37,train
2126,2126,20,What happens in a counseling session?,"After first meeting the client, what is the pr...",https://counselchat.com/questions/what-happens...,counseling-fundamentals,David RouttPresident and Clinical Counselor at...,https://counselchat.com/therapists/david-routt,"After meeting a client, many Counselors will a...",0,56,train
2127,2127,20,What happens in a counseling session?,"After first meeting the client, what is the pr...",https://counselchat.com/questions/what-happens...,counseling-fundamentals,2nd Chance Counseling Service Online Addiction...,https://counselchat.com/therapists/2nd-chance-...,A good therapist will discuss what brought you...,0,27,train


# Transforming Data to Form for Hugging Face Model

Form of data:  

The dataset is a list of entries. Each entry in PersonaChat is a dict with two keys, `personality` and `utterances`:

* personality:  list of strings containing the personality of the agent

* utterances: list of dictionaries, each of which has two keys which are lists of strings.  
    * candidates: [next_utterance_candidate_1, ..., next_utterance_candidate_19]
        The last candidate is the ground truth response observed in the conversational data  
    * history: [dialog_turn_0, ... dialog_turn_N], where the number of turns is always odd (i.e. N is even) since the other user starts every conversation.  

Preprocessing:
* Punctuation (`.`, `?`, `!`, `,`) is separated from the surrounding words by spaces
* Everything is lowercase

## Example Data Entry:

```python
EXAMPLE_ENTRY = {
    "personality": [
        "i like to remodel homes .",
        "i like to go hunting .",
        "i like to shoot a bow .",
        "my favorite holiday is halloween .",
    ],
    "utterances": [
        {
            "candidates": [
                "my mom was single with 3 boys , so we never left the projects .",
                "i try to wear all black every day . it makes me feel comfortable .",
                "well nursing stresses you out so i wish luck with sister",
            ],
            "history": [
                "hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
            ],
        },
        
        
        {
            "candidates": [
                "hello i am doing well how are you ?",
                "ll something like that . do you play games ?",
                "does anything give you relief ? i hate taking medicine ",
            ],
            "history": [
                "hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
                "you must be very fast . hunting is one of my favorite hobbies .",
                "i am ! for my hobby i like to do canning or some whittling .",
            ],
        },
        
        
        {
            "candidates": [
                "yes they do but i say no to them lol",
                "i have trouble getting along with family .",
                "i live in texas , what kind of stuff do you do in ",
            ],
            "history": [
                "hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
                "you must be very fast . hunting is one of my favorite hobbies .",
                "i am ! for my hobby i like to do canning or some whittling .",
                "i also remodel homes when i am not out bow hunting .",
                "that's neat . when i was in high school i placed 6th in 100m dash !",
            ],
        },
    ],
}
```

## Plan:

- The `personality` list will be left empty initially.
- `history` will contain only the question (title + text) from the counsel-chat data.

In [4]:
# Look at a single row to see one full example of every column.
chat_data.iloc[0]

Unnamed: 0                                                       0
questionID                                                       0
questionTitle    Can I change my feeling of being worthless to ...
questionText     I'm going through some things with my feelings...
questionLink     https://counselchat.com/questions/can-i-change...
topic                                                   depression
therapistInfo    Sherry Katz, LCSWCouples and Family Therapist,...
therapistURL     https://counselchat.com/therapists/sherry-katz...
answerText       If everyone thinks you're worthless, then mayb...
upvotes                                                          1
views                                                         2899
split                                                        train
Name: 0, dtype: object

In [5]:
# Keep only the question/answer text and the popularity counters; every other column is dropped here.
kept_columns = ["questionTitle", "questionText", "answerText", "upvotes", "views"]
chat_data = chat_data.loc[:, kept_columns]
chat_data

Unnamed: 0,questionTitle,questionText,answerText,upvotes,views
0,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,"If everyone thinks you're worthless, then mayb...",1,2899
1,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,"Hello, and thank you for your question and see...",1,3514
2,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,First thing I'd suggest is getting the sleep y...,0,5
3,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,Therapy is essential for those that are feelin...,0,31
4,Can I change my feeling of being worthless to ...,I'm going through some things with my feelings...,I first want to let you know that you are not ...,0,620
...,...,...,...,...,...
2124,What happens in a counseling session?,"After first meeting the client, what is the pr...",There are probably no two therapists alike bec...,0,45
2125,What happens in a counseling session?,"After first meeting the client, what is the pr...","Each counselor may have a different process, s...",0,37
2126,What happens in a counseling session?,"After first meeting the client, what is the pr...","After meeting a client, many Counselors will a...",0,56
2127,What happens in a counseling session?,"After first meeting the client, what is the pr...",A good therapist will discuss what brought you...,0,27


In [6]:
# A full question is the title followed by the body text, joined by a single space.
title_text = chat_data["questionTitle"]
body_text = chat_data["questionText"]
questions = title_text + " " + body_text

In [7]:
# Add the combined `question` column and an `answer` alias, then drop the three source columns.
chat_data = (
    chat_data
    .assign(question=questions, answer=chat_data["answerText"])
    .drop(columns=["questionTitle", "questionText", "answerText"])
)
chat_data

Unnamed: 0,upvotes,views,question,answer
0,1,2899,Can I change my feeling of being worthless to ...,"If everyone thinks you're worthless, then mayb..."
1,1,3514,Can I change my feeling of being worthless to ...,"Hello, and thank you for your question and see..."
2,0,5,Can I change my feeling of being worthless to ...,First thing I'd suggest is getting the sleep y...
3,0,31,Can I change my feeling of being worthless to ...,Therapy is essential for those that are feelin...
4,0,620,Can I change my feeling of being worthless to ...,I first want to let you know that you are not ...
...,...,...,...,...
2124,0,45,What happens in a counseling session? After fi...,There are probably no two therapists alike bec...
2125,0,37,What happens in a counseling session? After fi...,"Each counselor may have a different process, s..."
2126,0,56,What happens in a counseling session? After fi...,"After meeting a client, many Counselors will a..."
2127,0,27,What happens in a counseling session? After fi...,A good therapist will discuss what brought you...


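Adding a metric to determine the true response: for each question, the answer with the highest score is treated as the ground-truth response.

$$\text{score} = 50 \times \text{upvotes} + \text{views}$$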

In [8]:
# score = 50 * upvotes + views; the two raw counters are not kept afterwards.
chat_data = (
    chat_data
    .assign(score=lambda frame: 50 * frame["upvotes"] + frame["views"])
    .drop(columns=["upvotes", "views"])
)
chat_data

Unnamed: 0,question,answer,score
0,Can I change my feeling of being worthless to ...,"If everyone thinks you're worthless, then mayb...",2949
1,Can I change my feeling of being worthless to ...,"Hello, and thank you for your question and see...",3564
2,Can I change my feeling of being worthless to ...,First thing I'd suggest is getting the sleep y...,5
3,Can I change my feeling of being worthless to ...,Therapy is essential for those that are feelin...,31
4,Can I change my feeling of being worthless to ...,I first want to let you know that you are not ...,620
...,...,...,...
2124,What happens in a counseling session? After fi...,There are probably no two therapists alike bec...,45
2125,What happens in a counseling session? After fi...,"Each counselor may have a different process, s...",37
2126,What happens in a counseling session? After fi...,"After meeting a client, many Counselors will a...",56
2127,What happens in a counseling session? After fi...,A good therapist will discuss what brought you...,27


# Preprocessing Data

## String Format
### On Test String

In [51]:
# Sample question containing the artefacts the cleaning regexes target:
# a non-breaking space (\xa0), a stray slash and backslash, and newlines.
test_str = "can i change my feeling of being worthless to everyone?\xa0 i'm going /through some\\ things with my feelings and myself. i barely sleep and i do nothing but think about how i'm worthless and how i shouldn't be here.\n   i've never tried or contemplated suicide. i've always wanted to fix my issues, but i never get around to it.\n   how can i change my feeling of being worthless to everyone?"
test_str

"can i change my feeling of being worthless to everyone?\xa0 i'm going /through some\\ things with my feelings and myself. i barely sleep and i do nothing but think about how i'm worthless and how i shouldn't be here.\n   i've never tried or contemplated suicide. i've always wanted to fix my issues, but i never get around to it.\n   how can i change my feeling of being worthless to everyone?"

In [53]:
test_str = test_str.lower()
# Replace non-breaking spaces, newlines, backslashes and forward slashes with plain spaces.
test_str = re.sub(r"(\xa0)|(\n)|(\\)|(/)", " ", test_str)
# Pad . ? ! , with a space on each side. Raw strings are required here: "\." and
# "\g" are invalid escapes in ordinary string literals and raise a SyntaxWarning
# on recent Python versions.
test_str = re.sub(r"(\.|\?|\!|,)", r" \g<1> ", test_str)
# Collapse the runs of spaces produced by the two substitutions above.
test_str = re.sub(r" {2,}", " ", test_str)

test_str

"can i change my feeling of being worthless to everyone ? i'm going through some things with my feelings and myself . i barely sleep and i do nothing but think about how i'm worthless and how i shouldn't be here . i've never tried or contemplated suicide . i've always wanted to fix my issues , but i never get around to it . how can i change my feeling of being worthless to everyone ? "

### On Actual Data

In [9]:
def preprocess_data(input_str: str) -> str:
    """Lowercase a text and separate its punctuation from the words.

    Steps, in order:
      1. lowercase everything;
      2. replace non-breaking spaces, newlines, backslashes and forward
         slashes with a plain space;
      3. surround each of ``. ? ! ,`` with a space on both sides;
      4. collapse runs of two or more spaces into one.

    Args:
        input_str: Raw text to normalise.

    Returns:
        The normalised text. A trailing space is left in place when the input
        ends with one of the padded punctuation marks.
    """
    input_str = input_str.lower()
    input_str = re.sub(r"(\xa0)|(\n)|(\\)|(/)", " ", input_str)
    # Raw strings: "\." and "\g<1>" are invalid escapes in ordinary string
    # literals and raise a SyntaxWarning on recent Python versions.
    input_str = re.sub(r"(\.|\?|\!|,)", r" \g<1> ", input_str)
    input_str = re.sub(r" {2,}", " ", input_str)
    return input_str

In [10]:
# Normalise both text columns with the same cleaning function.
for text_column in ("question", "answer"):
    chat_data[text_column] = chat_data[text_column].apply(preprocess_data)

In [11]:
# Check the cleaned text: lowercase, with punctuation separated by spaces.
chat_data

Unnamed: 0,question,answer,score
0,can i change my feeling of being worthless to ...,"if everyone thinks you're worthless , then may...",2949
1,can i change my feeling of being worthless to ...,"hello , and thank you for your question and se...",3564
2,can i change my feeling of being worthless to ...,first thing i'd suggest is getting the sleep y...,5
3,can i change my feeling of being worthless to ...,therapy is essential for those that are feelin...,31
4,can i change my feeling of being worthless to ...,i first want to let you know that you are not ...,620
...,...,...,...
2124,what happens in a counseling session ? after f...,there are probably no two therapists alike bec...,45
2125,what happens in a counseling session ? after f...,"each counselor may have a different process , ...",37
2126,what happens in a counseling session ? after f...,"after meeting a client , many counselors will ...",56
2127,what happens in a counseling session ? after f...,a good therapist will discuss what brought you...,27


## Clipping the Data to Max Length
  
TODO: clip at last sentence under max length instead of just a hard stop.

In [14]:
# Character budget applied to every question and answer.
MAX_LEN = 250
# Longest question and longest answer, in characters.
chat_data["question"].map(len).max()
chat_data["answer"].map(len).max()

511

512

In [15]:
# Hard character cut-off; slicing leaves strings already within MAX_LEN untouched.
for text_column in ("question", "answer"):
    chat_data[text_column] = chat_data[text_column].map(lambda text: text[:MAX_LEN])

# Longest question and longest answer after clipping.
chat_data["question"].map(len).max()
chat_data["answer"].map(len).max()

250

250

## Reshaping Data

### On Test Data

In [10]:
# Toy rows keyed by position; each row is a list of single-key dicts (q / a / s).
test = dict(
    enumerate(
        [
            [{"q": "q1"}, {"a": "ans1"}, {"s": "s1"}],
            [{"q": "q1"}, {"a": "ans2"}, {"s": "s2"}],
            [{"q": "q2"}, {"a": "ans1"}, {"s": "s3"}],
        ]
    )
)

test

{0: [{'q': 'q1'}, {'a': 'ans1'}, {'s': 's1'}],
 1: [{'q': 'q1'}, {'a': 'ans2'}, {'s': 's2'}],
 2: [{'q': 'q2'}, {'a': 'ans1'}, {'s': 's3'}]}

In [11]:
# Flatten every row to the single value stored in each of its one-key dicts.
df_array = [
    [next(iter(pair.values())) for pair in row]
    for row in test.values()
]

df_array

[['q1', 'ans1', 's1'], ['q1', 'ans2', 's2'], ['q2', 'ans1', 's3']]

In [100]:
# Turn the nested list into an array so whole columns can be sliced with [:, i].
df_array = np.asarray(df_array)
df_array
df_array.shape

array([['q1', 'ans1', 's1'],
       ['q1', 'ans2', 's2'],
       ['q2', 'ans1', 's3']], dtype='<U4')

(3, 3)

In [103]:
# The first column holds the questions; a set removes the repeats.
first_column = df_array[:, 0]
unique_questions = set(first_column)
unique_questions

{'q1', 'q2'}

In [131]:
# Show the toy rows without their integer keys.
list(test.values())

[[{'q': 'q1'}, {'a': 'ans1'}, {'s': 's1'}],
 [{'q': 'q1'}, {'a': 'ans2'}, {'s': 's2'}],
 [{'q': 'q2'}, {'a': 'ans1'}, {'s': 's3'}]]

### On Actual Data

In [16]:
# One {column: value} dict per row, in row order.
df_dict = [*chat_data.to_dict("index").values()]

# Reduce every row dict to the plain list of its values.
df_array = [list(record.values()) for record in df_dict]

In [17]:
# Turn the nested list into an array so whole columns can be sliced with [:, i].
df_array = np.asarray(df_array)
df_array.shape

(2129, 3)

In [18]:
# Sort before shuffling: the iteration order of a set of strings changes between
# interpreter runs, so a seed alone would not make the order reproducible.
unique_questions = sorted(set(df_array[:, 0]))
# A dedicated seeded generator gives the same question order on every run
# without touching the global `random` state.
random.Random(42).shuffle(unique_questions)
len(unique_questions)

824

In [19]:
# Print the length of any question still over the limit; silence means all are within MAX_LEN.
oversized_lengths = [len(q) for q in unique_questions if len(q) > MAX_LEN]
for length in oversized_lengths:
    print(length)

### Testing On Actual Data

In [232]:
# Pick one question and list every answer (with its score) recorded for it.
unique_questions[1]
chat_data[chat_data["question"] == unique_questions[1]][["answer", "score"]]

"my toddler wants her daddy to die when she's mad at him i told her that if daddy dies , we will never see him again . she started crying because i wouldn't make her daddy die . "

Unnamed: 0,answer,score
688,children often have a difficult time expressin...,381
689,toddlers don't have the intellectual capacity ...,119
690,it's normal for a child to be so angry she wan...,226


In [233]:
# Answers for one question, in random order.
scored = chat_data.loc[chat_data["question"] == unique_questions[1], ["answer", "score"]].sample(frac=1)
# The highest-scoring answer is the ground truth and must be the last candidate.
max_idx = scored["score"].idxmax()
true_ans = scored.loc[max_idx, "answer"]
answers = scored["answer"].drop(max_idx).tolist() + [true_ans]

In [234]:
# Inspect the ordered candidates and how many there are.
answers
len(answers)

["it's normal for a child to be so angry she wanted someone to die , but this seems to be heading into a dangerous realm . i think it would be wise to have her seen by a child psychologist just to rule out anything more serious . ",
 "toddlers don't have the intellectual capacity to conceptualize . better to find out why she wants daddy to die than explain the future consequences of death to your toddler . a person must be around 9 , possibly 8 years old before absorbing the potential effects of an action taken in present time . pretty much your toddler demonstrated to you here age appropriate and limited understanding of the loss of a parent , by crying that you weren't able to magically make her father disappear . also , please pay attention and form your own opinions as to the reasons why your toddler would wish her father's death . she may be pointing out that the father behaves in frightening or harmful ways towards her . ",
 "children often have a difficult time expressing emotio

3

## Transforming Into Needed Data Shape

In [20]:
# Seeded generator so the distractor order, and therefore the saved JSON,
# is the same on every run. One shared generator is used for all questions so
# that groups of equal size do not all receive the identical permutation.
candidate_rng = np.random.RandomState(42)

df_full = []

for q in unique_questions:
    scored = chat_data.loc[chat_data["question"] == q, ["answer", "score"]]
    # Shuffle so the distractors are not left in source order.
    scored = scored.sample(frac=1, random_state=candidate_rng)
    # The highest-scoring answer is the ground truth; it goes last in the
    # candidate list, after all the other answers.
    max_idx = scored["score"].idxmax()
    true_ans = scored.loc[max_idx, "answer"]
    answers = scored["answer"].drop(max_idx).tolist() + [true_ans]

    df_full.append(
        {
            "personality": [],
            "utterances": [
                {"candidates": answers, "history": [q]}
            ]
        }
    )

In [21]:
# Spot-check the candidate list of the first generated entry.
print(df_full[0]["utterances"][0]["candidates"])

['you wrote a very full family context w many directions to examine . do you know or can you find out what the grandma and your daughter were doing in the evening of the screaming episode ? is it possible the grandma abuses your daughter , or tells her']


In [22]:
# Sanity check: print any candidate or history text longer than MAX_LEN.
# No output means every text fits.
for record in df_full:
    first_utterance = record["utterances"][0]
    for text in (c for c in first_utterance["candidates"] if len(c) > MAX_LEN):
        print("\n\n\ncandidate:", text)
    for text in (h for h in first_utterance["history"] if len(h) > MAX_LEN):
        print("\n\nquestion:", text)

## Train Test Split

In [23]:
# Fraction of entries that go to the training set; the remainder is validation.
TRAIN_PCT = 0.8
# Compute the boundary once, from the constant, so both slices always agree.
split_idx = int(len(df_full) * TRAIN_PCT)
train = df_full[:split_idx]
val = df_full[split_idx:]

In [24]:
# Top-level layout of the output: the two splits under "train" and "valid".
df_final = dict(train=train, valid=val)

## Save to JSON

### Testing On Test Data

In [82]:
# Dialogue turns of the example conversation; each utterance's history is a
# progressively longer prefix of this list (1, 3, then 5 turns).
_EXAMPLE_TURNS = [
    "hi , how are you doing ? i'm getting ready to do some cheetah chasing to stay in shape .",
    "you must be very fast . hunting is one of my favorite hobbies .",
    "i am ! for my hobby i like to do canning or some whittling .",
    "i also remodel homes when i am not out bow hunting .",
    "that's neat . when i was in high school i placed 6th in 100m dash !",
]

# A single entry in the target format: a personality plus a list of utterances.
EXAMPLE_ENTRY = dict(
    personality=[
        "i like to remodel homes .",
        "i like to go hunting .",
        "i like to shoot a bow .",
        "my favorite holiday is halloween .",
    ],
    utterances=[
        dict(
            candidates=[
                "my mom was single with 3 boys , so we never left the projects .",
                "i try to wear all black every day . it makes me feel comfortable .",
                "well nursing stresses you out so i wish luck with sister",
            ],
            history=_EXAMPLE_TURNS[:1],
        ),
        dict(
            candidates=[
                "hello i am doing well how are you ?",
                "ll something like that . do you play games ?",
                "does anything give you relief ? i hate taking medicine ",
            ],
            history=_EXAMPLE_TURNS[:3],
        ),
        dict(
            candidates=[
                "yes they do but i say no to them lol",
                "i have trouble getting along with family .",
                "i live in texas , what kind of stuff do you do in ",
            ],
            history=_EXAMPLE_TURNS[:5],
        ),
    ],
)

In [83]:
# Miniature dataset in the final layout (3 training entries, 2 validation entries), serialised to JSON text.
test = json.dumps({"train": [EXAMPLE_ENTRY] * 3, "valid": [EXAMPLE_ENTRY] * 2})
test

'{"train": [{"personality": ["i like to remodel homes .", "i like to go hunting .", "i like to shoot a bow .", "my favorite holiday is halloween ."], "utterances": [{"candidates": ["my mom was single with 3 boys , so we never left the projects .", "i try to wear all black every day . it makes me feel comfortable .", "well nursing stresses you out so i wish luck with sister"], "history": ["hi , how are you doing ? i\'m getting ready to do some cheetah chasing to stay in shape ."]}, {"candidates": ["hello i am doing well how are you ?", "ll something like that . do you play games ?", "does anything give you relief ? i hate taking medicine "], "history": ["hi , how are you doing ? i\'m getting ready to do some cheetah chasing to stay in shape .", "you must be very fast . hunting is one of my favorite hobbies .", "i am ! for my hobby i like to do canning or some whittling ."]}, {"candidates": ["yes they do but i say no to them lol", "i have trouble getting along with family .", "i live in 

In [84]:
# Write the serialised example dataset to the working directory.
with open("./counsel_chat_data.json", mode="w") as json_file:
    json_file.write(test)

7002

### Saving Actual Data

In [26]:
# Take the clip length in the file name from MAX_LEN so the name cannot drift
# from the value actually used to clip the text.
OUTPUT_PATH = f"./datasets/counsel_chat_data_{MAX_LEN}-maxlen.json"
# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

df_final_json = json.dumps(df_final)
with open(OUTPUT_PATH, "w") as f:
    f.write(df_final_json)

781166