In [13]:
# Redirect the Hugging Face cache to /run/cache/. The `df -h` output further down
# shows /home/jovyan is only 2.0G while /run has tens of GB free.
# NOTE(review): this cell has a later execution count (13) than the model-loading
# cell (8); the variable presumably has to be set before `transformers` is imported
# to take effect — confirm with Restart Kernel -> Run All.
import os
os.environ['HF_HOME'] = '/run/cache/'

In [3]:
import pandas as pd
import uuid

def add_id_to_data(df):
    """Return a copy of ``df`` with a ``document`` column of short unique IDs.

    Each ID is the first 10 characters of a random UUID4 string. Because a
    truncated UUID can collide, a candidate that repeats an ID already issued in
    this call is regenerated, so every row gets a distinct ID.

    Args:
        df: DataFrame to tag. It is not modified.

    Returns:
        A new DataFrame with the same rows plus the ``document`` column. IDs are
        random, so they differ from one call to the next.
    """
    issued = set()
    ids = []
    for _ in range(len(df)):
        candidate = str(uuid.uuid4())[:10]
        # Only 10 characters of the UUID are kept, so uniqueness must be enforced
        while candidate in issued:
            candidate = str(uuid.uuid4())[:10]
        issued.add(candidate)
        ids.append(candidate)

    # Work on a copy so re-running the calling cell never alters the caller's frame
    tagged = df.copy()
    tagged['document'] = ids
    return tagged


# Read the preprocessed dataset and tag every row with a short document ID
df = add_id_to_data(pd.read_parquet('preprocessed_data.parquet'))

In [4]:
# Preview the first rows; the new `document` ID column should appear next to Context/Response
df.head()

Unnamed: 0,Context,Response,document
0,a few years ago i was making love to my wife w...,[first step always is to do a medical rule out...,cb02ebd1-4
1,a lot of times i avoid situations where i am t...,[hello and thank you for your question first i...,0c9816ff-e
2,a year ago the love of my life left me and nev...,[who takes care of your son is a significant p...,cd3e9d25-8
3,about 3 years ago or so i was skinny but i was...,[hey i am so impressed with your efforts to as...,42f38c04-1
4,about 5 months ago my ex left without fully ex...,[since you realize your sense of trust was bro...,1141a8ca-3


In [5]:
# Report disk usage per mount (shell escape); presumably a check that there is
# room for the model weights loaded in the next cell
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   35G   66G  35% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme0n1p1  100G   35G   66G  35% /run
tmpfs            14G     0   14G   0% /dev/shm
/dev/nvme2n1    2.0G  3.3M  1.9G   1% /home/jovyan
tmpfs            14G  124K   14G   1% /home/jovyan/.saturn
tmpfs            14G   12K   14G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G  4.4M  7.7G   1% /run/nvidia-persistenced/socket
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


In [8]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Checkpoint shared by the tokenizer and the seq2seq model
model_name = "google/flan-t5-large"

# Both objects are module-level globals used by generate_questions below
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

def generate_questions(context, num_questions=2, max_context_length=300):
    """Generate candidate user questions for one context passage.

    The context is clipped to ``max_context_length`` tokenizer tokens, wrapped in
    an instruction prompt, and passed to the module-level ``model``. Each
    returned sequence is decoded into one question string.

    Args:
        context: Free-text passage to base the questions on.
        num_questions: Number of sequences requested from ``model.generate``;
            also used as the beam count.
        max_context_length: Maximum number of tokens of ``context`` kept in the
            prompt.

    Returns:
        A list of decoded question strings. The list is empty when ``context``
        is not a non-blank string or when ``num_questions`` is less than 1.
        Sampling is enabled and no seed is set here, so results can differ
        between calls.
    """
    # Missing values (None / NaN coming from a DataFrame) and blank text give the
    # model nothing to work with, so skip generation instead of crashing
    if not isinstance(context, str) or not context.strip():
        return []
    # generate() cannot return fewer than one sequence
    if num_questions < 1:
        return []

    # Truncate the context if it's too long
    context_tokens = tokenizer.encode(context, add_special_tokens=False, truncation=True, max_length=max_context_length)
    truncated_context = tokenizer.decode(context_tokens)

    prompt = f"""
    You are someone experiencing mental health challenges. Based on the following context, create {num_questions} questions that you might ask a mental health professional or support group:

    Context: {truncated_context}

    Generate {num_questions} questions:
    """

    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    # Keep the inputs on the model's device so the call still works if the model
    # has been moved to a GPU
    encoded = encoded.to(model.device)

    outputs = model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=150,
        num_return_sequences=num_questions,
        # The beam count must be at least the number of returned sequences
        num_beams=num_questions,
        do_sample=True,
        temperature=0.7,
        no_repeat_ngram_size=2
    )

    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Try the pipeline on a small, reproducible sample before running the full dataset
sample_df = df.sample(n=10, random_state=42)

# One output row per generated question, tagged with the ID of its source document
new_rows = [
    {'question': question, 'document': doc_id}
    for context, doc_id in zip(sample_df['Context'], sample_df['document'])
    for question in generate_questions(context)
]

# Naming the columns up front fixes their order and keeps the frame well-formed
# (two empty columns) even if no questions were produced
questions_df = pd.DataFrame(new_rows, columns=['question', 'document'])

In [10]:
# Inspect the first generated questions and the document IDs they are linked to
questions_df.head()

Unnamed: 0,question,document
0,What may be the reason for you being in a wome...,e739b2db-1
1,What may be the reason for you not seeing your...,e739b2db-1
2,How long has it been since your mother's death?,2510448d-8
3,How long has it been since your mother died?,2510448d-8
4,What may be the reason for you to be angry wit...,db959012-6


In [11]:
# Show the source frame again; presumably a check that the sample run left it unchanged
df.head()

Unnamed: 0,Context,Response,document
0,a few years ago i was making love to my wife w...,[first step always is to do a medical rule out...,cb02ebd1-4
1,a lot of times i avoid situations where i am t...,[hello and thank you for your question first i...,0c9816ff-e
2,a year ago the love of my life left me and nev...,[who takes care of your son is a significant p...,cd3e9d25-8
3,about 3 years ago or so i was skinny but i was...,[hey i am so impressed with your efforts to as...,42f38c04-1
4,about 5 months ago my ex left without fully ex...,[since you realize your sense of trust was bro...,1141a8ca-3


In [14]:
# Full run: generate questions for every row of df (one model call per row).
# This overwrites the questions_df built from the 10-row sample above.
new_rows = [
    {'question': question, 'document': doc_id}
    for context, doc_id in zip(df['Context'], df['document'])
    for question in generate_questions(context)
]

# Naming the columns up front fixes their order and keeps the frame well-formed
# (two empty columns) even if no questions were produced
questions_df = pd.DataFrame(new_rows, columns=['question', 'document'])

In [15]:
# Size of the generated set; with the default of 2 questions per context the row
# count should be twice the number of source rows
questions_df.shape

(1662, 2)

In [17]:
# Spot-check a random handful of generated questions (no random_state, so the rows differ per run)
questions_df.sample(10)

Unnamed: 0,question,document
584,What might I do if I am worried about something?,2616ddbe-a
455,What may be the reason for you wearing lingerie?,64b6dbcf-b
1419,What may be the reason for you to be apart fro...,8a099260-d
1107,What may be the reason for you feeling like this?,02d20d54-6
1299,What may be the reason why you aren't interest...,c0d80d3d-2
1052,What may be the reason you distanced yourself ...,1c309951-0
1557,What might be the reason for you being anxious...,06215297-8
702,What might be the reason you want to move out?,215a6cf0-c
855,What may be the reason she is angry?,0a6fa060-f
118,What may be the reason for you being down and ...,7af158e0-8


In [18]:
# Report disk usage per mount again, after model loading and the full generation run
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   38G   63G  38% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme0n1p1  100G   38G   63G  38% /run
tmpfs            14G  4.0K   14G   1% /dev/shm
/dev/nvme2n1    2.0G  3.3M  1.9G   1% /home/jovyan
tmpfs            14G  124K   14G   1% /home/jovyan/.saturn
tmpfs            14G   12K   14G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G  8.2M  7.7G   1% /run/nvidia-persistenced/socket
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


In [19]:
# Persist both artifacts: the source data with its IDs, then the generated questions
for frame, target in (
    (df, 'mental_health_data_with_ids.parquet'),
    (questions_df, 'ground_truth_data.parquet'),
):
    frame.to_parquet(target)

In [20]:
# Round-trip check: reload the saved documents file and display its (rows, columns)
df1 = pd.read_parquet('mental_health_data_with_ids.parquet')
df1.shape

(831, 3)

In [22]:
# Eyeball ten random rows of the reloaded documents frame (unseeded sample)
df1.sample(10)

Unnamed: 0,Context,Response,document
264,im dealing with an illness that will never go ...,[is it possible that even though your illness ...,8c434cba-5
249,im a teenager and i just got my first job i am...,[getting your first job is an exciting terrify...,4ab46a4d-d
634,i think adult life is making him depressed and...,[how do you help yourself to believe you requi...,8b309f9b-2
454,we live with my expartners sister and her husb...,[hi new york im happy to hear that your daught...,aa4f24f2-1
157,i have no friends no hobbies and no interest i...,[hello and thank you for your question it cert...,9a92800e-2
738,my dad beat and mentally abused me so badly th...,[i am so sorry to hear that you are struggling...,d5e21679-8
341,my boyfriend is in recovery from drug addictio...,[my empathy goes out to you relationships are ...,5b00fb0e-2
636,i told him i liked him he called me lovely and...,[well as disappointed as you may very well fee...,87979059-7
564,i got engaged and everything was going well th...,[somehow you knew the marriage wouldnt have a ...,07a4a324-2
78,i been having anger problems a lot lately it o...,[with me knowing that a healthy relationship u...,5b1082aa-e


In [24]:
# Round-trip check: reload the saved questions file and display its (rows, columns)
df2 = pd.read_parquet('ground_truth_data.parquet')
df2.shape

(1662, 2)

In [26]:
# Look at the last ten reloaded questions; consecutive rows share a document ID
df2.tail(10)

Unnamed: 0,question,document
1652,What might be the reason for you not cutting?,a61f42d3-d
1653,What might be the reason for you not cutting a...,a61f42d3-d
1654,What might you do if you want to cut yourself?,3a65764a-e
1655,What may happen after you cut yourself?,3a65764a-e
1656,What might happen if you move in with your gir...,33b14d46-a
1657,What might happen if you moved in with your gi...,33b14d46-a
1658,What might be the reason for your boyfriend's ...,b5df98a6-c
1659,What might be the reason for your boyfriend's ...,b5df98a6-c
1660,What may be the reason you are attracted to ol...,310dbd5b-8
1661,What may be the reason for your attraction to ...,310dbd5b-8
