In [1]:
import os

# Redirect the Hugging Face cache directory to /run/cache/.
# NOTE(review): presumably set here so it takes effect before the Hugging Face
# libraries are imported in the next cell — confirm.
os.environ.update({'HF_HOME': '/run/cache/'})

In [2]:
import pandas as pd
# import json
from typing import List
# import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login

In [6]:
import os
# List the data directory to confirm which input files are available before loading.
print(os.listdir('Mental_Hea/data/'))

['hea.parquet', 'Mental_Health_FAQ.csv', 'preprocessed_data.parquet']


In [7]:
# Never truncate cell text when rendering frames, so full questions/answers are visible.
pd.options.display.max_colwidth = None

# Load the preprocessed question/answer records used by the rest of the notebook.
df = pd.read_parquet(path='Mental_Hea/data/preprocessed_data.parquet')

# Peek at two random rows (unseeded, so the rows differ between runs).
df.sample(n=2)

Unnamed: 0,id,question,answer
6747,8c908f3e-6ac5-4538-9777-0b8fa31ee8ab,"i'm having trouble sleeping, what should i do?",let's explore what might be causing your insomnia. are you experiencing any stress or anxiety? we can work on developing healthy sleep habits and relaxation techniques to help you sleep better.
1373,9d8bd2ff-24c2-4752-9633-52dfeaa3d05e,i'm having trouble with my body image.,"body image issues can be difficult to deal with, but it's important to recognize that everyone has their own unique beauty. let's work on developing a more positive self-image and finding ways to appreciate your body for all it does for you."


In [14]:
# import os
from typing import Tuple
import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

class ModelSingleton:
    """Lazily loaded, shared holder for the Llama-2 model and its tokenizer.

    The first construction (or the first ``get_instance()`` call) loads the
    model and tokenizer; every later call returns the same objects, so
    re-running notebook cells does not reload the weights.

    The shared instance is only published after loading has fully succeeded.
    If loading raises, nothing is cached and the next call retries from
    scratch instead of handing back a half-initialized object.
    """

    # The single shared instance; remains None until a load completes.
    _instance = None

    @staticmethod
    def get_instance() -> "Tuple[AutoModelForCausalLM, AutoTokenizer]":
        """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

        Raises:
            Whatever ``_initialize_model_and_tokenizer`` raises when the load
            fails; in that case no instance is cached.
        """
        instance = ModelSingleton()
        return instance.model, instance.tokenizer

    def __new__(cls):
        """Create and initialize the shared instance once, then keep returning it."""
        if cls._instance is None:
            candidate = super().__new__(cls)
            # Initialize BEFORE publishing: a failure here must not leave a
            # broken instance cached in cls._instance.
            candidate._initialize_model_and_tokenizer()
            cls._instance = candidate
        return cls._instance

    def _initialize_model_and_tokenizer(self):
        """Load the tokenizer and the 8-bit quantized model onto ``self``.

        Reads the access token from the ``HUGGINGFACE_HUB_TOKEN`` environment
        variable; a missing token only produces a printed warning, and the
        load is still attempted.
        """
        model_name = "meta-llama/Llama-2-7b-hf"

        # Retrieve the Hugging Face token from environment variables
        token = os.getenv("HUGGINGFACE_HUB_TOKEN")
        if token is None:
            print("Warning: HUGGINGFACE_HUB_TOKEN is not set.")

        # Configure quantization
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        # Load the model and tokenizer
        # NOTE(review): newer transformers releases prefer `token=` over
        # `use_auth_token=` — confirm against the installed version before changing.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            torch_dtype=torch.float16,
            device_map="auto",
            use_auth_token=token,
        )

# Load (or reuse) the shared model/tokenizer pair; the first call is the one
# that actually loads the weights.
model, tokenizer = ModelSingleton.get_instance()

# Print both objects as a quick sanity check that loading succeeded.
print(f"Model: {model}")
print(f"Tokenizer: {tokenizer}")




Downloading config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
  

In [11]:
# Take the first row (by position) as a worked example; change the index to inspect another record.
example_record = df.iloc[0]
example_record

id                                                                                                                                                                                                                                                           2246f22c-9316-4de9-9b90-7fe4f9b80792
question                                                                                                                                                                                                                                  i'm feeling really anxious lately and i don't know why.
answer      it's common to feel anxious at times, and there can be many reasons for it. have there been any recent changes or stressors in your life that may be contributing to your anxiety? let's work together to identify any triggers and develop coping strategies to manage your anxiety.
Name: 0, dtype: object

In [30]:
# Show one random record (unseeded, so a different row on each run).
df.sample(1)

Unnamed: 0,id,question,answer
4026,9cb38caa-74fc-4e4f-8caa-36d7e5b8c8f3,i have low self-esteem and feel like i'm not good enough. what can i do to improve my self-confidence?,"it's important to challenge negative self-talk and practice self-care activities like exercise, spending time with loved ones, and engaging in hobbies. additionally, seeking help from a therapist or mental health professional can provide support and guidance."


In [33]:
def _extract_numbered_questions(text, limit):
    """Collect up to ``limit`` questions from consecutively numbered lines (``1.``, ``2.``, ...)."""
    questions = []
    for raw_line in text.split('\n'):
        if len(questions) >= limit:
            break
        line = raw_line.strip()
        # Only accept the next expected number, so stray or repeated numbering is ignored.
        if line.startswith(f"{len(questions) + 1}."):
            questions.append(line.split('.', 1)[-1].strip())
    return questions


def generate_questions(context, answer, model, tokenizer, num_questions=5):
    """Ask the model for evaluation questions about a context/answer pair.

    Args:
        context: Text inserted after ``Context:`` in the prompt.
        answer: Text inserted after ``Answer:`` in the prompt.
        model: Causal language model exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching ``model``; called on the prompt and used
            to decode the generated tokens.
        num_questions: How many questions to request and, at most, return.
            Defaults to 5.

    Returns:
        A list of question strings in the order they were generated. It can
        hold fewer than ``num_questions`` items (or be empty) when the model
        does not emit consecutively numbered lines. Generation uses sampling,
        so results vary between calls.
    """
    # Define the prompt template
    prompt_template = (
        "Given the following context and answer, generate {num_questions} insightful questions that can be used for evaluating the ground truth:\n\n"
        "Context: {context}\n"
        "Answer: {answer}\n\n"
        "Questions:\n"
    )
    prompt = prompt_template.format(num_questions=num_questions, context=context, answer=answer)

    # Tokenize the prompt and move it to the same device as the model
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(model.device)

    # NOTE: max_length counts the prompt tokens as well as the generated ones.
    outputs = model.generate(input_ids, max_length=512, num_return_sequences=1, do_sample=True)

    # A causal LM returns the prompt followed by the continuation. Decode only
    # the continuation, otherwise numbered lines that happen to appear inside
    # `context` or `answer` would be mistaken for generated questions.
    prompt_length = input_ids.shape[-1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return _extract_numbered_questions(generated_text, num_questions)

# Example usage: generate questions for one hand-written record and print them.
if __name__ == "__main__":
    # Reuses the already-loaded model/tokenizer pair if one exists.
    model, tokenizer = ModelSingleton.get_instance()

    # Sample context and answer
    context = "i have low self-esteem and feel like i'm not good enough. what can i do to improve my self-confidence?"
    answer = "it's important to challenge negative self-talk and practice self-care activities like exercise, spending time with loved ones, and engaging in hobbies. additionally, seeking help from a therapist or mental health professional can provide support and guidance"

    questions = generate_questions(context, answer, model, tokenizer)
    # Number the printed questions starting from 1.
    for i, question in enumerate(questions, 1):
        print(f"Question {i}: {question.strip()}")

Question 1: What specific actions can you take to challenge your negative self-talk?
Question 2: How can you practice self-care on a regular basis?
Question 3: How can you seek help from a therapist or mental health professional?
Question 4: What are some other strategies that you can use to improve your self-confidence?
Question 5: How can you practice self-compassion and accept yourself as you are?


In [34]:
def generate_questions_for_sample(df, model, tokenizer, n=500, random_state=None):
    """Generate questions for a random sample of records.

    Args:
        df: DataFrame with ``id``, ``question`` and ``answer`` columns.
        model: Model passed through to ``generate_questions``.
        tokenizer: Tokenizer passed through to ``generate_questions``.
        n: Number of rows to sample. Capped at ``len(df)`` so that small
            frames are processed in full instead of raising.
        random_state: Forwarded to ``DataFrame.sample`` to make the row
            selection repeatable.

    Returns:
        A list of ``{'id': ..., 'question': ...}`` dicts, one per generated
        question, where ``id`` is the id of the source record.
    """
    # DataFrame.sample (without replacement) raises ValueError when asked for
    # more rows than exist, so never request more than the frame holds.
    sample_size = min(n, len(df))
    if sample_size == 0:
        return []

    # Randomly pick the rows to process (not the first n).
    df_subset = df.sample(n=sample_size, random_state=random_state)

    questions_data = []
    for record_id, context, answer in zip(df_subset['id'], df_subset['question'], df_subset['answer']):
        questions = generate_questions(context, answer, model, tokenizer)
        # Tag every generated question with the id of the record it came from.
        questions_data.extend({'id': record_id, 'question': question} for question in questions)

    return questions_data

# Example usage: generate questions for a random sample of `df` and persist them.
if __name__ == "__main__":
    # Reuses the already-loaded model/tokenizer pair if one exists.
    model, tokenizer = ModelSingleton.get_instance()

    # Uses the function's defaults (n=500, unseeded), so the selected rows differ between runs.
    questions_data = generate_questions_for_sample(df, model, tokenizer)

    # Create a DataFrame from the questions_data and save it to a parquet file
    questions_df = pd.DataFrame(questions_data)
    questions_df.to_parquet('generated_questions.parquet', index=False)


