In [1]:
import os
os.environ['HF_HOME'] = '/run/cache/'

In [2]:
import pandas as pd
from typing import List
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
# from transformers import 

# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# import os
# import torch

In [3]:
import os
print(os.listdir('Mental_Hea/data/'))

['hea.parquet', 'Mental_Health_FAQ.csv', 'preprocessed_data.parquet']


In [4]:
pd.set_option('display.max_colwidth', None)

df = pd.read_parquet('Mental_Hea/data/preprocessed_data.parquet')

df.sample(2)

Unnamed: 0,id,question,answer
2055,8e16f86b-05bb-4b17-9473-6cc52a9f0534,i'm having trouble with my parents and i don't know how to communicate with them effectively.,let's work on communication skills to help you express your needs and feelings to your parents. would you be open to family therapy?
4119,2158ff7a-2595-4a66-9a47-08ee6bbcd93a,i'm having trouble with my motivation and productivity.,"motivation and productivity issues can be a common source of stress and anxiety, but it's important to address them before they become overwhelming. let's explore any underlying issues that may be contributing to your lack of motivation or productivity, and work on developing healthy habits and strategies to help you achieve your goals."


In [5]:
login(token=os.environ['HUGGINGFACE_HUB_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


In [6]:
class ModelSingleton:
    _instance = None
    
    @staticmethod
    def get_instance():
        if ModelSingleton._instance is None:
            ModelSingleton()
        return ModelSingleton._instance.model, ModelSingleton._instance.tokenizer
    
    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(ModelSingleton, cls).__new__(cls)
            cls._instance._initialize_model_and_tokenizer()
        return cls._instance
    
    def _initialize_model_and_tokenizer(self):
        model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # Update to Mistral 7B
        
        # Retrieve the Hugging Face token from environment variables
        token = os.getenv("HUGGINGFACE_HUB_TOKEN")
        if auto_token is None:
            print("Warning: HUGGINGFACE_HUB_TOKEN is not set.")
        
        # Configure quantization
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)

        # Load the model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=token)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name, quantization_config=quantization_config, torch_dtype=torch.float16, device_map="auto", use_auth_token=token
        )

# Usage
model, tokenizer = ModelSingleton.get_instance()




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
def generate_questions(context, answer, model, tokenizer):
    # Define the prompt template in Mistral 7B v2 instruction format
    prompt_template = (
        "<s>[INST] Context: {context}\nAnswer: {answer}\nPlease list 5 insightful questions based on this context and answer. [/INST]"
    )

#     prompt_template = (
#         "<s>[INST] Given the following context and answer, generate 5 insightful questions that can be used for evaluating the ground truth:\n"
#         "Context: {context}\n"
#         "Answer: {answer}\n"
#         "Questions: [/INST]"
#     )
    
    # Format the prompt with the provided context and answer
    prompt = prompt_template.format(context=context, answer=answer)
    
    # Tokenize the prompt and generate text
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    
    # Move inputs to the same device as the model
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    
    # Generate text using the model
    outputs = model.generate(input_ids, max_length=512, num_return_sequences=1, do_sample=True, pad_token_id=tokenizer.eos_token_id )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract questions from the generated text
    # Split generated text into lines
    lines = generated_text.split('\n')
    
    # Collect questions starting with numbers followed by a period
    questions = []
    for line in lines:
        if line.strip().startswith(f"{len(questions) + 1}."):
            question = line.strip().split('.', 1)[-1].strip()
            questions.append(question)
        if len(questions) >= 5:
            break
    
    return questions

# Example usage
if __name__ == "__main__":
    model, tokenizer = ModelSingleton.get_instance()
    
    # Sample context and answer
    context = "I have low self-esteem and feel like I'm not good enough. What can I do to improve my self-confidence?"
    answer = "It's important to challenge negative self-talk and practice self-care activities like exercise, spending time with loved ones, and engaging in hobbies. Additionally, seeking help from a therapist or mental health professional can provide support and guidance."
    
    questions = generate_questions(context, answer, model, tokenizer)
    for i, question in enumerate(questions, 1):
        print(f"Question {i}: {question.strip()}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [34]:
def generate_questions_for_sample(df, model, tokenizer, n=500,  random_state=None):
    questions_data = []

    # Limit the DataFrame to the first n records
    df_subset = df.sample(n=n, random_state=random_state)
    
    for _, row in df_subset.iterrows():
        record_id = row['id']
        context = row['question']
        answer = row['answer']
        
        # Generate questions
        questions = generate_questions(context, answer, model, tokenizer)
        
        # Append questions with their IDs
        for question in questions:
            questions_data.append({'id': record_id, 'question': question})
    
    return questions_data

# Example usage
if __name__ == "__main__":
    model, tokenizer = ModelSingleton.get_instance()

    questions_data = generate_questions_for_sample(df, model, tokenizer)
    
    # Create a DataFrame from the questions_data and save it to a parquet file
    questions_df = pd.DataFrame(questions_data)
    questions_df.to_parquet('generated_questions.parquet', index=False)


