In [6]:
import os 
from dotenv import load_dotenv
import pandas as pd 
import openai
import huggingface 
from enum import Enum 

Loading mental health data from Hugging Face (use the HF CLI access token for this project, stored in the .env file)

In [11]:
from datasets import load_dataset
   # Load the dataset
# Fetch the counseling-conversations corpus; the result is indexable by split name.
dataset = load_dataset(
    "Amod/mental_health_counseling_conversations",
)

# Work with the 'train' split from here on
# (swap in 'validation' or 'test' only if the dataset actually provides them).
train_data = dataset["train"]

README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

Save to disk

In [17]:
# Persist the loaded 'train' split locally under the path 'dataset'.
train_data.save_to_disk('dataset')


Saving the dataset (0/1 shards):   0%|          | 0/3512 [00:00<?, ? examples/s]

# Defining the explicit role as natural language input

In [64]:
# The OpenAI API expects data in a structured input format:
# json

# {
#  "messages": [
#    { "role": "<role_name_1>", "content": "<custom_content>" },
#    { "role": "<role_name_2>", "content": "<custom_content>" },
#    { "role": "<role_name_3>", "content": "<custom_content>" }
#  ]
# }


## Enum classes are needed to constrain the valid inputs for the role variable and its content

In [None]:
from enum import Enum
from typing import List, Dict

# 1. Define an Enum class for RoleType
class RoleType(Enum):
    """Allowed chat roles; each member's value is the string stored under the 'role' key of a message entry."""
    USER = "user"
    SYSTEM = "system"
    ASSISTANT = "assistant"

# 2. Role class to represent a role and its associated content
class Role(object):
    """A single chat entry: one role paired with its text content."""

    def __init__(self,
                 role_type: RoleType,
                 content: str):
        """
        Build one role/content entry for a chat-format message.

        Args:
            role_type (RoleType): The type of role. A raw role string such as
                "user" is also accepted and coerced to the matching member.
            content (str): The content of the role

        Raises:
            ValueError: If ``role_type`` is neither a ``RoleType`` member nor
                one of the member values.
        """
        # RoleType(...) returns the member unchanged when given a member and
        # looks it up by value otherwise, so invalid roles fail here with a
        # clear ValueError instead of an AttributeError on `.value`.
        role_type = RoleType(role_type)
        self.role = role_type.value
        self.content = content
        # Dict form of this entry, ready to be placed in a 'messages' list.
        self.value = {'role': self.role, 'content': self.content}


# 3. Message class: bundles one system, one user and one assistant entry
#    into a single chat-format record
class Message(object):
    """One training example holding a system prompt, a user turn and an assistant reply."""

    def __init__(self, user_content, system_content, assistant_content):
        """
        Wrap each piece of text in a Role and assemble the combined record.

        Args:
            user_content: Text for the 'user' entry.
            system_content: Text for the 'system' entry.
            assistant_content: Text for the 'assistant' entry.
        """
        self.user_role = Role(content=user_content, role_type=RoleType.USER)
        self.system_role = Role(content=system_content, role_type=RoleType.SYSTEM)
        self.assistant_role = Role(content=assistant_content, role_type=RoleType.ASSISTANT)

        # Entries are emitted in system -> user -> assistant order.
        ordered_roles = (self.system_role, self.user_role, self.assistant_role)
        self.message = {'messages': [entry.value for entry in ordered_roles]}


In [63]:
# Pull a single row once and reuse it for both fields.
sample_row = train_data[152]
context = sample_row['Context']
response = sample_row['Response']
system_content = "You serve as a supportive and honest psychology and psychotherapy assistant. Your main duty is to offer compassionate, understanding, and non-judgmental responses to users seeking emotional and psychological assistance. Respond with empathy and exhibit active listening skills. Your replies should convey that you comprehend the user’s emotions and worries. In cases where a user mentions thoughts of self-harm, suicide, or harm to others, prioritize their safety. Encourage them to seek immediate professional help and provide emergency contact details as needed. It’s important to note that you are not a licensed medical professional. Refrain from diagnosing or prescribing treatments. Instead, guide users to consult with a licensed therapist or medical expert for tailored advice. Never store or disclose any personal information shared by users. Uphold their privacy at all times. Avoid taking sides or expressing personal viewpoints. Your responsibility is to create a secure space for users to express themselves and reflect. Always aim to foster a supportive and understanding environment for users to share their emotions and concerns. Above all, prioritize their well-being and safety."
message_obj = Message(
    user_content=context,
    system_content=system_content,
    assistant_content=response,
)

# Eyeball the assembled record to confirm it has the expected chat structure.
print(message_obj.message)

# Open question: could a pydantic model replace the hand-written Role/Message classes?

{'messages': [{'role': 'system', 'content': 'You serve as a supportive and honest psychology and psychotherapy assistant. Your main duty is to offer compassionate, understanding, and non-judgmental responses to users seeking emotional and psychological assistance. Respond with empathy and exhibit active listening skills. Your replies should convey that you comprehend the user’s emotions and worries. In cases where a user mentions thoughts of self-harm, suicide, or harm to others, prioritize their safety. Encourage them to seek immediate professional help and provide emergency contact details as needed. It’s important to note that you are not a licensed medical professional. Refrain from diagnosing or prescribing treatments. Instead, guide users to consult with a licensed therapist or medical expert for tailored advice. Never store or disclose any personal information shared by users. Uphold their privacy at all times. Avoid taking sides or expressing personal viewpoints. Your responsib

## Sampling 100 context:response pairs for fine-tuning LLM
#### Note: One could question why we sample randomly. Wouldn't it be better to fine-tune using a properly defined cohort of conversations that pertain to certain classes of mental health problems? Maybe. It can be argued that most mental health problems and the appropriate responses in natural language are highly correlated.

In [75]:
# Sample 100 items from the 'train' split
import random
from loguru import logger
# Draw the sample WITHOUT replacement. random.choices() samples with
# replacement, so the same row could appear several times and end up in both
# the training and the validation file written later.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42  # fixed seed so Restart & Run All reproduces the same sample

sampling_rng = random.Random(SAMPLE_SEED)
# Sample row positions rather than the dataset object itself: random.sample()
# needs a real sequence, and range() is one regardless of the dataset's type.
sampled_indices = sampling_rng.sample(
    range(len(train_data)),
    k=min(SAMPLE_SIZE, len(train_data)),
)
sampled_dataset = [train_data[row_index] for row_index in sampled_indices]
train_dataset = []

# Print the sampled data to verify
print(sampled_dataset[1])


{'Context': "My husband and I had our first threesome recently. Everyone was drinking and he was on her more then me.    He and I talked about it afterwards and it made me feel better, and now I'm craving more of it. But before it gets close to happening I get this empty feeling. Why am I feeling this way?", 'Response': 'Hello there.\xa0 As you have courageously explained your soulful dilemma. I can appreciate the complexity of this situation.\xa0 You have identified some key factors that may be contributing to your sense of feeling "empty".\xa0 One, is the ultimate goal here able to be acquired from this arrangement?\xa0 Are you trying to have your fulfillment with another woman while in the presence of your husband but not with him \'on her\' as much or at all?\xa0 Are you trying to ask him to be more passive participant?\xa0 Perhaps be careful of not drinking too heavily... In the whole event, how do you want to feel intimate or connected ?\xa0 Were you craving all along, him to rea

In [79]:
# Rebuild the list from scratch on every run. Appending to a list created in
# another cell made this cell non-idempotent: running it twice silently
# duplicated every example.
train_dataset = [
    Message(
        user_content=row['Context'],
        system_content=system_content,
        assistant_content=row['Response'],
    ).message
    for row in sampled_dataset
]

# Print a sample message to verify
print(train_dataset[1])

{'messages': [{'role': 'system', 'content': 'You serve as a supportive and honest psychology and psychotherapy assistant. Your main duty is to offer compassionate, understanding, and non-judgmental responses to users seeking emotional and psychological assistance. Respond with empathy and exhibit active listening skills. Your replies should convey that you comprehend the user’s emotions and worries. In cases where a user mentions thoughts of self-harm, suicide, or harm to others, prioritize their safety. Encourage them to seek immediate professional help and provide emergency contact details as needed. It’s important to note that you are not a licensed medical professional. Refrain from diagnosing or prescribing treatments. Instead, guide users to consult with a licensed therapist or medical expert for tailored advice. Never store or disclose any personal information shared by users. Uphold their privacy at all times. Avoid taking sides or expressing personal viewpoints. Your responsib

# Saving training and validation in jsonl for fine-tuning

In [92]:
# Function to save data in JSONL format
import json
import os

def save_to_jsonl(data, file_path):
    """
    Write an iterable of JSON-serializable rows to a JSON Lines file.

    Each row is serialized with ``json.dumps`` and written on its own line.
    An existing file at ``file_path`` is overwritten.

    Args:
        data: Iterable of JSON-serializable objects (dicts in this notebook).
        file_path: Destination path. Missing parent directories are created.
            A bare filename with no directory part is written to the current
            working directory.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create directories when there are any.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        for row in data:
            file.write(json.dumps(row) + '\n')

# Destination files for the fine-tuning examples.
training_data_path = 'src/fine_tuning_data_pairs/train.jsonl'
validation_data_path = 'src/fine_tuning_data_pairs/validation.jsonl'

# Number of examples held out for validation, taken from the end of the list.
VALIDATION_SIZE = 5

# Plain list slicing: everything except the last VALIDATION_SIZE items is
# used for training ...
save_to_jsonl(train_dataset[:-VALIDATION_SIZE], training_data_path)

# ... and the last VALIDATION_SIZE items become the validation set.
save_to_jsonl(train_dataset[-VALIDATION_SIZE:], validation_data_path)

### Fine-Tuning


In [96]:
import os
from dotenv import load_dotenv

# Read variables from the local .env file into the process environment.
load_dotenv()

api_key = os.getenv('OPEN_AI_API_KEY')
if not api_key:
    # logger.error() returns None, so `raise logger.error(...)` would fail with
    # "TypeError: exceptions must derive from BaseException". Log first, then
    # raise a real exception that names the actual problem.
    logger.error("Failed to load API key.")
    raise RuntimeError("Failed to load API key.")

logger.info("API key loaded successfully.")


[32m2024-12-03 14:35:57.215[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mAPI key loaded successfully.[0m


In [101]:
client = openai.OpenAI(api_key=api_key)
import os

# Upload the training and validation files for fine-tuning.
# `client.files.create` takes the file object to upload (`file`) and the reason
# for the upload (`purpose`); 'fine-tune' marks the files as fine-tuning data.
#
# The files are opened in binary mode and wrapped in `with` blocks so the
# handles are closed as soon as each upload returns. Local handle names are
# used instead of rebinding `training_data_path` / `validation_data_path`, so
# those names keep referring to path strings.

# First, upload the training data file.
with open('src/fine_tuning_data_pairs/train.jsonl', 'rb') as training_file:
    training_response = client.files.create(file=training_file, purpose='fine-tune')
# Keep the uploaded file's identifier; the fine-tuning job refers to it.
training_file_id = training_response.id
logger.info(f"Training file ID: {training_file_id}")

# Next, upload the validation data file the same way.
with open('src/fine_tuning_data_pairs/validation.jsonl', 'rb') as validation_file:
    validation_response = client.files.create(file=validation_file, purpose='fine-tune')
validation_file_id = validation_response.id
logger.info(f"Validation file ID: {validation_file_id}")


[32m2024-12-03 14:43:48.781[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mTraining file ID: file-Jr4DVnHnGskXHkU2DgU57L[0m
[32m2024-12-03 14:43:48.783[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mValidation file ID: file-Kua3s3aQsnMSGij1eYFp9U[0m


In [115]:
# Request a fine-tuning job that uses the two uploaded files.
# Alternative base model that was considered: 'gpt-4o-mini-2024-07-18'
job_request = {
    'training_file': training_file_id,
    'validation_file': validation_file_id,
    'model': 'gpt-3.5-turbo-0125',
    'hyperparameters': {'n_epochs': 3},
    'suffix': 'mental-health-assistant-2024-12-03',
}
response = client.fine_tuning.jobs.create(**job_request)

# Record the job ID, then show the full response object.
logger.info(f"Fine-tuning job created with ID: {response.id}")
print(response)

BadRequestError: Error code: 400 - {'error': {'message': 'Model gpt-3.5-turbo-0613 is not available for fine-tuning or does not exist.', 'type': 'invalid_request_error', 'param': None, 'code': 'model_not_available'}}

In [107]:
# It turns out the job failed with this message:
# The job failed due to an invalid validation file. This training file was blocked by our moderation system because it contains too many examples that violate OpenAI's usage policies, or because it attempts to create model outputs that violate OpenAI's usage policies.

## Content policy moderation flags... using Hugging Face instead as an open-source alternative

In [114]:
# from huggingface import transformers
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification

# from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification

#    # Load the model
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [None]:
import json

# Training settings that get written to config.json.
# NOTE(review): train_file and validation_file point at the same file, so
# validation would run on the training data; confirm this is intended.
config = dict(
    model_name="distilbert-base-uncased",
    train_file="raw_mental_health_dataset/combined_dataset.json",
    validation_file="raw_mental_health_dataset/combined_dataset.json",
    output_dir="output/",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=5e-5,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="logs/",
)

# Serialize the settings and store them in the working directory.
with open('config.json', 'w') as f:
    f.write(json.dumps(config))