In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset
from datasets import DatasetDict

In [2]:
from peft import LoraConfig, get_peft_model, PeftModel

2024-12-03 18:00:54.318844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733248854.333523  271940 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733248854.338086  271940 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 18:00:54.353627: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from tensorflow import keras

In [4]:
device = "cuda:1" if torch.cuda.is_available() else "cpu"

model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,
    trust_remote_code=True,
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Preprocess the dataset
dataset = load_dataset("json", data_files="/home/txie/Mental_Health/CS_584_Course_Project/MHNLP/fine_tuning_dataset.jsonl")

def preprocess_data(batch):
    results = {
        "input_ids": [],
        "attention_mask": [],
        "labels": []
    }

    for example in batch['messages']:
        prompt = ""
        completion = ""

        for message in example:
            if message["role"] in ["system", "user"]:
                prompt += f"{message['role'].capitalize()}: {message['content']}\n"
            elif message["role"] == "assistant":
                completion = message["content"]

        # Tokenize Prompt and Completion
        tokenized = tokenizer(
            text=prompt.strip(),
            text_target=completion.strip(),
            truncation=True,
            max_length=500,
            padding="max_length",
        )

        results["input_ids"].append(tokenized["input_ids"])
        results["attention_mask"].append(tokenized["attention_mask"])
        results["labels"].append(tokenized["labels"])

    return results

# Dataset preprocessing
tokenized_dataset = dataset.map(preprocess_data, batched=True)

# def tokenize_function(batch):
#     return tokenizer(batch["text"], truncation=True, padding=True, max_length=300)


# tokenized_dataset = dataset.map(tokenize_function, batched=True)

# def process_labels(examples):
#     examples["label"] = [int(label) for label in examples["label"]]
#     return examples

# tokenized_dataset = tokenized_dataset.map(process_labels)

train_test_split = tokenized_dataset["train"].train_test_split(test_size=0.2)

# Create a tokenized dataset with the train and test splits
tokenized_dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

lengths = [len(data["input_ids"]) for data in tokenized_dataset["train"]]
print(f"Average Length: {sum(lengths) / len(lengths)}")

print(tokenized_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Average Length: 500.0
DatasetDict({
    train: Dataset({
        features: ['messages', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['messages', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})


In [7]:
print(tokenized_dataset["train"][1])

{'messages': [{'role': 'system', 'content': 'You are a professional therapist. Your patient is telling you about their basic information and some answers to a mental health servey. Based on the information, please infer whether the patient has depression and/or anxiety disorders. 0: Neither, 1: Either or both. Based on this correspondence, you only need to answer the number.'}, {'role': 'user', 'content': "Age: 42. Sex: Female. Race: White. Sexual orientation: Bisexual, Queer. Financial situation right now: Rarely stressful. Financial situation while growing up: Always stressful. Within the last 12 months I never worried about not having stable housing. I am currently enrolled in a degree of Master's program. I am currently enrolled as a full-time student. My current overall GPA is no grade. I felt that emotional or mental difficulties have hurt my academic performance for none in the last 12 months. I am not sure that I am confident I will be able to finish my degree no matter what ch

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/home/txie/Mental_Health/CS_584_Course_Project/MHNLP/",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,
    evaluation_strategy="steps",
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
peft_model.save_pretrained("/home/txie/Mental_Health/CS_584_Course_Project/MHNLP/fine_tuned", save_embedding_layers=True)
tokenizer.save_pretrained("/home/txie/Mental_Health/CS_584_Course_Project/MHNLP/fine_tuned")