In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from sklearn.model_selection import train_test_split

In [3]:
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

In [4]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/'CS 7650'

df = pd.read_excel('reddit_data.xlsx')

# Report the number of sentences.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows from the data.
df.sample(10)

Mounted at /content/drive
/content/drive/MyDrive/CS 7650
Number of training sentences: 24,057



Unnamed: 0,ID,Title,Text,Date,Label,Is_MH
9825,1ft2ekq,Which of these U.S. cities should I visit for ...,Hi! \n\nI'm looking to go with my partner on a...,2024-09-30 18:13:55,none,No
9054,1fsgjoe,Indiana,Anyone know a good psychiatrist in Indiana ?,2024-09-29 22:28:00,anxiety,Yes
5733,1fk0l2w,What are some coping skills you've learned?,I have therapy tomorrow and were definitely go...,2024-09-18 18:56:28,bipolar,Yes
7027,1foet19,I'm going to die a virgin,Throwaway account:\n\nFor context I’m 24(M). N...,2024-09-24 15:08:57,ptsd,Yes
3850,1f2wp32,food actually looks weird right now.,has anyone experienced food looking spoiled an...,2024-08-28 00:22:09,anorexia,Yes
310,11nz89p,Tf do normal people on diets do when going ove...,Seriously like I don’t understand what the oth...,2023-03-10 20:02:36,anorexia,Yes
23315,vjha6m,My housemate keeps commenting on my OCD and it...,Today my housemate decided to inform me that *...,2022-06-24 05:52:28,OCD,Yes
1962,1azoa15,todays lunch before my figure skating practice,,2024-02-25 13:46:00,anorexia,Yes
4869,1fde21j,I need to reach out but I’m scared,I’ve never been told by anyone that I have an ...,2024-09-10 09:51:57,anorexia,Yes
16826,c8i25z,Does anyone else dissociate when there are lar...,I feel overwhelmed and dissociated whenever I ...,2019-07-03 01:43:15,ptsd,Yes


In [5]:
df.shape

(24057, 6)

In [6]:
df['Title'] = df['Title'].fillna('')
df['Text'] = df['Text'].fillna('')
df['Full_Text'] = df['Title'] + ' ' + df['Text']
df['Full_Text'] = df['Full_Text'].astype(str)
df['Label'] = np.where(df['Label'] == 'anorexia', 'eating disorder', df['Label'])
df_subset1 = df[['ID', 'Full_Text','Label']]
df_subset1 = df_subset1[0:24000]
df_subset1 = df_subset1.rename(columns={'Full_Text' : 'text'})
df_subset1 = df_subset1.rename(columns={'text' : 'statement', 'Label': 'status'})
df_subset1.head()

Unnamed: 0,ID,statement,status
0,100jk9m,The Person Who Traumatized You DMed you on Ins...,ptsd
1,100t81n,PTSD from antibiotics I took an antibiotic cal...,ptsd
2,100xfwl,guys i made a phone call!!!!!!!! basically my ...,anxiety
3,100y8dh,The Time when I was Sexually Harassed online (...,ptsd
4,101kgaa,Having a panic attack due to real event. need ...,OCD


In [7]:
df_subset1.shape

(24000, 3)

In [8]:
#Create a function to clean text in the datafrmae
def clean_text(text):

  # Remove emojis
  text = re.sub(r'[:;][\w\d-]*[:;]', '', text)

  # Remove emoticons
  text = re.sub(r'\([^\)]*\)', '', text)

  # Remove hyperlinks
  text = re.sub(r'https?://[^\s]+', '', text)

  # Remove non-English characters
  text = re.sub(r'[^\w\s]', '', text)

  # Convert all text into lowercase
  text = text.lower()

  return text

In [9]:
import re
df_subset1['statement'] = df_subset1['statement'].apply(clean_text)
df_subset1.sample(10)

Unnamed: 0,ID,statement,status
8722,1fs0vyf,win i went to a big fleamarket despite being s...,OCD
8908,1fsbec6,how far in advance for eurotar tickets hey eve...,none
14649,4pvwlx,psa when trying to comfort someone with anxiet...,anxiety
7284,1fpaeej,do you write in order or does it get jumbled u...,none
22289,qcqq2f,the excess fat on my body feels unnatural i ca...,eating disorder
3986,1f4b8hm,i feel guilty hi\n\nso ive been with my gf f...,OCD
19473,i93698,in the ed waiting room the whole time ive bee...,eating disorder
22258,q78n7o,is it possible that i have mental health probl...,OCD
23640,xe4mzq,my partner had photos of his ex we just had a ...,ptsd
984,15bndlu,does anyone else get scared that theyre going ...,ptsd


In [10]:
import sklearn
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_subset1, test_size=0.20, random_state=123)

In [11]:
def generate_prompt(data_point):
    return f"""
            Classify the text into none, depression, anxiety, bipolar, ptsd, schizophrenia, eating disorder, or OCD and return the answer as the corresponding mental health disorder label.
text: {data_point["statement"]}
label: {data_point["status"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text into none, depression, anxiety, bipolar, ptsd, schizophrenia, eating disorder, or OCD and return the answer as the corresponding mental health disorder label.
text: {data_point["statement"]}
label: """.strip()

In [12]:
X_train = train_df.copy()
X_test = test_df.copy()
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)

In [13]:
X_train

Unnamed: 0,ID,statement,status,text
17100,cuaz2e,remind yourself any ounce of hope is fake and ...,depression,"Classify the text into none, depression, anxie..."
20507,kkn7o6,do you think ocd is entirely negative other pe...,OCD,"Classify the text into none, depression, anxie..."
11455,1fu5mrk,rant i had a dream last night that i kept cras...,anxiety,"Classify the text into none, depression, anxie..."
9433,1fsrswg,scared to eat sometimes does anybody else ha...,OCD,"Classify the text into none, depression, anxie..."
12477,1fus0wm,i feel like im not enough how do i just become...,depression,"Classify the text into none, depression, anxie..."
...,...,...,...,...
15377,8jp8m6,are you a woman with an anxiety disorder who w...,anxiety,"Classify the text into none, depression, anxie..."
21602,nh98gf,to the people who say things like make sure yo...,ptsd,"Classify the text into none, depression, anxie..."
17730,e7wrda,who else is so lonely that you catch yourself ...,ptsd,"Classify the text into none, depression, anxie..."
15725,9qurbf,embarrassed myself by showing up to the salon ...,bipolar,"Classify the text into none, depression, anxie..."


In [14]:
y_true = X_test.loc[:,'status']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

In [15]:
X_train.status.value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
none,2808
OCD,2426
anxiety,2381
bipolar,2356
depression,2331
schizophrenia,2319
ptsd,2303
eating disorder,2276


In [16]:
y_true.value_counts()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
none,692
schizophrenia,623
bipolar,616
ptsd,608
depression,602
anxiety,575
OCD,548
eating disorder,536


In [17]:
train_data = Dataset.from_pandas(X_train[["text"]])
train_data['text'][3]

'Classify the text into none, depression, anxiety, bipolar, ptsd, schizophrenia, eating disorder, or OCD and return the answer as the corresponding mental health disorder label.\ntext: scared to  eat sometimes  does anybody else have scares of food contamination like someone putting something in your drink maggotsparasitic worms hiding in food even though you physically check even thinking unopened water bottles have something in there thats gonna kill me im just fully annoyed and upset that i cant eat my full meals that much anymore\nlabel: OCD'

In [41]:
midpoint = len(X_test) // 10
eval_data = Dataset.from_pandas(X_test[["text"]][:midpoint])

In [42]:
eval_data.shape

(480, 2)

In [19]:
import kagglehub

In [20]:
#!pip install transformers bitsandbytes



In [21]:
#pip install llama-stack

Collecting llama-stack
  Downloading llama_stack-0.0.58-py3-none-any.whl.metadata (12 kB)
Collecting blobfile (from llama-stack)
  Downloading blobfile-3.0.0-py3-none-any.whl.metadata (15 kB)
Collecting fire (from llama-stack)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting llama-models>=0.0.58 (from llama-stack)
  Downloading llama_models-0.0.58-py3-none-any.whl.metadata (8.2 kB)
Collecting llama-stack-client>=0.0.58 (from llama-stack)
  Downloading llama_stack_client-0.0.58-py3-none-any.whl.metadata (15 kB)
Collecting python-dotenv (from llama-stack)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting tiktoken (from llama-models>=0.0.58->llama-stack)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting pyaml (from l

In [22]:
!huggingface-cli login



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `alpaca` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `alpaca`


In [23]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

In [24]:
pip install -U bitsandbytes



In [25]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [26]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [27]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["none", "depression", "anxiety", "bipolar", "eating disorder", "OCD", "ptsd", "schizophrenia"]

    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        max_new_tokens=2,
                        temperature=0.1)

        result = pipe(prompt)
        answer = result[0]['generated_text'].split("label:")[-1].strip()

        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")

    return y_pred

In [28]:
X_test.shape

(4800, 1)

In [29]:
y_pred = predict(X_test[0:100], model, tokenizer)


100%|██████████| 100/100 [00:46<00:00,  2.14it/s]


In [30]:
def evaluate(y_true, y_pred):
    labels = ["none", "depression", "anxiety", "bipolar", "eating disorder", "OCD", "ptsd", "schizophrenia"]
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)

    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)


    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')


    unique_labels = set(y_true_mapped)

    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.3f}')


    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))))
    print('\nClassification Report:')
    print(class_report)

    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [31]:
evaluate(y_true[0:100], y_pred)

Accuracy: 0.660
Accuracy for label none: 0.909
Accuracy for label depression: 1.000
Accuracy for label anxiety: 1.000
Accuracy for label bipolar: 0.500
Accuracy for label eating disorder: 0.538
Accuracy for label OCD: 0.250
Accuracy for label ptsd: 0.364
Accuracy for label schizophrenia: 0.385

Classification Report:
                 precision    recall  f1-score   support

           none       0.57      0.91      0.70        22
     depression       0.54      1.00      0.70        13
        anxiety       0.59      1.00      0.74        10
        bipolar       0.83      0.50      0.62        10
eating disorder       1.00      0.54      0.70        13
            OCD       1.00      0.25      0.40         8
           ptsd       1.00      0.36      0.53        11
  schizophrenia       1.00      0.38      0.56        13

       accuracy                           0.66       100
      macro avg       0.82      0.62      0.62       100
   weighted avg       0.79      0.66      0.64      

In [32]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
modules

['down_proj', 'o_proj', 'q_proj', 'up_proj', 'k_proj', 'v_proj', 'gate_proj']

In [33]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [43]:
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0,
    r=32,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

In [44]:
training_arguments = TrainingArguments(
    output_dir= '/outputs/',
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_steps=1,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="cosine",
    report_to="wandb",
    eval_strategy="steps",
    eval_steps = 0.2
)

In [45]:
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=256,
    packing=False,
    dataset_kwargs={
    "add_special_tokens": False,
    "append_concat_token": False,
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/19200 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [None]:
trainer.train( )


  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
240,2.0265,2.140399
480,1.9866,2.125482
720,2.2378,2.10448


  return fn(*args, **kwargs)


In [None]:
output_dir= '/outputs/'
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
y_pred = predict(X_test[midpoint:], model, tokenizer)
evaluate(y_true[midpoint:], y_pred)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Get attention weights for a specific input
inputs = tokenizer("", return_tensors="pt")
outputs = model(**inputs, output_attentions=True)

# Extract attention weights (for all heads, pick one for visualization)
attention = outputs.attentions[-1]  # Last layer attention
head_attention = attention[0][0]  # First example, first head

# Visualize
plt.figure(figsize=(10, 8))
sns.heatmap(head_attention.detach().cpu().numpy(), cmap="coolwarm", xticklabels=inputs.tokens, yticklabels=inputs.tokens)
plt.title("Attention Weights")
plt.show()