In [None]:
#!rm -rf ./llama3.2-1b-finetuned
#!rm -rf ./lora_weights
#!pip uninstall -y torch torchvision torchaudio transformers datasets peft huggingface_hub tf-keras
#!pip uninstall -y pandas scikit-learn openpyxl tqdm ipywidgets
#!pip cache purge
#!df -h

In [None]:
'''!pip install datasets
!pip install transformers
!pip install tf-keras
!pip install peft
!pip install openpyxl
!pip install torch
!pip install pandas
!pip install huggingface_hub
!pip install scikit-learn
!pip install ipywidgets
!pip install tqdm'''

'!pip install datasets\n!pip install transformers\n!pip install tf-keras\n!pip install peft\n!pip install openpyxl\n!pip install torch\n!pip install pandas\n!pip install huggingface_hub\n!pip install scikit-learn\n!pip install ipywidgets\n!pip install tqdm'

In [None]:
import torch
import pandas as pd
from datasets import Dataset
from huggingface_hub import login

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

E0000 00:00:1754345018.633766    1059 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754345018.639147    1059 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754345018.652716    1059 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754345018.652735    1059 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754345018.652737    1059 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1754345018.652739    1059 computation_placer.cc:177] computation placer already registered. Please check linka

# Processing Data

In [None]:
def processing_data(file_path):

  if not isinstance(file_path, str):

    raise ValueError("Not a string.")

  # Reading Excel File
  df_heart_dis = pd.read_excel(file_path)

  # Removing Duplicate Rows
  df_heart_dis = df_heart_dis.drop_duplicates()

  # Removing Empty Rows
  df_heart_dis = df_heart_dis.dropna()

  # Removing Uncessary columns
  columns_to_drop = ["height", "weight", "age", "bp_category_encoded", "id"]

  df_heart_dis.drop(columns = columns_to_drop, inplace = True)

  # Converting 2 = "Male" & 1 = "Female"
  df_heart_dis["gender"] = df_heart_dis["gender"].apply(lambda x: "Male" if x == 2 else "Female")

  # Converting 1 = "Yes" & 0 = "No"
  df_heart_dis["cardio"] = df_heart_dis["cardio"].apply(lambda x: "Yes" if x == 1 else "No")

  return df_heart_dis

In [None]:
df_heart_dis = processing_data("cardiovascular_dataset.xlsx")

print(df_heart_dis.head(5))

   gender  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  active cardio  \
0    Male    110     80            1     1      0     0       1     No   
1  Female    140     90            3     1      0     0       1    Yes   
2  Female    130     70            3     1      0     0       0    Yes   
3    Male    150    100            1     1      0     0       1    Yes   
4  Female    100     60            1     1      0     0       0     No   

   age_years        bmi           bp_category  
0         50  21.967120  Hypertension Stage 1  
1         55  34.927679  Hypertension Stage 2  
2         51  23.507805  Hypertension Stage 1  
3         48  28.710479  Hypertension Stage 2  
4         47  23.011177                Normal  


# Converting Data into a Prompt

In [None]:
def data_to_prompt (df_heart):

    prompt = (
        f'The patient is a {df_heart["age_years"]} years old {df_heart["gender"]}. '
        f'BMI is {round(df_heart["bmi"], 2)}. '
        f'The patient {"does" if df_heart["smoke"] == 1 else "does not"} smoke, '
        f'{"does" if df_heart["alco"] == 1 else "does not"} drink alcohol, and '
        f'is physically {"active" if df_heart["active"] == 1 else "not active"}. '
        f'Blood pressure is {df_heart["ap_hi"]} / {df_heart["ap_lo"]}. '
        f'Blood pressure category: {df_heart["bp_category"]}. '
    )

    # Cholesterol
    if df_heart["cholesterol"] == 1:

      prompt += "Cholesterol is normal. "

    elif df_heart["cholesterol"] == 2:

      prompt += "Cholesterol is above normal. "

    else:

      prompt += "Cholesterol is well above normal. "

    # Glucose
    if df_heart["gluc"] == 1:

      prompt += "Glucose is normal. "

    elif df_heart["gluc"] == 2:

      prompt += "Glucose is above normal. "

    else:

      prompt += "Glucose is well above normal.\n"

    prompt += "Does the patient have heart disease? Answer:"

    return prompt

In [None]:
df_heart_dis["prompt"] = df_heart_dis.apply(data_to_prompt, axis = 1) # just prompt
df_heart_dis["prompt_ans"] = df_heart_dis["prompt"] + " " + df_heart_dis["cardio"] # prompt + answer

print(df_heart_dis["prompt"][0] + "\n")
print(df_heart_dis["prompt_ans"][0] + "\n")

The patient is a 50 years old Male. BMI is 21.97. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 110 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer:

The patient is a 50 years old Male. BMI is 21.97. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 110 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer: No



# Training, Validation & Testing

In [None]:
# Preparing dataset
dataset = Dataset.from_pandas(df_heart_dis[["prompt", "prompt_ans", "cardio"]])

print(f'Total: {len(dataset)} examples.')

# Splitting Data
dataset = dataset.train_test_split(test_size = 0.2, seed = 42)
train_val_dataset = dataset["train"]
test_dataset = dataset["test"] # 20% testing

test_dataset = test_dataset.remove_columns(["prompt_ans"])# prompt and label needed only

train_val_dataset = train_val_dataset.train_test_split(test_size = 0.125, seed = 42)
train_dataset = train_val_dataset["train"] # 70% training
val_dataset = train_val_dataset["test"] # 10% validation

print(f'Training: {len(train_dataset)} examples.')
print(f'Validation: {len(val_dataset)} examples.')
print(f'Testing: {len(test_dataset)} examples.\n')

print(train_dataset[0], "\n")
print(val_dataset[0], "\n")
print(test_dataset[0])

Total: 68205 examples.
Training: 47743 examples.
Validation: 6821 examples.
Testing: 13641 examples.

{'prompt': 'The patient is a 55 years old Female. BMI is 28.57. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 130 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer:', 'prompt_ans': 'The patient is a 55 years old Female. BMI is 28.57. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 130 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer: No', 'cardio': 'No'} 

{'prompt': 'The patient is a 45 years old Female. BMI is 43.21. The patient does smoke, does drink alcohol, and is physically active. Blood pressure is 170 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is well above normal. Glucose is normal. 

# Loading Llama

In [None]:
# Access Token from Hugging Face
login(token = "use_your_own")

In [None]:
# Setting device
device = "cuda"

# Setting model
model_id = "meta-llama/Llama-3.2-1B"

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16).to(device)

# Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token

# Tokenization

In [None]:
def tokenizing_data(examples):

    input_ids_batch = []
    attention_mask_batch = []
    label_ids_batch = []

    #print(len(examples["prompt"]))

    for prompt, prompt_ans in zip(examples["prompt"], examples["prompt_ans"]):

        tokenized_prompt = tokenizer(prompt, truncation = True, max_length = 100, padding = "max_length",
                                     return_attention_mask = True)

        tokenized_prompt_ans = tokenizer(prompt_ans, truncation = True, max_length = 100, padding = "max_length",
                                         return_attention_mask = True)

        # Find label start index (length of prompt input ids without padding)
        label_start_idx = 0

        for input_id in tokenized_prompt["input_ids"]:

            if input_id != tokenizer.pad_token_id:

                label_start_idx += 1

        # list of n -100
        label_ids = [-100] * len(tokenized_prompt["input_ids"])

        # label ("Yes" or "No") token ids
        label_ids[label_start_idx:] = tokenized_prompt_ans["input_ids"][label_start_idx:]

        input_ids_batch.append(tokenized_prompt_ans["input_ids"])
        attention_mask_batch.append(tokenized_prompt_ans["attention_mask"])
        label_ids_batch.append(label_ids)

    return {"input_ids": input_ids_batch, "attention_mask": attention_mask_batch, "label_ids": label_ids_batch,}

In [None]:
# Tokenizing dataset
batch_size = 32
trainset_tokenized = train_dataset.map(tokenizing_data, batched = True, batch_size = batch_size)
valset_tokenized = val_dataset.map(tokenizing_data, batched = True, batch_size = batch_size)

print(trainset_tokenized[0])

Map:   0%|          | 0/47743 [00:00<?, ? examples/s]

Map:   0%|          | 0/6821 [00:00<?, ? examples/s]

{'prompt': 'The patient is a 55 years old Female. BMI is 28.57. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 130 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer:', 'prompt_ans': 'The patient is a 55 years old Female. BMI is 28.57. The patient does not smoke, does not drink alcohol, and is physically active. Blood pressure is 130 / 80. Blood pressure category: Hypertension Stage 1. Cholesterol is normal. Glucose is normal. Does the patient have heart disease? Answer: No', 'cardio': 'No', 'input_ids': [128000, 791, 8893, 374, 264, 220, 2131, 1667, 2362, 29738, 13, 47224, 374, 220, 1591, 13, 3226, 13, 578, 8893, 1587, 539, 16603, 11, 1587, 539, 7172, 13200, 11, 323, 374, 22655, 4642, 13, 20671, 7410, 374, 220, 5894, 611, 220, 1490, 13, 20671, 7410, 5699, 25, 39515, 531, 2711, 22891, 220, 16, 13, 921, 35244, 374, 4725, 13, 8444, 94697, 374, 4725, 13,

# Training Llama

In [None]:
peft_config = LoraConfig(
    r = 16, # number of weights per input and output neurons
    lora_alpha = 32, # scaling factor for LoRa update
    target_modules = ["q_proj", "v_proj"], # modules to apply LoRa
    task_type = TaskType.CAUSAL_LM, # specific task
    lora_dropout = 0.1, # dropout rate applied to LoRa weight matrices
    bias = "none" # no biases are trained
)

model = get_peft_model(model, peft_config) # adds LoRa weight matrixes to the original model
model.print_trainable_parameters() # prints number of parameters in the model are trainable vs total parameters

trainable params: 1,703,936 || all params: 1,237,518,336 || trainable%: 0.1377


In [None]:
training_args = TrainingArguments(
    output_dir = "./llama3.2-1b-finetuned",
    per_device_train_batch_size = 8, # number of examples in one batch processed at a time
    gradient_accumulation_steps = 4, # backward pass after n batches
    num_train_epochs = 10,
    learning_rate = 2e-4,
    bf16 = True,
    logging_strategy = "epoch",
    save_strategy = "epoch",
    eval_strategy = "epoch",
    report_to = "none",
    label_names = ["label_ids"],
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer = tokenizer,
    mlm = False,
)

trainer = Trainer(
    model = model, # model
    args = training_args, # training params
    train_dataset = trainset_tokenized,
    eval_dataset = valset_tokenized,
    data_collator = data_collator,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2532,0.228147
2,0.2232,0.226157
3,0.2212,0.224255
4,0.2201,0.223
5,0.2192,0.222388
6,0.2185,0.22209
7,0.2179,0.221586
8,0.2173,0.221211
9,0.2167,0.220818
10,0.2161,0.220643


TrainOutput(global_step=14920, training_loss=0.22234243234424744, metrics={'train_runtime': 4127.323, 'train_samples_per_second': 115.675, 'train_steps_per_second': 3.615, 'total_flos': 2.79253595049984e+17, 'train_loss': 0.22234243234424744, 'epoch': 10.0})

In [None]:
# Saving LoRa adapter weights into a file
model.save_pretrained("lora_weights")

# Evaluating Llama

In [None]:
# Loading Model with trained LoRa adapter weights
'''
# Setting device
device = "cuda"

# Setting model
model_id = "meta-llama/Llama-3.2-1B"

base_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16).to(device)

trained_model = PeftModel.from_pretrained(base_model, "lora_weights")

# Setting tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True)
tokenizer.pad_token = tokenizer.eos_token'''

'\n# Setting device\ndevice = "cuda"\n\n# Setting model\nmodel_id = "meta-llama/Llama-3.2-1B"\n\nbase_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16).to(device)\n\ntrained_model = PeftModel.from_pretrained(base_model, "lora_weights")\n\n# Setting tokenizer\ntokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code = True)\ntokenizer.pad_token = tokenizer.eos_token'

In [None]:
def compute_metrics(true_labels, pred_labels):

  tp, fp = 0, 0
  tn, fn = 0, 0

  for true, pred in zip(true_labels, pred_labels):

    if true == "Yes" and pred == "Yes":

      tp += 1

    elif true == "No" and pred == "No":

      tn += 1

    elif true == "No" and pred == "Yes":

      fp += 1

    else:

      fn += 1

  accuracy = (tp + tn) / (tp + tn + fp + fn)
  precision = tp / (tp + fp) if (tp + fp) > 0 else 0
  recall = tp / (tp + fn) if (tp + fn) > 0 else 0
  f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

  return accuracy, precision, recall, f1_score

In [None]:
def eval(model, tokenizer, dataset):

    model.eval()

    # To avoid warnings during inference
    model.generation_config.temperature = None
    model.generation_config.top_p = None

    preds = []
    true_labels = []

    for example in dataset:

        true_label = example["cardio"]

        prompt = example["prompt"]

        # Tokenize the prompt
        inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)

        with torch.no_grad():

            outputs = model.generate(
                input_ids = inputs["input_ids"],
                attention_mask = inputs["attention_mask"],
                max_new_tokens = 2, # generate only "Yes" or "No"
                do_sample = False, # Greedy decoding (most likely next token)
                pad_token_id = tokenizer.eos_token_id,
            )

        # Decode and extract the answer
        decoded = tokenizer.decode(outputs[0], skip_special_tokens = True)
        pred = decoded.replace(prompt, "").strip().rstrip('.') # get first word after prompt

        #print(decoded)
        #print(pred)

        preds.append(pred)
        true_labels.append(true_label)

    # Compute metrics
    accuracy, precision, recall, f1_score = compute_metrics(true_labels, preds)

    print("Accuracy:{}".format(accuracy * 100))
    print("Precision:{}".format(precision))
    print("Recall:{}".format(recall))
    print("F1-Score:{}".format(f1_score))

In [None]:
print("------Training Dataset------")
eval(model, tokenizer, train_dataset)
print("\n------Validation Dataset------")
eval(model, tokenizer, val_dataset)
print("\n------Testing Dataset------")
eval(model, tokenizer, test_dataset)

------Training Dataset------
Accuracy:73.7699767505184
Precision:0.771652761223679
Recall:0.662234155905042
F1-Score:0.7127686414826028

------Validation Dataset------
Accuracy:73.12710746224894
Precision:0.7663299663299663
Recall:0.6664714494875549
F1-Score:0.7129209083790131

------Testing Dataset------
Accuracy:73.22776922513012
Precision:0.7737887765772046
Recall:0.6535178098322049
F1-Score:0.7085860197893392
