# Installation and Load packages

In [1]:
!pip install datasets peft -qq
!pip install accelerate -qq
!pip install bitsandbytes -qq

In [None]:
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade --pre transformers accelerate --extra-index-url https://download.pytorch.org/whl/cu118
!pip install bitsandbytes==0.43.2 --prefer-binary --extra-index-url https://pypi.org/simple


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.2.0
  Downloading https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl (811.7 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m811.6/811.7 MB[0m [31m185.2 MB/s[0m eta [36m0:00:01[0m00:01[0m

In [None]:
!pip install wandb scikit-learn

## GPU - details

In [None]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected.")


# Load libraries, Login HuggingFace API & WandB API

- **HuggingFace API:** To get access of Model Llama-3 (8 Billion)
- **WandB (Weigths & Biases):** To supervise perform of model and hyperparameter Tuning

In [None]:
# from google.colab import userdata
from huggingface_hub import login

login(token="YOUR_HF_API_KEY")
# Access Key for llama Model (HuggingFace)

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    BitsAndBytesConfig)

from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from bitsandbytes.optim import AdamW8bit


In [None]:
# for hyperparameter tuning report
import wandb


wandb.login()
# WANDB KEY

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myashnayi00[0m ([33myashnayi00-university-of-new-haven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Llama-3.1-8B model 

In [7]:
# model_name = "meta-llama/Llama-3.1-8B-Instruct"
model_name = "meta-llama/Llama-3.1-8B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
    )


tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# model.config.pretraining_tp = 1
# model.config.use_cache = False


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [8]:
print(f"meta-llama/Llama-3-8B:\n{base_model}")

meta-llama/Llama-3-8B:
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [9]:
print(f"{base_model.config}")

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes

### Trainable parameters - Model

In [10]:
def trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"- Trainable model parameters: {trainable_params}.\n- All model parameters: {all_param}.\n- Percentage of trainable model parameters: {100 * trainable_params / all_param:.2f}%"

print(trainable_parameters(base_model))

- Trainable model parameters: 1050939392.
- All model parameters: 4540600320.
- Percentage of trainable model parameters: 23.15%


### Assign datasetPH.json

Data is split in to train and test.
- Train size: 80%
- Test size: 20%

In [11]:
import json
with open("./dataset/datasetPH.json", "r") as f:
    data = json.load(f)

if isinstance(data, dict):
    print("Data is a dictionary. Converting values to a list for splitting.")
    data = list(data.values())

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

with open("./dataset/train_datasetPH.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open("./dataset/test_datasetPH.json", "w") as f:
    json.dump(test_data, f, indent=2)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")


Data is a dictionary. Converting values to a list for splitting.
Train size: 160
Test size: 41


In [12]:
data[0]

{'paper_id': 'ED012836',
 'title': 'Adult Basic Education Work Book in Basic Arithmetic. Parts I and II.',
 'author': 'Graham, Minnie M.',
 'publication_year': 1966,
 'source': 'Danbury Public Schools, Connecticut',
 'doi_or_url': '',
 'topic_category': 'Adult Education / Arithmetic Instruction',
 'document_type': 'Workbook',
 'abstract': 'These workbooks provide teaching materials and drill exercises in multiplication for adult basic education learners in Danbury, Connecticut. Part I covers multiplication by numbers two through nine, while Part II expands to ten through twelve, including dollars and cents, and offers speed and accuracy drills.',
 'key_findings': 'Instructional workbooks tailored for adult learners can assist in foundational arithmetic, especially multiplication, through structured drills and exercises.',
 'problem_statement': 'Adult learners require appropriately designed arithmetic materials to support basic educational needs at elementary levels.',
 'objectives': 'T

### Tokenization of dataset and Normalize 

In [17]:
# def tokenize_function(examples):
#     texts = []
#     for i in range(len(examples["title"])):
#         entry_parts = []

#         for key in examples.keys():
#             value = examples[key][i]
#             if isinstance(value, dict):
#                 for subkey, subval in value.items():
#                     entry_parts.append(f"{key}.{subkey}: {subval}")
#             elif isinstance(value, list):
#                 entry_parts.append(f"{key}: {', '.join(map(str, value))}")
#             else:
#                 entry_parts.append(f"{key}: {value}")

#         combined_text = "\n".join(entry_parts)
#         texts.append(combined_text)

#     return tokenizer(texts, truncation=True, padding="max_length", max_length=256)

def tokenize_function(examples):
    prompts = []
    for i in range(len(examples["title"])):
        persona = "You are a public policy analyst specializing in educational reform.\n"
        instruction = "Summarize the key findings from the report below. Your output should include:\n- Three bullet points summarizing the findings\n- One paragraph about implications\n- A JSON tag with `impact` set to positive, negative, or neutral\n"
        context = "This report evaluates a new adult education intervention implemented in Connecticut.\n"
        format_guide = "Use professional and concise tone. Output must be structured: bullet points, paragraph, then JSON.\n"
        few_shot = "Example Input: \"The policy resulted in 70% improvement in adult math scores and lowered dropout rates.\"\nExample Output:\n- Improved math proficiency by 70%\n- Reduced dropout rates significantly\n- High engagement among learners\nImplication: These results show the program is effective and could be scaled to other regions.\n{\"impact\": \"positive\"}\n"

        title = examples["title"][i] or ""
        abstract = examples["abstract"][i] or ""
        findings = examples["key_findings"][i] or ""
        full_text = f"Title: {title}\nAbstract: {abstract}\nFindings: {findings}"

        full_prompt = persona + instruction + context + format_guide + few_shot + "Now analyze this:\n" + full_text
        prompts.append(full_prompt)

    return tokenizer(prompts, truncation=True, padding="max_length", max_length=512)


In [18]:
def normalize_entry(entry):
    normalized = {}
    for key, value in entry.items():
        if isinstance(value, dict):
            for subkey, subval in value.items():
                normalized[f"{key}.{subkey}"] = str(subval) if subval is not None else ""
        elif isinstance(value, list):
            normalized[key] = ", ".join(map(str, value))
        elif value is None:
            normalized[key] = ""
        else:
            normalized[key] = str(value)
    return normalized

# Normalize each entry
train_data_clean = [normalize_entry(entry) for entry in train_data]
test_data_clean = [normalize_entry(entry) for entry in test_data]


In [19]:
train_dataset_hf = Dataset.from_list(train_data_clean)
test_dataset_hf = Dataset.from_list(test_data_clean)

### Train & Test - Tokenization 

In [20]:
tokenized_train = train_dataset_hf.map(tokenize_function, batched=True)
tokenized_train.set_format(type="torch")
print("Tokenization complete with all features.")

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Tokenization complete with all features.


In [21]:
tokenized_test = test_dataset_hf.map(tokenize_function, batched=True)
tokenized_test.set_format(type="torch")
print("Tokenization complete with all features.")

Map:   0%|          | 0/41 [00:00<?, ? examples/s]

Tokenization complete with all features.


# Configer - PEFT, LoRA & QLoRA

In [22]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [23]:
base_model.gradient_checkpointing_enable()

base_model = prepare_model_for_kbit_training(base_model)
peft_model = get_peft_model(base_model, lora_config)

peft_model.config.use_cache = False

print("After PEFT wrapping:")
print(trainable_parameters(peft_model))

After PEFT wrapping:
- Trainable model parameters: 3407872.
- All model parameters: 4544008192.
- Percentage of trainable model parameters: 0.07%


In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(-1)
    accuracy = (preds == labels).astype(float).mean().item()
    return {"accuracy": accuracy}

# Train PH-Llama-3.0 Model & Evaluation 

In [25]:
import torch
import os
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

training_args = TrainingArguments(
    output_dir="./PH-Llama-3.0",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1, 
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=50,
    save_total_limit=2,
    fp16=True,
    report_to="wandb"
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
#     compute_metrics=compute_metrics,
    optimizers=(AdamW8bit(peft_model.parameters(), lr=2e-5), None)
)

torch.cuda.empty_cache() # Force Clear Cache Before Training

print("Starting training...")
trainer.train()
print("Training complete.")


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Step,Training Loss,Validation Loss
50,2.0882,1.997144
100,1.1569,1.110544
150,0.931,0.912048
200,0.8927,0.840778
250,0.8199,0.79951
300,0.8451,0.788504
350,0.799,0.782161
400,0.8677,0.776603
450,0.7634,0.773298
500,0.7546,0.769548


Training complete.


In [26]:
eval_results = trainer.evaluate()
print("Evaluation Results:")
print(eval_results)

Evaluation Results:
{'eval_loss': 0.7612014412879944, 'eval_runtime': 39.1578, 'eval_samples_per_second': 1.047, 'eval_steps_per_second': 1.047, 'epoch': 5.0}


# Generate Text by Trained Model

In [27]:
def generate_text(prompt, max_length=100, temperature=0.7, top_p=0.95):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(peft_model.device) for key, value in inputs.items()}
    
    outputs = peft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

prompt = "Using the dataset from the Peterson-KFF Health System Tracker on U.S. healthcare quality, provide a comprehensive analysis comparing the United States to other high-income countries. In your response, summarize key metrics such as life expectancy, all-cause mortality, maternal mortality, and rates of premature death. Discuss the impact of socioeconomic factors and healthcare utilization on these outcomes, and explain why the U.S. may perform worse on several indicators despite high per capita spending."
print(generate_text(prompt, max_length=512))

Using the dataset from the Peterson-KFF Health System Tracker on U.S. healthcare quality, provide a comprehensive analysis comparing the United States to other high-income countries. In your response, summarize key metrics such as life expectancy, all-cause mortality, maternal mortality, and rates of premature death. Discuss the impact of socioeconomic factors and healthcare utilization on these outcomes, and explain why the U.S. may perform worse on several indicators despite high per capita spending. Your output should include a structured summary, highlighting the key findings and implications for healthcare reform.
Use the following Python libraries:
pandas: For data manipulation and analysis
matplotlib: For data visualization
seaborn: For enhanced visualization capabilities
numpy: For numerical computations
Note: This is an exploratory analysis; do not focus on modeling or optimization.
Output: A structured summary with key findings, implications, and visualizations (e.g., tables,

In [28]:
def generate_text(prompt, max_length=100, temperature=0.7, top_p=0.95):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(peft_model.device) for key, value in inputs.items()}
    
    outputs = peft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Test the generation function:
# sample_prompt = "Explain the impact of fine-tuning on the performance of language models. Does fine-tuning improve the performance of a language model? Explain."
prompt = "What are the key drivers of racial disparities in health outcomes in the United States? Your response should include statistical comparisons (e.g., maternal mortality, premature death, DALY rates), the influence of socioeconomic and systemic factors, and any racial-specific insights. Please also include relevant article context and direct links to the sources or referenced reports where applicable."
print(generate_text(prompt, max_length=1012))

What are the key drivers of racial disparities in health outcomes in the United States? Your response should include statistical comparisons (e.g., maternal mortality, premature death, DALY rates), the influence of socioeconomic and systemic factors, and any racial-specific insights. Please also include relevant article context and direct links to the sources or referenced reports where applicable. Your response should be concise, focused, and structured to facilitate easy comprehension.
What are the key drivers of racial disparities in health outcomes in the United States? Your response should include statistical comparisons (e.g., maternal mortality, premature death, DALY rates), the influence of socioeconomic and systemic factors, and any racial-specific insights. Please also include relevant article context and direct links to the sources or referenced reports where applicable. Your response should be concise, focused, and structured to facilitate easy comprehension.
This question 

In [29]:
# Save your fine-tuned model to a local directory
model_save_path = "./PH-Llama-3.0"
trainer.save_model(model_save_path)

tokenizer.save_pretrained(model_save_path)

('./PH-Llama-3.0/tokenizer_config.json',
 './PH-Llama-3.0/special_tokens_map.json',
 './PH-Llama-3.0/tokenizer.json')

In [30]:
torch.save(peft_model.state_dict(), "./PH-Llama-3.0/PH-Llama-3.1.pth")

In [31]:
from huggingface_hub import HfApi, HfFolder, Repository

from huggingface_hub import login
login(token="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")

trainer.push_to_hub("iyashnayi/PH-Llama-3.0")

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

PH-Llama-3.1.pth:   0%|          | 0.00/8.14G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iyashnayi/PH-Llama-3.0/commit/dc7ddede590e7e13c901f89e1703c06b6d11aadf', commit_message='iyashnayi/PH-Llama-3.0', commit_description='', oid='dc7ddede590e7e13c901f89e1703c06b6d11aadf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iyashnayi/PH-Llama-3.0', endpoint='https://huggingface.co', repo_type='model', repo_id='iyashnayi/PH-Llama-3.0'), pr_revision=None, pr_num=None)