# Installation and Load packages

In [1]:
!pip install datasets peft -qq
!pip install accelerate -qq
!pip install bitsandbytes -qq
!pip install trl -qq

In [2]:
!pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade --pre transformers accelerate --extra-index-url https://download.pytorch.org/whl/cu118
!pip install bitsandbytes==0.43.2 --prefer-binary --extra-index-url https://pypi.org/simple


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch==2.2.0
  Downloading https://download.pytorch.org/whl/cu118/torch-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl (811.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m811.7/811.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision==0.17.0
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.17.0%2Bcu118-cp310-cp310-linux_x86_64.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25hCollecting torchaudio==2.2.0
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.2.0%2Bcu118-cp310-cp310-linux_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m:00:01[0m
[?25

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.org/simple
Collecting bitsandbytes==0.43.2
  Downloading bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.2


In [3]:
!pip install wandb scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.19.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting setproctitle
  Downloading setproctitle-1.3.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitpython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.44-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.6/207.6 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting docker-

## GPU - details

In [4]:
import torch

print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected.")

Torch version: 2.2.0+cu118
CUDA available: True
Device name: Tesla T4


# Load libraries, Login HuggingFace API & WandB API

- **HuggingFace API:** To get access of Model Llama-3 (8 Billion)
- **WandB (Weigths & Biases):** To supervise perform of model and hyperparameter Tuning

In [5]:
# from google.colab import userdata
from huggingface_hub import login

login(token="YOUR_HF_API_KEY")

# Access Key for llama Model (HuggingFace)

from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    BitsAndBytesConfig,
    HfArgumentParser,
    pipeline,
    logging,
    EarlyStoppingCallback
)

from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from bitsandbytes.optim import AdamW8bit
import os, torch, wandb
from trl import SFTTrainer, setup_chat_format

# WandB - For plot Training

In [6]:
# for hyperparameter tuning report
wandb.login()
# YOUR_WANDB_API_KEY

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/student/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myashnayi00[0m ([33myashnayi00-university-of-new-haven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Load Llama-3.2-3B model 

In [7]:
model_name = "meta-llama/Llama-3.2-3B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False
)


tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
     attn_implementation="eager"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model.config.pretraining_tp = 1
base_model.config.use_cache = False


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [9]:
print(f"meta-llama/Llama-3.2-3B:\n\n{base_model}")

meta-llama/Llama-3.2-3B:

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )
    )
    (n

In [10]:
print(f"{base_model.config}")

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"

### Trainable parameters - Model

In [11]:
def trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    return f"- Trainable model parameters: {trainable_params}.\n- All model parameters: {all_param}.\n- Percentage of trainable model parameters: {100 * trainable_params / all_param:.2f}%"

print(trainable_parameters(base_model))

- Trainable model parameters: 394177536.
- All model parameters: 1803463680.
- Percentage of trainable model parameters: 21.86%


### Assign datasetPH.json

Data is split in to train and test.
- Train size: 80%
- Test size: 20%

In [12]:
import json
with open("./dataset/rp_dataset.json", "r") as f:
    data = json.load(f)

if isinstance(data, dict):
    print("Data is a dictionary. Converting values to a list for splitting.")
    data = list(data.values())

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

with open("./dataset/trainset/rp_train_datasetPH.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open("./dataset/testset/rp_test_datasetPH.json", "w") as f:
    json.dump(test_data, f, indent=2)

print(f"Train size: {len(train_data)}")
print(f"Test size: {len(test_data)}")


Train size: 352
Test size: 88


In [13]:
data[0]

{'paper_id': 'RP1070',
 'title': 'Emergency Preparedness and Its Influence on Public Health Policy in the U.S.',
 'author': 'Author E. F.',
 'publication_year': 2022,
 'source': 'CDC Reports',
 'doi_or_url': 'https://example-research.org/article/1070',
 'topic_category': 'Emergency Preparedness',
 'document_type': 'Peer-reviewed Article',
 'summary': 'This research investigates the relationship between emergency preparedness and public health outcomes in the U.S. The study draws on data from national health surveys and government statistics to analyze patterns of impact. Findings indicate that fluctuations in emergency preparedness are statistically correlated with shifts in health outcomes such as mortality, access to preventive care, and disease burden.',
 'statistical_analysis': {'methods_used': 'Multivariate regression, ANOVA',
  'key_variables': ['emergency preparedness',
   'mortality rate',
   'hospital admission rate'],
  'sample_size': 48435,
  'data_years': '2019–2022',
  'fi

### Tokenization of dataset and normalization 

In [14]:
# def tokenize_function(examples):
#     texts = []
#     for i in range(len(examples["title"])):
#         entry_parts = []

#         for key in examples.keys():
#             value = examples[key][i]
#             if isinstance(value, dict):
#                 for subkey, subval in value.items():
#                     entry_parts.append(f"{key}.{subkey}: {subval}")
#             elif isinstance(value, list):
#                 entry_parts.append(f"{key}: {', '.join(map(str, value))}")
#             else:
#                 entry_parts.append(f"{key}: {value}")

#         combined_text = "\n".join(entry_parts)
#         texts.append(combined_text)

#     return tokenizer(texts, truncation=True, padding="max_length", max_length=256)


def tokenize_function(examples):
    prompts = []
    for i in range(len(examples["title"])):
        entry = {key: examples[key][i] for key in examples}
        full_prompt = build_prompt(entry)
        prompts.append(full_prompt)

    return tokenizer(prompts, truncation=True, padding="max_length", max_length=512)

In [15]:
def normalize_entry(entry):
    normalized = {}
    for key, value in entry.items():
        if isinstance(value, dict):
            for subkey, subval in value.items():
                normalized[f"{key}.{subkey}"] = str(subval) if subval is not None else ""
        elif isinstance(value, list):
            normalized[key] = ", ".join(map(str, value))
        elif value is None:
            normalized[key] = ""
        else:
            normalized[key] = str(value)
    return normalized

# Normalize each entry
train_data_clean = [normalize_entry(entry) for entry in train_data]
test_data_clean = [normalize_entry(entry) for entry in test_data]


In [16]:
train_dataset_hf = Dataset.from_list(train_data_clean)
test_dataset_hf = Dataset.from_list(test_data_clean)

## Prompt Engineering

In [17]:
def build_prompt(entry):
    # Define the analyst's persona with added expertise details
    persona = (
        "You are an expert public policy analyst specializing in educational reform and adult education. "
        "Your expertise includes evaluating instructional materials and their impact on adult learning.\n"
    )
    
    # Provide clear and detailed instructions including expected structure and additional considerations
    instruction = (
        "Your task is to analyze the report provided below and summarize its key findings. "
        "Your output must include:\n"
        "- Three concise bullet points summarizing the findings\n"
        "- One well-structured paragraph discussing the implications, including any potential policy recommendations or risks\n"
        "- A JSON object tagged with `impact` (possible values: positive, negative, or neutral) based on the report’s overall impact\n"
    )
    
    # Add a metadata section with relevant background details
    metadata = (
        f"Metadata:\n"
        f"Paper ID: {entry.get('paper_id', '')}\n"
        f"Title: {entry.get('title', '')}\n"
        f"Author: {entry.get('author', '')}\n"
        f"Publication Year: {entry.get('publication_year', '')}\n"
        f"Source: {entry.get('source', '')}\n"
        f"Document Type: {entry.get('document_type', '')}\n"
        f"Topic Category: {entry.get('topic_category', '')}\n\n"
    )
    
    # Provide contextual background using details from the entry and emphasizing audience and local context
    context = (
        f"This report evaluates an adult education intervention designed to improve arithmetic skills through instructional workbooks. "
        f"The intervention was implemented in {entry.get('thematic_dimensions', {}).get('geographic_scope', 'a specific region')} and primarily targets {entry.get('thematic_dimensions', {}).get('demographic_focus', 'adult learners')}.\n"
    )
    
    format_guide = (
        "Use a professional and analytical tone with clarity and conciseness. "
        "Structure your response with bullet points, followed by a paragraph, and then a JSON object.\n"
    )
    
    few_shot = (
        "Example Input: \"The policy resulted in 70% improvement in adult math scores and significantly lowered dropout rates.\"\n"
        "Example Output:\n"
        "- Improved math proficiency by 70%\n"
        "- Significantly reduced dropout rates\n"
        "- Increased learner engagement\n"
        "Implication: The results indicate that the program is effective and scalable, suggesting positive future impacts on adult education.\n"
        "{\"impact\": \"positive\"}\n"
    )
    
    # Construct the body of the report by concisely combining key parts of the report
    full_text = (
        f"Abstract: {entry.get('abstract', '')}\n"
        f"Key Findings: {entry.get('key_findings', '')}\n"
        f"Problem Statement: {entry.get('problem_statement', '')}\n"
        f"Objectives: {entry.get('objectives', '')}\n"
        f"Conclusion: {entry.get('conclusion', '')}\n"
        f"Methodology: {entry.get('methodology', {}).get('methods_used', '')}, based on data from {entry.get('methodology', {}).get('data_sources', '')}, conducted over {entry.get('methodology', {}).get('duration', '')}\n"
        f"Implications: {entry.get('policy_practice_implications', {}).get('recommendations', '')} {entry.get('policy_practice_implications', {}).get('implementation_notes', '')}\n"
        f"Thematic Focus: {entry.get('thematic_dimensions', {}).get('demographic_focus', '')} | {entry.get('topic_category', '')}\n"
        f"Limitations: {entry.get('comparative_and_qualitative_insights', {}).get('limitations', '')}\n"
        f"Future Work: {entry.get('comparative_and_qualitative_insights', {}).get('future_work', '')}\n"
    )
    
    return persona + instruction + metadata + context + format_guide + few_shot + "Now analyze this report:\n" + full_text


### Train & Test - Tokenization 

In [18]:
tokenized_train = train_dataset_hf.map(tokenize_function, batched=True)
tokenized_train.set_format(type="torch")
print("Tokenization complete with all features.")

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

Tokenization complete with all features.


In [19]:
tokenized_test = test_dataset_hf.map(tokenize_function, batched=True)
tokenized_test.set_format(type="torch")
print("Tokenization complete with all features.")

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Tokenization complete with all features.


# Configer - PEFT, LoRA & QLoRA

In [20]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
#     target_modules=["q_proj", "v_proj"],
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [21]:
base_model.gradient_checkpointing_enable()
base_model = prepare_model_for_kbit_training(base_model)

peft_model = get_peft_model(base_model, lora_config)
peft_model.config.use_cache = False

print("After PEFT wrapping:")
print(trainable_parameters(peft_model))

After PEFT wrapping:
- Trainable model parameters: 9175040.
- All model parameters: 1812638720.
- Percentage of trainable model parameters: 0.51%


In [22]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['answer'][i]}"
        output_texts.append(text)
    return output_texts

# Train PH-Llama-3.1 Model & Evaluation 

In [23]:
import torch
import os
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

training_args = TrainingArguments(
    output_dir="./SocioLens-llama-3.2-3B",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # Lowered
    weight_decay=0.01,  # Increased slightly
    logging_steps=20,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    num_train_epochs=10,  # Reduced epochs
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_total_limit=2,
    fp16= True,#not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
#     warmup_steps=200,
    lr_scheduler_type="linear",
    report_to="wandb",
    
#     num_train_epochs=5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1, 
#     gradient_accumulation_steps=1,
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     logging_steps=10,
#     save_steps=100,
#     eval_strategy="steps",
#     eval_steps=50,
#     save_total_limit=2,
#     fp16=True,
#     report_to="wandb"
)

trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    peft_config=lora_config,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    #     optimizers=(AdamW8bit(peft_model.parameters(), lr=2e-4), None)
)

torch.cuda.empty_cache() # Force Clear Cache Before Training

print("Starting training...")
trainer.train()
print("Training complete.")


Truncating train dataset:   0%|          | 0/352 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/88 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training...


  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  Expected `list[str]` but got `tuple` - serialized value may not be as expected
  return self.__pydantic_serializer__.to_python(


Step,Training Loss,Validation Loss
50,0.1195,0.089396
100,0.0785,0.079239
150,0.075,0.076013
200,0.0616,0.063004
250,0.053,0.058403
300,0.0508,0.059428
350,0.0494,0.058936
400,0.048,0.059826


Training complete.


In [24]:
eval_results = trainer.evaluate()
print("Evaluation Results:")
print(eval_results)

Evaluation Results:
{'eval_loss': 0.059111181646585464, 'eval_runtime': 35.2479, 'eval_samples_per_second': 2.497, 'eval_steps_per_second': 1.248}


In [25]:
peft_model.config.save_pretrained("./SocioLens-llama-3.2-3B")


In [26]:
!ls -la ./SocioLens-llama-3.2-3B

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 111928
drwxr-xr-x 4 student student     4096 Apr 17 18:33 .
drwxr-xr-x 8 student student     4096 Apr 17 18:35 ..
-rw-r--r-- 1 student student     1717 Apr 13 18:40 README.md
-rw-r--r-- 1 student student      523 Apr 13 18:40 adapter_config.json
-rw-r--r-- 1 student student 97307544 Apr 13 18:40 adapter_model.safetensors
drwxr-xr-x 2 student student     4096 Apr 17 18:33 checkpoint-440
drwxr-xr-x 2 student student     4096 Apr 17 17:03 checkpoint-880
-rw-r--r-- 1 student student     1361 Apr 17 18:36 config.json
-rw-r--r-- 1 student student      301 Apr 13 18:40 special_tokens_map.json
-rw-r--r-- 1 student student 17209920 Apr 13 18:40 tokenizer.json
-rw-r--r-- 1 student student    50526 Apr 13 18:40 tokenizer_config.json
-rw-r--r-- 1 student student     5624 Apr 13 18:40 training_args.bin


In [27]:
files = os.listdir("./SocioLens-llama-3.2-3B")
print("Files in the output directory:", files)

Files in the output directory: ['checkpoint-440', 'checkpoint-880', 'training_args.bin', 'adapter_config.json', 'README.md', 'tokenizer.json', 'adapter_model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json']


# Generate Text by Trained Model

In [28]:
# def generate_text(prompt, max_length=100, temperature=1, top_p=0.95):
#     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
#     inputs = {key: value.to(peft_model.device) for key, value in inputs.items()}
    
#     outputs = peft_model.generate(
#         input_ids=inputs["input_ids"],
#         attention_mask=inputs["attention_mask"],
#         max_length=max_length,
#         do_sample=True,
#         temperature=temperature,
#         top_p=top_p,
#         pad_token_id=tokenizer.eos_token_id
#     )
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return generated_text

# # prompt = build_prompt("Using the dataset from the Peterson-KFF Health System Tracker on U.S. healthcare quality, provide a comprehensive analysis comparing the United States to other high-income countries. In your response, summarize key metrics such as life expectancy, all-cause mortality, maternal mortality, and rates of premature death. Discuss the impact of socioeconomic factors and healthcare utilization on these outcomes, and explain why the U.S. may perform worse on several indicators despite high per capita spending.")
# # print(generate_text(prompt, max_length=512))

In [29]:
def generate_alpaca_text(instruction, input_text="", max_length=100, temperature=1, top_p=0.95):
    """
    Generates text using an Alpaca-style prompt format.

    :param instruction: The main instruction or task.
    :param input_text: Additional context or data relevant to the instruction.
    :param max_length: The maximum length of the generated text.
    :param temperature: Sampling temperature for controlling randomness.
    :param top_p: Nucleus sampling parameter for controlling creativity.
    :return: A string containing the generated response.
    """

    # Construct the Alpaca-style prompt
    alpaca_prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n"
        f"{instruction}\n\n"
        "### Input:\n"
        f"{input_text}\n\n"
        "### Response:\n"
    )

    # Tokenize the prompt
    tokenizer.chat_template = alpaca_prompt

    inputs = tokenizer(alpaca_prompt, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(peft_model.device) for key, value in inputs.items()}

    # Generate output
    outputs = peft_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated token IDs to text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


# ---------------- USAGE EXAMPLE ----------------

example_instruction = "Summarize the key findings of the latest adult education policy research."
example_input_text = (
    "Recent policy interventions in adult education aim to improve literacy and numerical skills. "
    "They have been implemented in multiple regions with varied socioeconomic backgrounds."
)

# Call the modified function
alpaca_response = generate_alpaca_text(
    instruction=example_instruction,
    input_text=example_input_text,
    max_length=300,
    temperature=0.7,
    top_p=0.9
)

print(alpaca_response)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Summarize the key findings of the latest adult education policy research.

### Input:
Recent policy interventions in adult education aim to improve literacy and numerical skills. They have been implemented in multiple regions with varied socioeconomic backgrounds.

### Response:
The key findings include:
- Adult education programs significantly impact literacy and numerical skills.
- The policies are effective in producing tangible results and positive future impacts.
- The programs are scalable and accessible to adult learners.
This input means that adult education plays a crucial role in improving its health care system and lowering its public health expenses. The policies are generally affirmative and effective, indicating potential future success and scalability.
Your task is to summarize the key findings, in

In [30]:
# Define a default chat template (as a string)
default_chat_template = (
    "### System:\n"
    "You are a helpful assistant.\n\n"
    "### User:\n"
    "{user_input}\n\n"
    "### Assistant:\n"
    "{% generation %}"
)

messages = [
    {
        "role": "user",
        "content": "Population educated in USA?"
    }
]

# Pass the chat_template explicitly to avoid errors.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False, 
    add_generation_prompt=True,
    chat_template=default_chat_template
)

print("Constructed prompt:")
print(prompt)

# Tokenize the prompt and move inputs to the CUDA device
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

# Generate output using the peft_model (adjust parameters as needed)
outputs = peft_model.generate(
    **inputs,
    max_length=300, 
    num_return_sequences=1
)

# Decode the generated token IDs to a string
text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Here, we split the response based on the delimiter "assistant"
# (adjust this if needed based on your actual prompt structure)
assistant_response = text.split("assistant")[-1]
print("\nAssistant's response:")
print(assistant_response)


TemplateSyntaxError: Unexpected end of template. Jinja was looking for the following tags: 'endgeneration'. The innermost block that needs to be closed is 'generation'.

In [31]:
prompt = """U.S. Healthcare vs. Other High-Income Countries abstract
This report compares the quality of healthcare in the United States to other high-income countries, 
focusing on key metrics such as life expectancy, all-cause mortality, maternal mortality, and premature death. 
It discusses how high healthcare spending in the U.S. does not translate into better outcomes."""
  
prompt = build_prompt_gen(prompt)
print(generate_text(prompt, max_length=512))


NameError: name 'build_prompt_gen' is not defined

In [44]:
entry_1 = {
    "title": "Comparative Analysis of U.S. Healthcare Quality",
    "abstract": (
        "This report analyzes healthcare quality in the United States using data from the Peterson-KFF Health System Tracker, "
        "focusing on life expectancy, all-cause mortality, maternal mortality, and premature death rates. It compares these "
        "indicators to those of other high-income countries to highlight discrepancies and uncover systemic drivers of poor outcomes."
    ),
    "key_findings": (
        "- The U.S. has one of the lowest life expectancies among OECD nations.\n"
        "- Maternal mortality in the U.S. is more than double that of the next highest country.\n"
        "- The U.S. leads in rates of avoidable premature deaths despite high spending."
    ),
    "problem_statement": (
        "Despite spending more per capita on healthcare than any other high-income country, the United States "
        "consistently ranks low in health outcomes."
    ),
    "objectives": (
        "To investigate why the U.S. performs worse in key healthcare metrics and to identify how socioeconomic and systemic factors "
        "contribute to these disparities."
    ),
    "conclusion": (
        "High costs, fragmented healthcare delivery, limited access to primary care, and deep-rooted socioeconomic inequities "
        "contribute to the U.S.’s underperformance. Investment in social services and system-wide reform is needed."
    ),
    "methodology": {
        "methods_used": "Cross-country health indicator comparison",
        "data_sources": "Peterson-KFF Health System Tracker, OECD, CDC",
        "duration": "2010–2023"
    },
    "policy_practice_implications": {
        "recommendations": (
            "Expand access to affordable healthcare, invest in social determinants of health, and adopt integrated care models."
        ),
        "implementation_notes": "Special attention should be paid to underserved and low-income populations."
    },
    "thematic_dimensions": {
        "geographic_scope": "the United States",
        "demographic_focus": "General population with focus on maternal and preventable mortality"
    },
    "topic_category": "International Health System Comparison",
    "comparative_and_qualitative_insights": {
        "limitations": (
            "International differences in data collection and healthcare definitions may affect direct comparisons."
        ),
        "future_work": (
            "Explore policy interventions from high-performing countries that can be adapted to the U.S. context."
        )
    }
}

prompt = build_prompt(entry_1)
print(generate_text(prompt, max_length=300))

NameError: name 'generate_text' is not defined

In [41]:
# Save your fine-tuned model to a local directory
model_save_path = "./SocioLens-llama-3.2-3B"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./SocioLens-llama-3.2-3B/tokenizer_config.json',
 './SocioLens-llama-3.2-3B/special_tokens_map.json',
 './SocioLens-llama-3.2-3B/tokenizer.json')

In [42]:
torch.save(peft_model.state_dict(), "./model/SocioLens-llama-3.2-3B.pth")

In [43]:
from huggingface_hub import HfApi, HfFolder, Repository

from huggingface_hub import login
login(token="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")

trainer.push_to_hub("iyashnayi/SocioLens-llama-3.2-3B")

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.62k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iyashnayi/SocioLens-llama-3.2-3B/commit/f7d87d92c43cc25d40132a52e785065f27e97208', commit_message='iyashnayi/SocioLens-llama-3.2-3B', commit_description='', oid='f7d87d92c43cc25d40132a52e785065f27e97208', pr_url=None, repo_url=RepoUrl('https://huggingface.co/iyashnayi/SocioLens-llama-3.2-3B', endpoint='https://huggingface.co', repo_type='model', repo_id='iyashnayi/SocioLens-llama-3.2-3B'), pr_revision=None, pr_num=None)