In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the JSON inputs and the model
# output directory used by later cells live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import json
import pandas as pd
from datasets import Dataset

# Step 1: Load your different JSON files
json_file1 = "/content/drive/MyDrive/test_vector_db/datasetPH.json"
json_file2 = "/content/drive/MyDrive/test_vector_db/Merged Q& A data.json"
json_file3 = "/content/drive/MyDrive/test_vector_db/structured_to_text_all_years2.json"


def _load_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 so non-ASCII characters in the
    data are read the same way regardless of the runtime's default locale.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load each file
data1 = _load_json(json_file1)
data2 = _load_json(json_file2)
data3 = _load_json(json_file3)

# Step 2: Create a function to convert each format to instruction-response pairs
def process_format1(data):
    """Convert the nested per-paper JSON mapping into instruction-response pairs.

    Args:
        data: Mapping of paper id -> record dict. A record may contain
            ``title``, ``author``, ``publication_year`` and the nested dicts
            ``policy_practice_implications`` (``recommendations``,
            ``implementation_notes``) and
            ``comparative_and_qualitative_insights`` (``limitations``,
            ``future_work``). Every field is optional.

    Returns:
        list[dict]: One ``{"instruction": str, "response": str}`` dict per
        paper, in the mapping's iteration order.
    """
    pairs = []

    for item in data.values():
        # Missing metadata falls back to a placeholder instead of raising
        # KeyError, so one incomplete record cannot abort the whole export.
        title = item.get("title", "Unknown Title")
        author = item.get("author", "Unknown Author")
        year = item.get("publication_year", "Unknown Year")
        instruction = (
            f"Analyze the policy implications of the paper titled '{title}' "
            f"authored by {author} ({year})."
        )

        # `or {}` also covers sections that are present but null in the JSON.
        policy = item.get("policy_practice_implications") or {}
        comp_qual = item.get("comparative_and_qualitative_insights") or {}

        recommendations = policy.get("recommendations") or "No recommendations provided."
        response_parts = [f"Recommendations: {recommendations}"]

        # Optional sections are only emitted when they carry content, so the
        # response never contains a dangling "Label: " line.
        optional_sections = (
            ("Implementation Notes", policy.get("implementation_notes", "")),
            ("Limitations", comp_qual.get("limitations", "")),
            ("Future Work", comp_qual.get("future_work", "")),
        )
        for label, value in optional_sections:
            if value:
                response_parts.append(f"{label}: {value}")

        pairs.append({
            "instruction": instruction,
            "response": "\n".join(response_parts),
        })

    return pairs


# Symbols with a readable ASCII spelling. They must be rewritten BEFORE the
# ASCII-only filter runs; otherwise the non-ASCII characters are stripped first
# and the unit pattern can never match (leaving a meaningless "g/m").
_QA_SYMBOL_REPLACEMENTS = (
    ("\u00b5g/m\u00b3", "ug/m3"),  # micro sign + superscript three
    ("\u03bcg/m\u00b3", "ug/m3"),  # Greek small letter mu variant
    ("%", " percent"),
)


def _clean_qa_text(text):
    """Spell out known symbols, then drop any remaining non-ASCII characters."""
    text = text or ""  # tolerate null fields in the JSON
    for symbol, replacement in _QA_SYMBOL_REPLACEMENTS:
        text = text.replace(symbol, replacement)
    return text.encode('ascii', 'ignore').decode('ascii')


def process_format2(data):
    """Convert a Q&A style JSON document into instruction-response pairs.

    Args:
        data: Dict with an ``entries`` list; each entry is a dict that may
            contain ``question`` and ``answer`` strings.

    Returns:
        list[dict]: One ``{"instruction": question, "response": answer}`` per
        entry, with both texts reduced to ASCII ("%" becomes " percent" and
        the microgram-per-cubic-metre unit becomes "ug/m3").
    """
    pairs = []

    for item in data.get("entries", []):
        pairs.append({
            "instruction": _clean_qa_text(item.get('question', '')),
            "response": _clean_qa_text(item.get('answer', '')),
        })

    return pairs

def process_format3(data):
    """Turn state-year education records into instruction-response pairs.

    Each record is a dict that may carry ``year``, ``state`` and ``text``.
    The text is reduced to ASCII, the "!!" separators and the word "Estimate"
    are removed, "%" is spelled out as " percent", and whitespace runs are
    collapsed to single spaces. Nothing is truncated.
    """
    def _tidy(raw):
        # Drop every non-ASCII character, then strip the table markers.
        ascii_only = raw.encode('ascii', 'ignore').decode('ascii')
        without_markers = (
            ascii_only
            .replace("!!", " ")
            .replace("Estimate", "")
            .replace("%", " percent")
        )
        # split()/join collapses any run of whitespace into one space.
        return " ".join(without_markers.split())

    def _to_pair(record):
        year = record.get("year", "Unknown Year")
        state = record.get("state", "Unknown State")
        body = _tidy(record.get("text", ""))
        return {
            "instruction": f"Summarize the educational attainment statistics for {state} in {year}.",
            "response": f"Here is a summary of the educational data for {state} in {year}:\n{body}",
        }

    return [_to_pair(record) for record in data]


# Step 3: Process each file with its corresponding function
pairs1 = process_format1(data1)
pairs2 = process_format2(data2)
pairs3 = process_format3(data3)

# Step 4: Combine all instruction-response pairs
all_pairs = pairs1 + pairs2 + pairs3

# Step 5: Convert to proper format for training
# Option A: JSONL format
# The training cell loads "/content/policy_training_data.jsonl", so the export
# must land on exactly that path; writing under a different name leaves the
# training cell reading a missing or stale file.
with open("/content/policy_training_data.jsonl", "w", encoding="utf-8") as f:
    for pair in all_pairs:
        f.write(json.dumps(pair) + "\n")

# Option B: Create a Hugging Face Dataset object directly
#train_dataset = Dataset.from_pandas(pd.DataFrame(all_pairs))

# Optional: Split into train and validation sets
#train_val = train_dataset.train_test_split(test_size=0.1)
#train_data = train_val['train']
#val_data = train_val['test']

# Save the processed datasets (optional)
#train_data.save_to_disk("policy_train_dataset")
#val_data.save_to_disk("policy_val_dataset")

In [None]:
pip install bitsandbytes transformers accelerate peft trl datasets

In [None]:
# Authenticate this runtime with the Hugging Face Hub. The CLI prompts
# interactively for an access token (see https://huggingface.co/settings/tokens).
# NOTE(review): presumably required to download the meta-llama checkpoint used
# by the training cell — confirm.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [None]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.3


In [None]:
!pip install --upgrade trl



In [None]:
import os
import torch
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer,SFTConfig

# Configuration
model_id = "meta-llama/Llama-3.2-3B"
output_dir = "/content/drive/MyDrive/Capstone project/policy_analyst_model"
# JSONL with one {"instruction", "response"} object per line. This path must
# match the file written by the preprocessing cell.
dataset_path = "/content/policy_training_data.jsonl"

# Layout of every training example. Inference prompts must use the same layout
# up to and including "### Response:\n" so the model continues with the answer.
PROMPT_TEMPLATE = "### Instruction:\n{instruction}\n\n### Response:\n{response}"

# QLoRA configuration: 4-bit NF4 weights with double quantization, float16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load model
# trust_remote_code is intentionally not set: no custom repository code is
# needed for this checkpoint, so there is no reason to allow its execution.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)
model.config.use_cache = False  # KV cache is not useful while training with gradient checkpointing
model = prepare_model_for_kbit_training(model)

# LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA. The adapters are attached here exactly once; the trainer below
# receives the already-wrapped model and therefore gets no peft_config.
model = get_peft_model(model, lora_config)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load dataset
raw_data = load_dataset("json", data_files=dataset_path)["train"]


def combine_instruction_response(example):
    """Add a ``text`` field holding the full prompt + answer training string.

    The EOS token is appended at the END of the response so the model learns
    where an answer stops. Placing it between instruction and response would
    teach the model to emit EOS immediately after the prompt.
    """
    example['text'] = PROMPT_TEMPLATE.format(
        instruction=example['instruction'],
        response=example['response'],
    ) + tokenizer.eos_token
    return example


raw_data = raw_data.map(combine_instruction_response)

# Split into train and validation (90% train, 10% validation)
data = raw_data.train_test_split(test_size=0.1, seed=42)


# Training arguments
training_args = SFTConfig(
    output_dir=output_dir,
    dataset_text_field="text",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    save_steps=100,
    logging_steps=10,
    # `evaluation_strategy` was renamed to `eval_strategy` in recent
    # transformers releases; without it the eval split is never used.
    eval_strategy="steps",
    eval_steps=100,
    # A PEFT-wrapped model hides the base model's forward signature, so the
    # label column has to be named explicitly for eval_loss to be computed.
    label_names=["labels"],
    save_total_limit=2,
    fp16=True,
    group_by_length=True,
)

# Initialize trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["test"],
    # Pass the tokenizer configured above (pad token, padding side) instead of
    # letting the trainer load a fresh default one; it is also saved with the model.
    processing_class=tokenizer,
)

# Train the model
trainer.train()

# Save the model
trainer.save_model()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/1093 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1093 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1093 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1093 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/122 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/122 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/122 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/122 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshresthabhandari2[0m ([33mshresthabhandari2-na[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,1.8386
20,1.8927
30,1.859
40,0.943
50,1.4941
60,1.0489
70,1.0352
80,1.0587
90,0.6725
100,1.2499
