In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import re
from collections import OrderedDict

# ==== CONFIG ====
# Hub id (or local path) of the model every LoRA adapter below is applied to.
base_model_id = "tiiuae/falcon-rw-1b"  # or your own base path if different
# Task name -> Hub id of the LoRA adapter for that task.
adapter_model_ids = {
    "toxicity": "Kavanavnlp/falcon-1b-toxicity-qa-10k",
    "medical": "Kavanavnlp/falcon-1b-medical-qa-10k",
    "finance": "Kavanavnlp/falcon-1b-finance-qa-10k"
}
# Directory the averaged model and its tokenizer are written to.
output_dir = "./merged_falcon_model"

# ==== Exclude only embeddings and final layer norm — keep lm_head ====
# Parameters whose names match any of these patterns are copied from the base
# model instead of being averaged. Falcon names its token embedding
# `transformer.word_embeddings` and its final layer norm `transformer.ln_f`;
# the LLaMA/OPT-style names are kept so the list still works if the base model
# is swapped. Dots are escaped so they only match a literal ".".
exclude_param_names_regex = [
    r"^transformer\.word_embeddings\.",
    r"^transformer\.ln_f\.",
    r"^model\.embed_tokens\.",
    r"^model\.final_layer_norm\."
]

def should_exclude(name, patterns):
    """Return True when `name` matches at least one regex in `patterns`.

    Matching uses `re.match`, so every pattern is anchored at the start of
    `name`. An empty `patterns` iterable yields False.
    """
    for pattern in patterns:
        if re.match(pattern, name) is not None:
            return True
    return False

# ==== Load base model ====
# Half-precision weights kept on the CPU; the parameter averaging later in this
# cell runs on these tensors directly.
print(f"🔹 Loading base model: {base_model_id}")
base_load_kwargs = {"torch_dtype": torch.float16, "device_map": "cpu"}
base_model = AutoModelForCausalLM.from_pretrained(base_model_id, **base_load_kwargs)

# ==== Load and merge LoRA adapters ====
# PeftModel.from_pretrained() injects the adapter into the model object it is
# given, and merge_and_unload() folds the LoRA weights into that same object.
# Reusing `base_model` for every adapter would therefore (a) corrupt the base
# weights and (b) make every entry of `merged_models` the same accumulated
# model, turning the "average" below into base + sum of all deltas.
# Each adapter gets its own fresh copy of the base weights instead (served from
# the local Hugging Face cache, so nothing is re-downloaded).
# NOTE: this keeps len(adapter_model_ids) + 1 full models in CPU RAM at once.
merged_models = []
for task, adapter_id in adapter_model_ids.items():
    print(f"🔹 Loading adapter for {task} from: {adapter_id}")
    task_base = AutoModelForCausalLM.from_pretrained(
        base_model_id,
        torch_dtype=base_model.dtype,
        device_map="cpu"
    )
    adapter_model = PeftModel.from_pretrained(task_base, adapter_id)
    adapter_model = adapter_model.merge_and_unload()  # Merge LoRA weights into full model
    merged_models.append(adapter_model)

# ==== Merge parameters ====
print("🔹 Merging model parameters ...")
merged_state_dict = OrderedDict()

# Build each state dict once instead of once per parameter.
adapter_state_dicts = [model.state_dict() for model in merged_models]
num_averaged = len(adapter_state_dicts) + 1  # base + N adapters

for name, base_param in base_model.named_parameters():
    # Excluded parameters are taken from the base model untouched.
    if should_exclude(name, exclude_param_names_regex):
        merged_state_dict[name] = base_param.data.clone()
        continue

    # Accumulate in float32 so summing several fp16 tensors does not lose
    # precision, then cast back to the base dtype.
    avg_param = base_param.data.clone().float()
    for adapter_state in adapter_state_dicts:
        avg_param += adapter_state[name].float()

    avg_param /= num_averaged
    merged_state_dict[name] = avg_param.to(base_param.dtype)

# === Fix: Ensure lm_head.weight is present ===
# The merged dict is built from named_parameters(); when `lm_head.weight` did
# not show up there, take it from the base model's state dict so that
# load_state_dict() below receives that key.
lm_head_key = "lm_head.weight"
if lm_head_key not in merged_state_dict:
    print("🔸 Adding lm_head.weight from base model")
    merged_state_dict[lm_head_key] = base_model.state_dict()[lm_head_key]

# ==== Load final merged weights ====
print("🔹 Loading merged weights into base model")
base_model.load_state_dict(merged_state_dict)

# ==== Save final merged model ====
# Weights and tokenizer go to the same directory so it can be loaded on its own.
print(f"💾 Saving merged model to {output_dir}")
base_model.save_pretrained(output_dir)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
base_tokenizer.save_pretrained(output_dir)

print("✅ Done! Merged model is ready.")


🔹 Loading base model: tiiuae/falcon-rw-1b
🔹 Loading adapter for toxicity from: Kavanavnlp/falcon-1b-toxicity-qa-10k
🔹 Loading adapter for medical from: Kavanavnlp/falcon-1b-medical-qa-10k
🔹 Loading adapter for finance from: Kavanavnlp/falcon-1b-finance-qa-10k
🔹 Merging model parameters ...
🔸 Adding lm_head.weight from base model
🔹 Loading merged weights into base model
💾 Saving merged model to ./merged_falcon_model


tokenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

✅ Done! Merged model is ready.


In [None]:
import json
from transformers import pipeline

base_model_id = "tiiuae/falcon-rw-1b"
merged_model_dir = "./merged_falcon_model"

# Load generation pipelines (both placed on GPU 0)
pipe_base = pipeline(
    "text-generation",
    model=base_model_id,
    tokenizer=base_model_id,
    device=0,
)
pipe_merged = pipeline(
    "text-generation",
    model=merged_model_dir,
    tokenizer=merged_model_dir,
    device=0,
)

# Load prompts: a JSON list of objects with "id", "domain" and "prompt" keys
with open("eval_prompts.json") as f:
    prompts = json.load(f)

# Greedy decoding with the same settings for both models so outputs are comparable
generation_kwargs = {"max_new_tokens": 150, "do_sample": False}

results = []
for p in prompts:
    prompt = p["prompt"]

    base_out = pipe_base(prompt, **generation_kwargs)[0]["generated_text"]
    merged_out = pipe_merged(prompt, **generation_kwargs)[0]["generated_text"]

    record = {
        "id": p["id"],
        "domain": p["domain"],
        "prompt": prompt,
        "base_response": base_out.strip(),
        "merged_response": merged_out.strip(),
    }
    results.append(record)

# Save responses
with open("model_outputs.json", "w") as f:
    json.dump(results, f, indent=2)

print("✅ Model outputs saved.")




Device set to use cuda:0
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✅ Model outputs saved.


In [None]:
# One-time setup for a fresh Colab runtime: uncomment and run before the lm-eval cells below.
# !pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@main
# !pip install accelerate einops bitsandbytes transformers datasets


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write paths under
# /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# Where evaluation results are written; created up front so later cells can
# write into it.
OUTPUT_DIR = "./eval_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Models being compared: the Hub base model and the merged checkpoint on Drive.
BASE_MODEL = "tiiuae/falcon-rw-1b"
MERGED_MODEL = "/content/drive/MyDrive/merged_falcon_model"

# Comma-separated lm-eval task names.
TASKS = "nq_open,commonsense_qa,toxigen"


## **Toxicity**

In [None]:
# Baseline run: evaluate the unmodified tiiuae/falcon-rw-1b with the lm-eval
# Hugging Face backend on nq_open, commonsense_qa and toxigen (GPU, batch size 32).
# Results are written to eval_outputs/base_results.json.
!lm-eval \
  --model hf \
  --model_args pretrained=tiiuae/falcon-rw-1b \
  --tasks nq_open,commonsense_qa,toxigen \
  --device cuda \
  --batch_size 32 \
  --trust_remote_code \
  --output_path eval_outputs/base_results.json


2025-05-10 02:25:27.098215: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 02:25:27.114743: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746843927.135738    7129 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746843927.142171    7129 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 02:25:27.163086: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig

base_model_id = "tiiuae/falcon-rw-1b"
# Checkpoint written by the merge cell (base_model.save_pretrained("./merged_falcon_model")).
local_merged_dir = "./merged_falcon_model"
output_dir = "/content/drive/MyDrive/merged_falcon_model"

# Load the *merged* checkpoint from disk. Loading `base_model_id` here and
# saving it (with the load_state_dict step skipped) would put the untouched
# base weights on Drive under the "merged" name, and every evaluation that
# reads this Drive path would silently score the base model.
# from_pretrained raises if `local_merged_dir` does not exist; run the merge
# cell first in that case.
merged_model = AutoModelForCausalLM.from_pretrained(local_merged_dir, torch_dtype=torch.float16)

# Save full model. save_pretrained() also writes config.json, so no separate
# AutoConfig save is needed.
merged_model.save_pretrained(output_dir)
AutoTokenizer.from_pretrained(base_model_id).save_pretrained(output_dir)

print("✅ Merged model with weights, config, and tokenizer saved.")


✅ Merged model with weights, config, and tokenizer saved.


In [None]:
# Merged-model run: same tasks and settings as the baseline run, but loading the
# checkpoint stored at /content/drive/MyDrive/merged_falcon_model.
# Results are written to eval_outputs/merged_results.json.
!lm-eval \
  --model hf \
  --model_args pretrained=/content/drive/MyDrive/merged_falcon_model \
  --tasks nq_open,commonsense_qa,toxigen \
  --device cuda \
  --batch_size 32 \
  --trust_remote_code \
  --output_path eval_outputs/merged_results.json


2025-05-10 02:39:09.741754: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746844749.762887   10943 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746844749.769485   10943 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10:02:39:16 INFO     [__main__:428] Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`
2025-05-10:02:39:16 INFO     [__main__:440] Selected Tasks: ['commonsense_qa', 'nq_open', 'toxigen']
2025-05-10:02:39:16 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-10:02:39:16 INF

## **Medical**

In [None]:
# !pip install git+https://github.com/google-research/bleurt.git

In [None]:
# Medical evaluation of the merged model: loads the local ./merged_falcon_model
# checkpoint and runs medmcqa, pubmedqa and medtext (GPU, batch size 32).
# NOTE(review): results go to results_merged.json in the working directory,
# not under eval_outputs/ like the other runs; confirm this is intended.
!lm-eval \
  --model hf \
  --model_args pretrained=./merged_falcon_model \
  --tasks medmcqa,pubmedqa,medtext \
  --device cuda \
  --batch_size 32 \
  --trust_remote_code \
  --output_path results_merged.json


2025-05-10 03:22:37.320779: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 03:22:37.338673: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746847357.360202   24018 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746847357.366818   24018 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 03:22:37.388887: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
# Medical evaluation of the unmodified base model (medmcqa, pubmedqa, medtext;
# GPU, batch size 32). It writes to its own file so it does not share or
# overwrite the results location of the earlier baseline run on
# nq_open/commonsense_qa/toxigen (eval_outputs/base_results.json).
!lm-eval \
  --model hf \
  --model_args pretrained=tiiuae/falcon-rw-1b \
  --tasks medmcqa,pubmedqa,medtext \
  --device cuda \
  --batch_size 32 \
  --trust_remote_code \
  --output_path eval_outputs/base_medical_results.json

2025-05-10 03:34:59.642096: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-10 03:34:59.660529: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746848099.683432   32851 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746848099.690454   32851 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-10 03:34:59.712907: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr