In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import os

from huggingface_hub import login


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Authenticate with the Hugging Face Hub without embedding a credential in the
# notebook: the token is read from the HF_TOKEN environment variable, which
# must be set before this cell runs.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    # Authentication is optional for public models, so continue instead of failing.
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")

In [None]:

# # Load base model and tokenizer
# base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16)

# # Load fine-tuned models
# math_code_model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama_v1.1_math_code", torch_dtype=torch.float16)
# finance_model = AutoModelForCausalLM.from_pretrained("ritamsharma/tinyllama-finance-v3", torch_dtype=torch.float16)

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer

# # Save base model
# base_model.save_pretrained("models/base")
# tokenizer.save_pretrained("models/base")

# # Save math_code model
# math_code_model.save_pretrained("models/math_code")
# tokenizer.save_pretrained("models/math_code")

# # Save finance model
# finance_model.save_pretrained("models/finance")
# tokenizer.save_pretrained("models/finance")

In [None]:
!git clone https://github.com/yule-BUAA/MergeLM.git
os.chdir("/content/model_merging_methods")
# !pip install -r requirements.txt
os.getcwd()

fatal: destination path 'MergeLM' already exists and is not an empty directory.


'/content/model_merging_methods'

In [None]:
# !pip install datasets
# !pip install jsonlines
# !pip install vllm
# !pip install human_eval
# !pip install fraction

Collecting fraction
  Downloading Fraction-2.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading Fraction-2.2.0-py3-none-any.whl (4.7 kB)
Installing collected packages: fraction
Successfully installed fraction-2.2.0


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import copy
import os

# Import the MergingMethod class and potentially TaskVector if needed
# Assumes the class code is saved in 'merging_methods.py'
from merging_methods import MergingMethod #, TaskVector # Uncomment TaskVector if it's separate

# --- Configuration ---
# Hub identifiers of the three checkpoints taking part in the merges.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MATH_CODE_MODEL_ID = "TinyLlama/TinyLlama_v1.1_math_code"
FINANCE_MODEL_ID = "ritamsharma/tinyllama-finance-v3"

# Destination folders for the merged checkpoints, all under ./merged_models/.
(
    OUTPUT_DIR_BASE_MATH_FIN_AVG,
    OUTPUT_DIR_BASE_MATH_FIN_TASK,
    OUTPUT_DIR_MATH_FIN_AVG,
    OUTPUT_DIR_MATH_FIN_TASK,
) = (
    "./merged_models/" + run_name
    for run_name in (
        "base_math_fin_avg",
        "base_math_fin_task_arithmetic",
        "math_fin_avg",
        "math_fin_task_arithmetic",
    )
)

# Knobs handed to the merging routines.
EXCLUDE_REGEX = list()  # no parameter names are excluded by default
TASK_ARITHMETIC_SCALING = 0.5  # example scaling coefficient for task arithmetic

# Run on the GPU when one is visible, otherwise on the CPU; weights are kept in
# 16-bit floats to save memory.
DTYPE = torch.half
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Helper Function to Save Model ---
def save_model(model, tokenizer, output_dir):
    """Write ``model`` and ``tokenizer`` into ``output_dir``.

    The directory (including missing parents) is created when necessary. Both
    objects are persisted through their ``save_pretrained`` method, the model
    first, and a confirmation line is printed afterwards.
    """
    os.makedirs(output_dir, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
    print(f"Merged model saved to: {output_dir}")

# --- Load Models ---
print("Loading models...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)


def _load_model_on_device(model_id):
    """Load a causal LM by id with weights in DTYPE and move it to DEVICE.

    ``low_cpu_mem_usage=True`` is passed to help when host RAM is limited.
    """
    return AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=DTYPE,
        low_cpu_mem_usage=True,
    ).to(DEVICE)


base_model = _load_model_on_device(BASE_MODEL_ID)
print(f"Loaded Base Model: {BASE_MODEL_ID}")

math_code_model = _load_model_on_device(MATH_CODE_MODEL_ID)
print(f"Loaded Math/Code Model: {MATH_CODE_MODEL_ID}")

finance_model = _load_model_on_device(FINANCE_MODEL_ID)
print(f"Loaded Finance Model: {FINANCE_MODEL_ID}")
print("-" * 30)

# --- Merge Combination 1: Base + Math/Code + Finance ---
print("Starting Merge Combination 1: Base + Math/Code + Finance")

# 1.A: average merging over the base, math/code and finance checkpoints.
print("\nPerforming Average Merging (Base + Math + Finance)...")
avg_merger = MergingMethod("average_merging")
models_to_avg_merge_1 = [base_model, math_code_model, finance_model]

# Averaged parameters as computed by the merger for the three inputs.
avg_state_1 = avg_merger.average_merging(
    exclude_param_names_regex=EXCLUDE_REGEX,
    models_to_merge=models_to_avg_merge_1,
)

# A deep copy of the base model receives the averaged weights, so base_model
# itself stays available, unmodified, for the steps that follow.
avg_host_1 = copy.deepcopy(base_model)
avg_host_1 = avg_host_1.to(DEVICE)
avg_merger.copy_params_to_model(model=avg_host_1, params=avg_state_1)
print("Average Merging (Base + Math + Finance) complete.")
save_model(avg_host_1, tokenizer, OUTPUT_DIR_BASE_MATH_FIN_AVG)
del avg_host_1, avg_state_1  # drop the references once the checkpoint is on disk

# 1.B: task arithmetic, i.e. base + scale * (math_delta + finance_delta).
print("\nPerforming Task Arithmetic (Base + Math_Delta + Finance_Delta)...")
task_merger = MergingMethod("task_arithmetic")
# The fine-tuned models whose task vectors are taken relative to the base.
models_for_task_vectors_1 = [math_code_model, finance_model]

# Copy of the base model: passed as the reference for the deltas and later
# used as the container for the merged weights.
task_host_1 = copy.deepcopy(base_model)
task_host_1 = task_host_1.to(DEVICE)

task_state_1 = task_merger.task_arithmetic(
    merged_model=task_host_1,
    models_to_merge=models_for_task_vectors_1,
    scaling_coefficient=TASK_ARITHMETIC_SCALING,
    exclude_param_names_regex=EXCLUDE_REGEX,
)

# Write the merged parameters into the copy and persist it.
task_merger.copy_params_to_model(model=task_host_1, params=task_state_1)
print("Task Arithmetic (Base + Math_Delta + Finance_Delta) complete.")
save_model(task_host_1, tokenizer, OUTPUT_DIR_BASE_MATH_FIN_TASK)
del task_host_1, task_state_1  # drop the references once the checkpoint is on disk
print("-" * 30)


# --- Merge Combination 2: Math/Code + Finance ---
print("Starting Merge Combination 2: Math/Code + Finance")

# 2.A: average merging of the two fine-tuned checkpoints only.
print("\nPerforming Average Merging (Math + Finance)...")
avg_merger_2 = MergingMethod("average_merging")
models_to_avg_merge_2 = [math_code_model, finance_model]

# Averaged parameters as computed by the merger for the two inputs.
avg_state_2 = avg_merger_2.average_merging(
    exclude_param_names_regex=EXCLUDE_REGEX,
    models_to_merge=models_to_avg_merge_2,
)

# The container is a deep copy of math_code_model (finance_model would serve
# equally well as the structure to copy).
avg_host_2 = copy.deepcopy(math_code_model)
avg_host_2 = avg_host_2.to(DEVICE)
avg_merger_2.copy_params_to_model(model=avg_host_2, params=avg_state_2)
print("Average Merging (Math + Finance) complete.")
save_model(avg_host_2, tokenizer, OUTPUT_DIR_MATH_FIN_AVG)
del avg_host_2, avg_state_2  # drop the references once the checkpoint is on disk


# 2.B: task arithmetic relative to the base model.
print("\nPerforming Task Arithmetic (Base + Math_Delta + Finance_Delta)...")
# Task arithmetic needs a base model to form the deltas, so the original
# base_model is used here as well: the result is
# Base + scale * (Math_Delta + Finance_Delta), with Math and Finance supplying
# the task vectors.
task_merger_2 = MergingMethod("task_arithmetic")
models_for_task_vectors_2 = [math_code_model, finance_model]

# Copy of the *base* model: reference for the deltas and container for the
# merged weights.
task_host_2 = copy.deepcopy(base_model)
task_host_2 = task_host_2.to(DEVICE)

task_state_2 = task_merger_2.task_arithmetic(
    merged_model=task_host_2,
    models_to_merge=models_for_task_vectors_2,
    scaling_coefficient=TASK_ARITHMETIC_SCALING,
    exclude_param_names_regex=EXCLUDE_REGEX,
)

# Write the merged parameters into the copy and persist it.
task_merger_2.copy_params_to_model(model=task_host_2, params=task_state_2)
print("Task Arithmetic (Base + Math_Delta + Finance_Delta) complete.")
# Only Math and Finance were listed for merging, yet their deltas are applied
# *onto the base model*.
save_model(task_host_2, tokenizer, OUTPUT_DIR_MATH_FIN_TASK)
del task_host_2, task_state_2  # drop the references once the checkpoint is on disk
print("-" * 30)

print("All merging tasks finished.")

Loading models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded Base Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

Loaded Math/Code Model: TinyLlama/TinyLlama_v1.1_math_code


config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loaded Finance Model: ritamsharma/tinyllama-finance-v3
------------------------------
Starting Merge Combination 1: Base + Math/Code + Finance

Performing Average Merging (Base + Math + Finance)...
Average Merging (Base + Math + Finance) complete.
Merged model saved to: ./merged_models/base_math_fin_avg

Performing Task Arithmetic (Base + Math_Delta + Finance_Delta)...
Task Arithmetic (Base + Math_Delta + Finance_Delta) complete.
Merged model saved to: ./merged_models/base_math_fin_task_arithmetic
------------------------------
Starting Merge Combination 2: Math/Code + Finance

Performing Average Merging (Math + Finance)...
Average Merging (Math + Finance) complete.
Merged model saved to: ./merged_models/math_fin_avg

Performing Task Arithmetic (Base + Math_Delta + Finance_Delta)...
Task Arithmetic (Base + Math_Delta + Finance_Delta) complete.
Merged model saved to: ./merged_models/math_fin_task_arithmetic
------------------------------
All merging tasks finished.


In [None]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import copy
# import os
# from merging_methods import MergingMethod

# # --- Configuration ---
# BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# MATH_CODE_MODEL_ID = "TinyLlama/TinyLlama_v1.1_math_code"

# # Output directories for saved models
# OUTPUT_DIR_BASE_MATH_AVG = "./merged_models/base_math_avg"
# OUTPUT_DIR_BASE_MATH_TASK = "./merged_models/base_math_task_arithmetic"

# # Merging parameters
# TASK_ARITHMETIC_SCALING = 0.5
# EXCLUDE_REGEX = []

# # Device and dtype config
# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DTYPE = torch.float16

# # --- Helper Function ---
# def save_model(model, tokenizer, output_dir):
#     os.makedirs(output_dir, exist_ok=True)
#     model.save_pretrained(output_dir)
#     tokenizer.save_pretrained(output_dir)
#     print(f"Merged model saved to: {output_dir}")

# # --- Load Models ---
# print("Loading models...")
# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# base_model = AutoModelForCausalLM.from_pretrained(
#     BASE_MODEL_ID,
#     torch_dtype=DTYPE,
#     low_cpu_mem_usage=True
# ).to(DEVICE)
# print(f"Loaded Base Model: {BASE_MODEL_ID}")

# math_model = AutoModelForCausalLM.from_pretrained(
#     MATH_CODE_MODEL_ID,
#     torch_dtype=DTYPE,
#     low_cpu_mem_usage=True
# ).to(DEVICE)
# print(f"Loaded Math Model: {MATH_CODE_MODEL_ID}")
# print("-" * 30)

# # --- Average Merge ---
# print("Performing Average Merge (Base + Math)...")
# avg_merger = MergingMethod("average_merging")
# averaged_params = avg_merger.average_merging(
#     models_to_merge=[base_model, math_model],
#     exclude_param_names_regex=EXCLUDE_REGEX
# )

# merged_model_avg = copy.deepcopy(base_model).to(DEVICE)
# avg_merger.copy_params_to_model(params=averaged_params, model=merged_model_avg)
# save_model(merged_model_avg, tokenizer, OUTPUT_DIR_BASE_MATH_AVG)
# del merged_model_avg, averaged_params
# print("Average merge complete.")

# # --- Task Arithmetic Merge ---
# print("\nPerforming Task Arithmetic (Base + Math_Delta)...")
# task_merger = MergingMethod("task_arithmetic")

# merged_model_task = copy.deepcopy(base_model).to(DEVICE)
# task_merged_params = task_merger.task_arithmetic(
#     merged_model=merged_model_task,
#     models_to_merge=[math_model],
#     exclude_param_names_regex=EXCLUDE_REGEX,
#     scaling_coefficient=TASK_ARITHMETIC_SCALING
# )
# task_merger.copy_params_to_model(params=task_merged_params, model=merged_model_task)
# save_model(merged_model_task, tokenizer, OUTPUT_DIR_BASE_MATH_TASK)
# del merged_model_task, task_merged_params
# print("Task arithmetic merge complete.")
# print("-" * 30)

# print("All merging tasks finished.")


In [None]:
!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git
!pip install accelerate transformers datasets huggingface_hub
!pip install lm-eval[math]
!pip install --upgrade lm-eval

In [None]:
# Smoke test: in a separate interpreter, import eval_logger from
# lm_eval.evaluator and print a confirmation when the import succeeds.
!python -c "from lm_eval.evaluator import eval_logger; print('eval_logger is available')"


2025-05-04 16:04:10.066048: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-04 16:04:10.083602: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746374650.104988    6492 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746374650.111443    6492 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04 16:04:10.132977: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import os

from huggingface_hub import login

In [None]:
# Switch the working directory to /content; the evaluation cells below save
# checkpoints under /content and refer to them through relative ./ paths.
os.chdir("/content")
# !pip install -r requirements.txt
# Trailing expression: displays the new working directory as the cell output.
os.getcwd()

'/content'

## MATH BENCHMARKS

In [None]:
model = AutoModelForCausalLM.from_pretrained("/content/model_merging_methods/merged_models/base_math_fin_avg")
tokenizer = AutoTokenizer.from_pretrained("/content/model_merging_methods/merged_models/base_math_fin_avg")

model.save_pretrained("/content/merged_model_avg")
tokenizer.save_pretrained("/content/merged_model_avg")


!lm-eval \
  --model hf \
  --model_args pretrained=./merged_model_avg \
  --tasks boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_avg.json


2025-05-04 16:06:12.602024: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746374772.623350    7057 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746374772.629813    7057 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:16:06:19 INFO     [__main__:440] Selected Tasks: ['boolq', 'hellaswag']
2025-05-04:16:06:19 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:16:06:19 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './merged_model_avg'}
2025-05-04:16:06:19 INFO     [models.huggingface:137] Using 

In [None]:
model = AutoModelForCausalLM.from_pretrained("/content/model_merging_methods/merged_models/math_fin_avg")
tokenizer = AutoTokenizer.from_pretrained("/content/model_merging_methods/merged_models/math_fin_avg")

model.save_pretrained("/content/math_fin_avg")
tokenizer.save_pretrained("/content/math_fin_avg")


!lm-eval \
  --model hf \
  --model_args pretrained=./math_fin_avg \
  --tasks boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_avg_nobase.json

2025-05-04 16:17:17.347952: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746375437.369794    9959 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746375437.376453    9959 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:16:17:23 INFO     [__main__:440] Selected Tasks: ['boolq', 'hellaswag']
2025-05-04:16:17:23 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:16:17:23 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './math_fin_avg'}
2025-05-04:16:17:23 INFO     [models.huggingface:137] Using devi

In [None]:
model = AutoModelForCausalLM.from_pretrained("/content/model_merging_methods/merged_models/base_math_fin_task_arithmetic")
tokenizer = AutoTokenizer.from_pretrained("/content/model_merging_methods/merged_models/base_math_fin_task_arithmetic")

model.save_pretrained("/content/base_math_fin_task_arithmetic")
tokenizer.save_pretrained("/content/base_math_fin_task_arithmetic")


!lm-eval \
  --model hf \
  --model_args pretrained=./base_math_fin_task_arithmetic \
  --tasks boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path results_base_math_fin_task_arithmetic.json

2025-05-04 16:27:22.191303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746376042.212228   12599 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746376042.218565   12599 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:16:27:28 INFO     [__main__:440] Selected Tasks: ['boolq', 'hellaswag']
2025-05-04:16:27:28 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:16:27:28 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './base_math_fin_task_arithmetic'}
2025-05-04:16:27:28 INFO     [models.huggingfac

In [None]:
model = AutoModelForCausalLM.from_pretrained("/content/model_merging_methods/merged_models/math_fin_task_arithmetic")
tokenizer = AutoTokenizer.from_pretrained("/content/model_merging_methods/merged_models/math_fin_task_arithmetic")

model.save_pretrained("/content/math_fin_task_arithmetic")
tokenizer.save_pretrained("/content/math_fin_task_arithmetic")


!lm-eval \
  --model hf \
  --model_args pretrained=./math_fin_task_arithmetic \
  --tasks boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_task_arithmetic.json

2025-05-04 16:37:16.099138: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746376636.120983   15198 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746376636.127626   15198 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:16:37:22 INFO     [__main__:440] Selected Tasks: ['boolq', 'hellaswag']
2025-05-04:16:37:22 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:16:37:22 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './math_fin_task_arithmetic'}
2025-05-04:16:37:22 INFO     [models.huggingface:137

## QA and Fin Benchmarks

In [None]:
# Run lm-eval (hf model type) on ./merged_model_avg for the tasks nq_open,
# piqa, arc_easy, arc_challenge and winogrande, on CUDA with batch size 50;
# the output path is results_qa_only.json.
!lm-eval \
  --model hf \
  --model_args pretrained=./merged_model_avg \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande \
  --device cuda \
  --batch_size 50 \
  --output_path results_qa_only.json

2025-05-04 16:57:50.355216: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746377870.376003   20751 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746377870.382338   20751 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:16:57:56 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'nq_open', 'piqa', 'winogrande']
2025-05-04:16:57:56 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:16:57:56 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './merged_model_avg'}
2025-05-04:16:57:56 

In [None]:
# Run lm-eval (hf model type) on ./math_fin_avg for the tasks nq_open, piqa,
# arc_easy, arc_challenge and winogrande, on CUDA with batch size 50; the
# output path is results_math_fin_avg_qa_fin.json.
!lm-eval \
  --model hf \
  --model_args pretrained=./math_fin_avg \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_avg_qa_fin.json

2025-05-04 17:52:26.409958: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746381146.431143   35471 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746381146.437619   35471 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:17:52:32 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'nq_open', 'piqa', 'winogrande']
2025-05-04:17:52:32 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:17:52:32 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './math_fin_avg'}
2025-05-04:17:52:32 INFO

In [None]:
# Run lm-eval (hf model type) on ./base_math_fin_task_arithmetic for the tasks
# nq_open, piqa, arc_easy, arc_challenge and winogrande, on CUDA with batch
# size 50; the output path is results_math_fin_base_tv_qa_fin.json.
!lm-eval \
  --model hf \
  --model_args pretrained=./base_math_fin_task_arithmetic \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_base_tv_qa_fin.json

2025-05-04 18:06:14.460567: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746381974.482006   39096 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746381974.488567   39096 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:18:06:21 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'nq_open', 'piqa', 'winogrande']
2025-05-04:18:06:21 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:18:06:21 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './base_math_fin_task_arithmetic'}
2025-05

In [None]:
# Run lm-eval (hf model type) on ./math_fin_task_arithmetic for the tasks
# nq_open, piqa, arc_easy, arc_challenge and winogrande, on CUDA with batch
# size 50; the output path is results_math_fin_tv_qa_fin.json.
!lm-eval \
  --model hf \
  --model_args pretrained=./math_fin_task_arithmetic \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande \
  --device cuda \
  --batch_size 50 \
  --output_path results_math_fin_tv_qa_fin.json

2025-05-04 18:20:15.291051: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746382815.312649   42743 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746382815.319151   42743 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:18:20:21 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'nq_open', 'piqa', 'winogrande']
2025-05-04:18:20:21 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:18:20:21 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './math_fin_task_arithmetic'}
2025-05-04:1

## BASE PERFORMANCE

In [None]:
# Hub identifiers of the unmerged checkpoints evaluated as baselines.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MATH_CODE_MODEL_ID = "TinyLlama/TinyLlama_v1.1_math_code"
FINANCE_MODEL_ID = "ritamsharma/tinyllama-finance-v3"

# Weights are kept in 16-bit floats to save memory; run on the GPU when one
# is visible, otherwise on the CPU.
DTYPE = torch.half
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
print("Loading models...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Options shared by all three loads: weights in DTYPE, and
# low_cpu_mem_usage=True, which helps if RAM is limited.
shared_load_kwargs = {"torch_dtype": DTYPE, "low_cpu_mem_usage": True}

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, **shared_load_kwargs).to(DEVICE)
print(f"Loaded Base Model: {BASE_MODEL_ID}")

math_code_model = AutoModelForCausalLM.from_pretrained(MATH_CODE_MODEL_ID, **shared_load_kwargs).to(DEVICE)
print(f"Loaded Math/Code Model: {MATH_CODE_MODEL_ID}")

finance_model = AutoModelForCausalLM.from_pretrained(FINANCE_MODEL_ID, **shared_load_kwargs).to(DEVICE)
print(f"Loaded Finance Model: {FINANCE_MODEL_ID}")
print("-" * 30)


Loading models...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded Base Model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
Loaded Math/Code Model: TinyLlama/TinyLlama_v1.1_math_code
Loaded Finance Model: ritamsharma/tinyllama-finance-v3
------------------------------


In [None]:
# Write each unmerged checkpoint, together with the shared tokenizer, to a
# local folder that the lm-eval commands below reference by relative path.
for fine_tuned_model, export_dir in (
    (math_code_model, "math_code_model"),
    (finance_model, "finance_model"),
):
    fine_tuned_model.save_pretrained(export_dir)
    tokenizer.save_pretrained(export_dir)

base_model.save_pretrained("base_model")
# Kept as the trailing expression so its return value is shown as cell output.
tokenizer.save_pretrained("base_model")

('base_model/tokenizer_config.json',
 'base_model/special_tokens_map.json',
 'base_model/tokenizer.model',
 'base_model/added_tokens.json',
 'base_model/tokenizer.json')

In [None]:
# Baseline: run lm-eval (hf model type) on the unmerged ./base_model over all
# seven tasks. The output path is specific to this run; it used to repeat the
# path of the math_fin_task_arithmetic QA run, so both runs targeted the same
# results file name.
!lm-eval \
  --model hf \
  --model_args pretrained=./base_model \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande,boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path results_base_model.json

2025-05-04 17:23:34.232906: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746379414.253537   27741 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746379414.259828   27741 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:17:23:40 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'nq_open', 'piqa', 'winogrande']
2025-05-04:17:23:40 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:17:23:40 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './base_model'}
2025

In [None]:
# Baseline: run lm-eval (hf model type) on the unmerged ./math_code_model over
# all seven tasks, on CUDA with batch size 50; the output path is
# base_math.json.
!lm-eval \
  --model hf \
  --model_args pretrained=./math_code_model \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande,boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path base_math.json

2025-05-04 17:36:12.106668: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746380172.128341   31092 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746380172.134974   31092 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:17:36:18 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'nq_open', 'piqa', 'winogrande']
2025-05-04:17:36:18 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:17:36:18 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './math_code_model'}

In [None]:
# Baseline: run lm-eval (hf model type) on the unmerged ./finance_model over
# all seven tasks, on CUDA with batch size 50; the output path is
# base_fin.json. Every continuation backslash is preceded by a space so the
# model path can never be glued to the next option.
!lm-eval \
  --model hf \
  --model_args pretrained=./finance_model \
  --tasks nq_open,piqa,arc_easy,arc_challenge,winogrande,boolq,hellaswag \
  --device cuda \
  --batch_size 50 \
  --output_path base_fin.json

2025-05-04 17:42:48.574568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746380568.596728   32882 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746380568.603272   32882 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-04:17:42:55 INFO     [__main__:440] Selected Tasks: ['arc_challenge', 'arc_easy', 'boolq', 'hellaswag', 'nq_open', 'piqa', 'winogrande']
2025-05-04:17:42:55 INFO     [evaluator:185] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fewshot manual seed to 1234
2025-05-04:17:42:55 INFO     [evaluator:223] Initializing hf model, with arguments: {'pretrained': './finance_model'}
2

## Our finetuned versions

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import copy
import os

# Import the MergingMethod class and potentially TaskVector if needed
# Assumes the class code is saved in 'merging_methods.py'
from merging_methods import MergingMethod #, TaskVector # Uncomment TaskVector if it's separate

# --- Configuration ---
# Runtime placement: run on the GPU when one is visible, otherwise on the CPU,
# and keep weights in half precision to reduce memory use.
DTYPE = torch.float16
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Hugging Face Hub identifiers of the three checkpoints taking part in the merges.
# NOTE(review): the medical and finance checkpoints are presumably fine-tuned
# from the base checkpoint -- confirm, since task arithmetic relies on that.
BASE_MODEL_ID = "tiiuae/Falcon3-1B-Base"
Medical_model_ID = "Kavanavnlp/falcon-1b-medical-qa-10k"
FINANCE_MODEL_ID = "Kavanavnlp/falcon-1b-finance-qa-10k"

# Destination directory for each merged checkpoint (one per merge variant).
OUTPUT_DIR_BASE_med_FIN_AVG = "./merged_models/base_med_fin_avg"
OUTPUT_DIR_BASE_med_FIN_TASK = "./merged_models/base_med_fin_task_arithmetic"
OUTPUT_DIR_med_FIN_AVG = "./merged_models/med_fin_avg"
OUTPUT_DIR_med_FIN_TASK = "./merged_models/med_fin_task_arithmetic"

# Merge hyper-parameters.
TASK_ARITHMETIC_SCALING = 0.5  # coefficient passed to task arithmetic as scaling_coefficient
EXCLUDE_REGEX = list()  # empty list: no parameter names are excluded from merging

# --- Helper Function to Save Model ---
def save_model(model, tokenizer, output_dir):
    """Persist a model and its tokenizer side by side in ``output_dir``.

    The directory is created when missing (an existing one is reused). Each
    object is written through its own ``save_pretrained`` method, model first,
    and a confirmation line is printed once both calls have returned.

    Args:
        model: Object exposing ``save_pretrained(path)``.
        tokenizer: Object exposing ``save_pretrained(path)``.
        output_dir: Target directory path.
    """
    os.makedirs(output_dir, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(output_dir)
    print(f"Merged model saved to: {output_dir}")

# --- Load Models ---
def _load_causal_lm(model_id):
    """Load a causal-LM checkpoint with the notebook's dtype/device settings.

    Args:
        model_id: Identifier or path accepted by
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        The loaded model, created with ``torch_dtype=DTYPE`` and moved to ``DEVICE``.
    """
    return AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=DTYPE,
        low_cpu_mem_usage=True,  # Helps if RAM is limited
    ).to(DEVICE)


print("Loading models...")
# A single tokenizer, taken from the base checkpoint, is saved alongside every
# merged model further down.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

base_model = _load_causal_lm(BASE_MODEL_ID)
print(f"Loaded Base Model: {BASE_MODEL_ID}")

medical_model = _load_causal_lm(Medical_model_ID)
# Label fixed: this checkpoint is the medical model, not a "med/Code" model.
print(f"Loaded Medical Model: {Medical_model_ID}")

finance_model = _load_causal_lm(FINANCE_MODEL_ID)
print(f"Loaded Finance Model: {FINANCE_MODEL_ID}")
print("-" * 30)

# --- Merge Combination 1: Base + Medical + Finance ---
# Produces two checkpoints from the three loaded models:
#   1.A  a parameter average over base, medical and finance;
#   1.B  a task-arithmetic merge of the medical and finance models relative to base.
print("Starting Merge Combination 1: Base + Medical + Finance")

# 1.A: Average Merging
print("\nPerforming Average Merging (Base + med + Finance)...")
avg_merger = MergingMethod("average_merging")
# The base checkpoint is averaged in as a third model alongside the two others.
models_to_avg_merge_1 = [base_model, medical_model, finance_model]

# Calculate averaged parameters
averaged_params_1 = avg_merger.average_merging(
    models_to_merge=models_to_avg_merge_1,
    exclude_param_names_regex=EXCLUDE_REGEX
)

# Use a deep copy of the base model as the container for the averaged weights,
# so base_model itself stays untouched for the later merges.
merged_model_avg_1 = copy.deepcopy(base_model).to(DEVICE) # Ensure it's on the right device
avg_merger.copy_params_to_model(params=averaged_params_1, model=merged_model_avg_1)
print("Average Merging (Base + med + Finance) complete.")
save_model(merged_model_avg_1, tokenizer, OUTPUT_DIR_BASE_med_FIN_AVG)
del merged_model_avg_1, averaged_params_1 # Drop references so the memory can be reused

# 1.B: Task Arithmetic
print("\nPerforming Task Arithmetic (Base + med_Delta + Finance_Delta)...")
task_merger = MergingMethod("task_arithmetic")
# Models providing task vectors (relative to base_model)
models_for_task_vectors_1 = [medical_model, finance_model]

# Deep copy of the base model: it is passed as the reference model for the
# delta calculation and afterwards receives the merged weights.
merged_model_task_1_base = copy.deepcopy(base_model).to(DEVICE)

# Calculate merged parameters: base + scale * (med_delta + finance_delta)
task_merged_params_1 = task_merger.task_arithmetic(
    merged_model=merged_model_task_1_base, # This is the base model for delta calculation
    models_to_merge=models_for_task_vectors_1,
    exclude_param_names_regex=EXCLUDE_REGEX,
    scaling_coefficient=TASK_ARITHMETIC_SCALING
)

# Load merged parameters into the model structure
task_merger.copy_params_to_model(params=task_merged_params_1, model=merged_model_task_1_base)
print("Task Arithmetic (Base + med_Delta + Finance_Delta) complete.")
save_model(merged_model_task_1_base, tokenizer, OUTPUT_DIR_BASE_med_FIN_TASK)
del merged_model_task_1_base, task_merged_params_1 # Drop references so the memory can be reused
print("-" * 30)


# --- Merge Combination 2: Medical + Finance ---
# Produces two checkpoints from the two non-base models:
#   2.A  a parameter average over medical and finance only;
#   2.B  a task-arithmetic merge of their deltas applied onto the base model.
print("Starting Merge Combination 2: Medical + Finance")

# 2.A: Average Merging
print("\nPerforming Average Merging (med + Finance)...")
avg_merger_2 = MergingMethod("average_merging")
models_to_avg_merge_2 = [medical_model, finance_model]

# Calculate averaged parameters
averaged_params_2 = avg_merger_2.average_merging(
    models_to_merge=models_to_avg_merge_2,
    exclude_param_names_regex=EXCLUDE_REGEX
)

# Create a new model structure (copy of one of the merged models)
# Using medical_model structure here, could use finance_model too
merged_model_avg_2 = copy.deepcopy(medical_model).to(DEVICE)
avg_merger_2.copy_params_to_model(params=averaged_params_2, model=merged_model_avg_2)
print("Average Merging (med + Finance) complete.")
save_model(merged_model_avg_2, tokenizer, OUTPUT_DIR_med_FIN_AVG)
del merged_model_avg_2, averaged_params_2 # Drop references so the memory can be reused


# 2.B: Task Arithmetic (Relative to Base Model)
print("\nPerforming Task Arithmetic (Base + med_Delta + Finance_Delta)...")
# Note: Task Arithmetic inherently uses a base model to calculate the deltas.
# We use the original base_model here for delta calculation,
# resulting in Base + scaled * (med_Delta + Finance_Delta).
# The models providing task vectors are med and Finance.
# This uses the same inputs and scaling coefficient as step 1.B; only the
# output directory differs.
task_merger_2 = MergingMethod("task_arithmetic")
models_for_task_vectors_2 = [medical_model, finance_model]

# Create a copy of the *base* model to apply the task vectors to
merged_model_task_2_base = copy.deepcopy(base_model).to(DEVICE)

# Calculate merged parameters: base + scale * (med_delta + finance_delta)
# task_arithmetic is a method of the merger object; calling it as a bare
# function raises NameError.
task_merged_params_2 = task_merger_2.task_arithmetic(
    merged_model=merged_model_task_2_base, # Base model for delta calculation
    models_to_merge=models_for_task_vectors_2,
    exclude_param_names_regex=EXCLUDE_REGEX,
    scaling_coefficient=TASK_ARITHMETIC_SCALING
)

# Load merged parameters into the model structure
task_merger_2.copy_params_to_model(params=task_merged_params_2, model=merged_model_task_2_base)
print("Task Arithmetic (Base + med_Delta + Finance_Delta) complete.")
# Note: Even though we only specified med+Finance for merging,
# Task Arithmetic applies their deltas *onto the base model*.
save_model(merged_model_task_2_base, tokenizer, OUTPUT_DIR_med_FIN_TASK)
del merged_model_task_2_base, task_merged_params_2 # Drop references so the memory can be reused
print("-" * 30)

print("All merging tasks finished.")