In [1]:
!pip install transformers datasets trl peft bitsandbytes accelerate torch sentencepiece huggingface_hub -U
!pip install --upgrade trl transformers accelerate
!pip install --upgrade deepspeed==0.14.4

# ATTENTION: Be sure to restart the notebook after installing all packages!

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.15.2-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

# Log into Hugging Face
Ensure that you have access to the gated `meta-llama/Llama-3.2-3B-Instruct` repository on Hugging Face (you can request access at <https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct>). Access to the base model is required before running inference on the fine-tuned model.

In [2]:
# Authenticate with the Hugging Face Hub. The base model loaded later in this
# notebook lives in a gated repository, so an authorized account is required.
# NOTE(review): interpreter_login is expected to prompt interactively for an
# access token -- confirm against the huggingface_hub documentation.
from huggingface_hub import interpreter_login
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



# Prompt
Edit the `prompt_text` string in the next cell to change the user prompt, then run the cell after it to get the fine-tuned Llama 3.2 3B model's response.

In [None]:
# User prompt for the fine-tuned model; the inference cell interpolates this
# variable into the chat-formatted prompt.
prompt_text = "Investigating the effects of manipulation on the gale shapley algorithm."

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import warnings
from IPython.display import display, Markdown

# Inference cell: load the base model, apply and merge the LoRA adapter,
# build a Llama-3 style chat prompt from `prompt_text`, sample a response and
# render it as Markdown.
#
# Inputs read from earlier cells: `prompt_text`.
# Names left in the kernel namespace: `model`, `tokenizer`, `formatted_prompt`,
# `inputs`, `outputs`, `generated_token_ids`, `response`.

# Suppress warnings if needed
warnings.filterwarnings("ignore")

# --- Configuration ---
base_model_id = "meta-llama/Llama-3.2-3B-Instruct"
# *** IMPORTANT: Update this path to where your LoRA adapter weights are saved ***
adapter_path = "./results_llama3_finetuned"
# Reported for information only; actual placement is decided by device_map="auto"
# below, and inputs are moved to wherever the model ended up.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Optional: Quantization config for loading base model (if needed for memory)
# Adjust based on your inference hardware and how you trained/saved.
# If you merge first, you might load the base without quantization here.
use_quantization = False # Set to True if you want to load base model in 4-bit
compute_dtype = torch.bfloat16 # Or torch.float16

bnb_config = None
if use_quantization:
    print("Loading base model with 4-bit quantization...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False, # Typically False for inference
    )
else:
    print(f"Loading base model with dtype: {compute_dtype}")


# --- Load Base Model and Tokenizer ---
# trust_remote_code is intentionally NOT enabled: the Llama architecture is
# implemented natively in transformers, so there is no reason to allow code
# from the model repository to be executed locally.
print(f"Loading base model: {base_model_id}")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype if not use_quantization else None, # Set dtype if not quantizing
    device_map="auto", # Automatically handle device placement
)

print(f"Loading tokenizer for: {base_model_id}")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token # Set pad token if needed
tokenizer.padding_side = "right"

# --- Load LoRA Adapter and Merge ---
print(f"Loading LoRA adapter from: {adapter_path}")
# Load the LoRA adapter weights on top of the base model
model = PeftModel.from_pretrained(model, adapter_path)
print("LoRA adapter loaded.")

print("Merging LoRA adapter into the base model...")
# Merge the adapter weights into the base model weights so that the result is
# a plain transformers model with no PEFT wrapper.
# NOTE(review): with use_quantization=True the merge is performed into 4-bit
# weights -- confirm that the resulting precision is acceptable.
model = model.merge_and_unload()
print("Adapter merged.")

# Set the model to evaluation mode
model.eval()

# --- Prepare Prompt using the Correct Chat Template ---
# *** CRITICAL: Use the exact same format as your training data ***
# Example Llama 3.2 Instruct format (VERIFY THIS STRUCTURE!)
# Ensure this matches the format_dataset_batch function from your training script
formatted_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

{prompt_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

""" # The assistant header is followed by a blank line; generation starts after it

print("\nFormatted Prompt:")
print(formatted_prompt)

# --- Tokenize Input ---
print("Tokenizing prompt...")
# The prompt string already begins with <|begin_of_text|>, so the tokenizer
# must not prepend its own special tokens (that would yield a duplicated BOS).
# NOTE(review): if the training pipeline tokenized the same template with
# special tokens enabled, confirm which variant the adapter was trained on.
# Inputs are moved to the model's device, which device_map="auto" selected.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt",
    padding=False,
    truncation=True,
    add_special_tokens=False,
).to(model.device)

# --- Generate Output ---
print("Generating response...")
# Adjust generation parameters as needed (max_new_tokens, temperature, top_p, etc.)
with torch.no_grad(): # Disable gradient calculations for inference
    outputs = model.generate(
        **inputs,
        max_new_tokens=2048,          # Maximum number of new tokens to generate
        do_sample=True,             # Whether to use sampling; set to False for deterministic output
        temperature=0.6,            # Controls randomness (lower = more deterministic)
        top_p=0.9,                  # Nucleus sampling probability
        eos_token_id=tokenizer.eos_token_id, # Stop generation when EOS token is encountered
        pad_token_id=tokenizer.pad_token_id, # Explicit, so generate() does not have to guess (and warn)
    )

# --- Decode and Print Output ---
# Keep only the tokens produced after the prompt, then decode them to text
# with special tokens (padding, EOS, ...) stripped.
generated_token_ids = outputs[0][inputs['input_ids'].shape[1]:]
response = tokenizer.decode(generated_token_ids, skip_special_tokens=True)

div = "="*80
print(f"\n{div}\nModel Response:")
display(Markdown(response))

print("\nInference complete.")



  warn(
2025-04-22 15:26:01.868674: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-22 15:26:01.868734: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-22 15:26:01.869805: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-22 15:26:01.876094: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[2025-04-22 15:26:03,615] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  @autocast_custom_fwd
  @autocast_custom_bwd


Using device: cuda
Loading base model with dtype: torch.bfloat16
Loading base model: meta-llama/Llama-3.2-3B-Instruct


config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Loading tokenizer for: meta-llama/Llama-3.2-3B-Instruct


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Loading LoRA adapter from: ./x_results_llama3_finetuned
LoRA adapter loaded.
Merging LoRA adapter into the base model...
Adapter merged.

Formatted Prompt:
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Investigating the effects of manipulation on the gale shapley algorithm.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


Tokenizing prompt...


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Generating response...

Model Response:


The Gale-Shapley algorithm is a well-known algorithm used in stable marriage problem to match entities (e.g., men and women) in a way that maximizes the number of stable pairings. Manipulation in this context refers to finding or designing strategies that deviate from the algorithm’s output to achieve a different stable or suboptimal matching.

**Research Problem:**  
Investigate how manipulation affects the Gale-Shapley algorithm's stability, efficiency, and fairness.

---

### Approach

1. **Literature Review**  
   - Study existing theoretical results on the Gale-Shapley algorithm and known manipulation strategies.  
   - Review previous empirical studies on the robustness of the algorithm against manipulation.

2. **Theoretical Analysis**  
   - Formalize different types of manipulation (e.g., prefix manipulation, postfix manipulation, and simultaneous manipulation).  
   - Analyze the impact of each type on the algorithm's output.  
   - Derive conditions under which manipulation succeeds or fails.

3. **Simulation and Experimentation**  
   - Implement the Gale-Shapley algorithm and various manipulation strategies.  
   - Generate synthetic datasets reflecting realistic pairing scenarios (e.g., different numbers of entities, preferences).  
   - Run simulations to observe the outcomes and measure success rates of manipulation attempts.

4. **Case Studies**  
   - Analyze specific instances where manipulation succeeds or fails.  
   - Interpret results to understand the impact on stability and fairness.

5. **Comparison with Alternative Algorithms**  
   - Compare the Gale-Shapley algorithm’s robustness against manipulation with other matching algorithms (e.g., Hungarian algorithm, block design algorithms).

### Data Collection Plan

- Collect synthetic data from generated random preference lists and matching results.  
- Record manipulation attempts and outcomes.  
- Document stability metrics (number of stable pairings, number of manipulations successful).

### Data Analysis Plan

- Statistical analysis to compare success rates of manipulation across different manipulation types and dataset sizes.  
- Visualization of algorithm performance metrics over varying numbers of entities.  
- Theoretical analysis to derive bounds on manipulation success probability.

### Potential Challenges or Limitations

- Computational complexity in simulating large-scale scenarios.  
- Difficulty in generalizing results across diverse preference structures.  
- Defining and measuring "success" of manipulation in a formal way.

### Ethical Considerations

- Ensure fairness in the representation of preferences and entities.  
- Transparency in reporting manipulation results without bias.

---

This structured approach aims to comprehensively understand and quantify the effects of manipulation on the Gale-Shapley algorithm, contributing to both theoretical insights and practical improvements.


Inference complete.
