# Part 0: Setup

In [7]:
# Install PEFT along with dependencies
!pip install -q peft transformers accelerate bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
import os
from dotenv import load_dotenv
import torch
import platform
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import pandas as pd


In [9]:
# RUN THIS CELL ONLY IF RUNNING ON PACE-ICE
# override the huggingface cache path and nltk cache path
dirs = {
    "HF_HOME":"~/scratch/hf_cache",
    "TRITON_CACHE_DIR":"~/scratch/triton_cache",
    "TORCHINDUCTOR_CACHE_DIR":"~/scratch/inductor_cache",
    'NLTK_DATA':"~/scratch/nltk_data"
}

for name in dirs:
    d = dirs[name]
    path = os.path.expanduser(d)
    print(name)
    print(path)
    os.makedirs(path, exist_ok=True)
    # making sure the cache dirs are rwx for owner
    os.chmod(path, 0o700)
    os.environ[name] = path
print("Make sure the cache files are in ~/scratch/ so quota doesn't exceed limit!")

HF_HOME
/home/hice1/jho89/scratch/hf_cache
TRITON_CACHE_DIR
/home/hice1/jho89/scratch/triton_cache
TORCHINDUCTOR_CACHE_DIR
/home/hice1/jho89/scratch/inductor_cache
NLTK_DATA
/home/hice1/jho89/scratch/nltk_data
Make sure the cache files are in ~/scratch/ so quota doesn't exceed limit!


In [10]:
# Change this to your own token (or save in .env)
os.environ['HF_TOKEN'] = ''

In [11]:
load_dotenv()  # loads HF_TOKEN into environment
print("‚úÖ Hugging Face token loaded from environment.")

‚úÖ Hugging Face token loaded from environment.


In [12]:
print("=== üß† Environment Info ===")
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print("-----------------------------")

# Check for CUDA (NVIDIA GPUs)
if torch.cuda.is_available():
    num_gpus = torch.cuda.device_count()
    print(f"‚úÖ CUDA is available. Number of GPUs: {num_gpus}")

    for i in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(i)
        total_mem = torch.cuda.get_device_properties(i).total_memory / (1024**3)
        print(f"  ‚Ä¢ GPU {i}: {gpu_name} ({total_mem:.2f} GB VRAM)")

    # Also show current GPU and free memory
    current_gpu = torch.cuda.current_device()
    print(f"\nUsing GPU: {torch.cuda.get_device_name(current_gpu)}")
    free_mem, total_mem = torch.cuda.mem_get_info()
    print(f"Available VRAM: {free_mem/1e9:.2f} GB / {total_mem/1e9:.2f} GB")

# Check for Apple Silicon (MPS)
elif torch.backends.mps.is_available():
    print("‚úÖ Running on Apple Silicon (MPS backend).")

# Check for ROCm (AMD GPUs)
elif torch.version.hip is not None:
    print("‚úÖ ROCm (AMD GPU) detected.")

# Otherwise fallback to CPU
else:
    print("‚ö†Ô∏è No GPU detected ‚Äî running on CPU only.")
    print("This will be very slow for large models like Llama-3.1-8B.")

print("-----------------------------")

# Confirm torch default device
default_device = "cuda" if torch.cuda.is_available() else (
    "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"Default torch device: {default_device}")

=== üß† Environment Info ===
Python version: 3.10.13
PyTorch version: 2.8.0+cu128
-----------------------------
‚úÖ CUDA is available. Number of GPUs: 1
  ‚Ä¢ GPU 0: NVIDIA H200 (139.80 GB VRAM)

Using GPU: NVIDIA H200
Available VRAM: 101.23 GB / 150.11 GB
-----------------------------
Default torch device: cuda


In [13]:

# --- 3. Model name on Hugging Face Hub ---
model_name = "meta-llama/Llama-3.1-8B"

# --- 4. (Optional) Authenticate if model is gated/private ---
# from huggingface_hub import login
# login(token="YOUR_HF_TOKEN")

print("Loading tokenizer and model‚Ä¶")

# --- 5. Load tokenizer ---
# Tokenizer converts text ‚Üî tokens. Must match model for correct vocabulary.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # ensure padding works

# --- 6. Load model in full bf16 precision (no quantization) ---
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",      # Automatically distributes layers across GPUs
    torch_dtype=torch.bfloat16,  # Use bf16 for all layers
    low_cpu_mem_usage=True,       # Stream weights directly to GPU to reduce CPU RAM footprint
    trust_remote_code=True        # Needed if the repo includes custom code
)

print("‚úÖ Model loaded successfully!")


Loading tokenizer and model‚Ä¶


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:02<00:00,  1.67it/s]


‚úÖ Model loaded successfully!


In [14]:

# --- 8. Simple inference test ---
prompt = """### Instruction:
Explain the difference between left-wing and right-wing economic policies.

### Response:"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=150,                  # control output length
    do_sample=True,                      # enables some randomness
    temperature=0.7,                     # mild creativity
    top_p=0.9,                           # nucleus sampling
    pad_token_id=tokenizer.eos_token_id,
    repetition_penalty=1.2               # prevent repeated text
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

### Instruction:
Explain the difference between left-wing and right-wing economic policies.

### Response: 
The main differences in economic policy between these two ideologies are that Left-Wing ideology tends to support government intervention, regulation of markets by taxation, redistribution of income or wealth through progressive tax systems and social welfare programs such as unemployment benefits etc., whereas Right Wing Economics focuses more on individual responsibility rather than collective action with less emphasis placed upon state control over private enterprise activities. Additionally there may also be ideological variations depending on which side you look at e.g some people consider themselves libertarian but still identify strongly enough within one camp so they can't really fit into either category perfectly well - this sorta thing happens quite often!


# Part 1: Load & Prepare Data
Data when loaded in:
```
{
    "instruction": "Write a political news story from a {left/ right/ center} perspective based on the headline.", 
    "input": "Headline: ...",
    "output": "..."
}
```

**Steps to process it:**
1. Combine into 1 traning text
2. Tokenize text

In [15]:
cwd = os.getcwd()

# build full paths
data_path = os.path.join(cwd, "train.jsonl")   # input file
# Option 1: Load using pandas ‚Äî easiest for inspection
train_df = pd.read_json(data_path, lines=True)

print("‚úÖ Loaded train.jsonl successfully!")
print(f"Number of samples: {len(train_df)}\n")
print("üìä Preview:")
display(train_df.head())

‚úÖ Loaded train.jsonl successfully!
Number of samples: 2852

üìä Preview:


Unnamed: 0,instruction,input,output
0,Write a political news story from a right pers...,Headline: If Democrats Flip House What Will Th...,Quotes displayed in real-time or delayed by at...
1,Write a political news story from a center per...,Headline: Putin Marks Russias Victory Day Spee...,It has become an annual event - the military p...
2,Write a political news story from a left persp...,Headline: Media Industry Ap Criticized After D...,Copyright 2025 The Associated Press. All Right...
3,Write a political news story from a left persp...,Headline: Economy And Jobs Will There Be Reces...,"As the Fed wrestles with inflation, experts wo..."
4,Write a political news story from a right pers...,Headline: Technology Plans Tesla Tunnel Nashvi...,The project will be similar to one already in ...


In [16]:
def format_prompt(example):
    """
    Combine instruction, input, and output into one training text.
    """
    return (
        f"### Instruction:\n{example['instruction'].strip()}\n\n"
        f"### Input:\n{example['input'].strip()}\n\n"
        f"### Response:\n{example['output'].strip()}"
    )

In [26]:
N = 200
eval_df = train_df.sample(n=N, random_state=42).reset_index(drop=True)

def build_eval_prompt(row):
    """
    Same format as training, but without including the output.
    The model should generate the part after '### Response:'.
    """
    return (
        f"### Instruction:\n{row['instruction'].strip()}\n\n"
        f"### Input:\n{row['input'].strip()}\n\n"
        f"### Response:\n"
    )

eval_prompts = [build_eval_prompt(r) for _, r in eval_df.iterrows()]
eval_references = [r["output"] for _, r in eval_df.iterrows()]  # gold answers
print(eval_prompts[0])
print("-------")
print(eval_references[0][:400])


### Instruction:
Write a political news story from a left perspective based on the headline.

### Input:
Headline: Perspectives Florida Bans Critical Race Theory Schools, Summary: The Florida State Board of Education unanimously voted to ban lessons that include critical race theory (CRT) from public schools Thursday. The board approved anamendment that¬†prohibitsthe teaching that America is inherently racist, or¬†"the theory that racism is not merely the product of prejudice, but that racism is embedded in American society and its legal systems in order to uphold the supremacy of white persons." Florida Gov. Ron DeSantis said in a video before the meeting that¬†CRT distorts history, and urged board members, many who were appointed by DeSantis himself,¬†to approve of the amendment that would teach student historical¬†facts instead of ‚Äútrying to indoctrinate them with ideology.‚Äù This comes after other states, such as¬†Idaho, Oklahoma, Tennessee, and Iowa, have putrestrictions on CRT

In [18]:
# Apply formatting to DataFrames
train_df["text"] = train_df.apply(format_prompt, axis=1)

# Preview one formatted sample
print(train_df["text"].iloc[0][:800])  # show first 800 chars

### Instruction:
Write a political news story from a right perspective based on the headline.

### Input:
Headline: If Democrats Flip House What Will They Prioritize, Summary: With Democrats largely projected to flip the House, perspectives vary on their potential agenda, with some positing they will reform campaign ethics, outlaw gerrymandering, and bolster voting rights, while others say they will prioritize impeachment and increase spending. Republicans are expected to keep the Senate, so some also point out that many policy initiatives on both sides of the aisle will likely come to a standstill.

### Response:
Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided byFactset.
          Powered and implemented byFactSet Digital Solutions.Legal Statement.

T


In [19]:
from datasets import Dataset
def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=1024,   # adjust based on GPU memory
        padding="max_length"
    )
# Convert to HF Dataset and tokenize
train_dataset = Dataset.from_pandas(train_df[["text"]])
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Check shape and sample
print(tokenized_train)

print("============ Sanity Check - decoding encoded tokens ============")
print(tokenizer.decode(tokenized_train[0]["input_ids"][:200]))

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2852/2852 [00:05<00:00, 511.53 examples/s]

Dataset({
    features: ['__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 2852
})
<|begin_of_text|>### Instruction:
Write a political news story from a right perspective based on the headline.

### Input:
Headline: If Democrats Flip House What Will They Prioritize, Summary: With Democrats largely projected to flip the House, perspectives vary on their potential agenda, with some positing they will reform campaign ethics, outlaw gerrymandering, and bolster voting rights, while others say they will prioritize impeachment and increase spending. Republicans are expected to keep the Senate, so some also point out that many policy initiatives on both sides of the aisle will likely come to a standstill.

### Response:
Quotes displayed in real-time or delayed by at least 15 minutes. Market data provided byFactset.
          Powered and implemented byFactSet Digital Solutions.Legal Statement.

This material may not be published, broadcast, rewritten, or redistributed. ¬©2025 F




# Part 2: LoRA fine-tuning
**Steps**
1. Set up LoRA model
2. Set up training arguments
3. Prepare a Data Collator
4. Set up Trainer
5. Run Training
6. Save new weights

In [20]:
# Set up Lora model for fine-tuning
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,                # rank of the LoRA matrices
    lora_alpha=32,       # scaling factor
    target_modules=["q_proj", "v_proj"],  # which layers to fine-tune
    lora_dropout=0.05,   # dropout for LoRA
    bias="none",         # keep bias frozen
    task_type="CAUSAL_LM" # type of task
)

# Wrap base model with PEFT
model = get_peft_model(model, lora_config) # freezes original layer
model.print_trainable_parameters()  # confirm only LoRA params are trainable

trainable params: 6,815,744 || all params: 8,037,076,992 || trainable%: 0.0848


In [21]:
# Define Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="outputs/lora",         # where to save checkpoints & LoRA adapters
    per_device_train_batch_size=8,     # batch size per GPU
    gradient_accumulation_steps=4,     # effective batch size = 8 * 4 * 2 GPUs = 64
    learning_rate=3e-4,                # LoRA-friendly default
    num_train_epochs=3,                # 3 epochs, increase if dataset is larger
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    bf16=True,                          # optional: H200 supports bf16, can improve speed
    dataloader_drop_last=True,           # drop incomplete batch to avoid OOM
    report_to="none",                    # change to 'wandb' if using W&B
    remove_unused_columns=False,
    ddp_find_unused_parameters=False,   # improves multi-GPU performance
    gradient_checkpointing=True,        # reduce memory usage for large models
    warmup_steps=50,                    # optional warmup
    optim="paged_adamw_32bit",          # memory-efficient optimizer
    lr_scheduler_type="cosine"          # smooth LR schedule
)


print("‚úÖ Training arguments set up")


‚úÖ Training arguments set up


In [22]:
# Prepare Data Collator (Coverts list of examples into tensors)
from transformers import DataCollatorForLanguageModeling

# Collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # False because this is causal LM, not masked LM
)

print("‚úÖ Data collator ready")

‚úÖ Data collator ready


In [23]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    data_collator=data_collator,
    processing_class=tokenizer
)

print("‚úÖ Trainer ready")

‚úÖ Trainer ready


In [None]:
# Begin training
trainer.train()

# Save the LoRA adapter after training
model.save_pretrained("outputs/lora_adapter")

print("‚úÖ Training complete and LoRA adapter saved")


Step,Training Loss
50,1.9467
100,1.7948
150,1.7436
200,1.7123
250,1.7022


# Part 3: Quick Tests

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch, os

# ---- Paths ----
# base model you used during LoRA training
base_model_name = "meta-llama/Llama-3.1-8B"   # change if you used a different one

# absolute path to your LoRA adapter (from the screenshot)
adapter_path = "./outputs/lora_adapter"

print("Base model name:", base_model_name)
print("Adapter path:", adapter_path, "exists?", os.path.isdir(adapter_path))

# ---- Load tokenizer from base model ----
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token

# ---- Load base model ----
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# ---- Attach LoRA adapter on top of base model ----
trained_model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
)

# Optional: merge LoRA weights into the base model and drop the adapter structure
# trained_model = trained_model.merge_and_unload()

print("‚úÖ Loaded base model + LoRA adapter; ready for evaluation!")


Base model name: meta-llama/Llama-3.1-8B
Adapter path: ./outputs/lora_adapter exists? True


Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:02<00:00,  1.67it/s]


‚úÖ Loaded base model + LoRA adapter; ready for evaluation!




In [24]:

# --- Test generation ---
prompt = """### Instruction:
Write a political news story from a right perspective based on the headline.

### Input:
Headline: Georgia Tech student working on NLP project

### Response:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = trained_model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        do_sample=True,
        top_p=0.9
    )

generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


### Instruction:
Write a political news story from a right perspective based on the headline.

### Input:
Headline: Georgia Tech student working on NLP project

### Response:
Georgia Tech student working on NLP project is an example of how technology is being used to advance the field of artificial intelligence. The project, which is being developed by a team of students at the university, aims to create a system that can understand and respond to human language in a more natural and intuitive way. This could have a number of applications, including improving customer service, developing more advanced virtual assistants, and creating more sophisticated chatbots. The project is still in its early stages, but the team is hopeful that it will be able to make significant progress in the coming years.


In [30]:
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"
trained_model.to(device)

generated_outputs = []

for i, prompt in tqdm(enumerate(eval_prompts), total=len(eval_prompts), desc="Generating"):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = trained_model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    input_len = inputs["input_ids"].shape[1]
    gen_tokens = output_ids[0, input_len:]
    gen_text = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    generated_outputs.append(gen_text)


Generating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 200/200 [16:02<00:00,  4.81s/it]


In [37]:
import json

with open("generated_outputs.jsonl", "w") as f:
    for prompt, output in zip(eval_prompts, generated_outputs):
        f.write(json.dumps({"prompt": prompt, "generation": output}) + "\n")

print("Saved to generated_outputs.jsonl")


Saved to generated_outputs.jsonl


Distinct N evaluation

In [38]:
from collections import Counter

def get_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

def distinct_n(corpus, n=1):
    all_ngrams = []
    for text in corpus:
        tokens = text.split()
        if len(tokens) < n:
            continue
        all_ngrams.extend(get_ngrams(tokens, n))

    if not all_ngrams:
        return 0.0

    unique_ngrams = set(all_ngrams)
    return len(unique_ngrams) / len(all_ngrams)

d1 = distinct_n(generated_outputs, n=1)
d2 = distinct_n(generated_outputs, n=2)

print(f"Distinct-1: {d1:.4f}")
print(f"Distinct-2: {d2:.4f}")


Distinct-1: 0.2067
Distinct-2: 0.6128


In [39]:
prompt_texts = [str(x) for x in eval_prompts]

In [45]:
from sentence_transformers import SentenceTransformer, util

sem_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

emb_prompts = sem_model.encode(prompt_texts, convert_to_tensor=True)
emb_gen     = sem_model.encode(generated_outputs, convert_to_tensor=True)

cos_scores = util.cos_sim(emb_prompts, emb_gen).diagonal()

avg_sim = float(cos_scores.mean())
print("First 10 prompt‚Üîgeneration similarities:",
      [round(float(x), 4) for x in cos_scores[:10]])
print(f"\nAverage semantic similarity: {avg_sim:.4f}")

First 10 prompt‚Üîgeneration similarities: [0.4634, 0.0702, 0.948, 0.9443, 0.9645, 0.4293, 0.9097, 0.9684, 0.8727, 0.8349]

Average semantic similarity: 0.8032


In [46]:
output_path = "eval_results.jsonl"
print("Saving to:", os.path.abspath(output_path))

with open(output_path, "w", encoding="utf-8") as f:
    for prompt, generation, sim in zip(eval_prompts, generated_outputs, cos_scores):
        record = {
            "prompt": prompt,
            "generation": generation,
            "semantic_similarity": float(sim),
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Saved", len(generated_outputs), "records with similarity to", output_path)


Saving to: /storage/ice1/3/6/jho89/AI-Political-Perspectives-main/training/eval_results.jsonl
Saved 200 records with similarity to eval_results.jsonl
