In [None]:
# Importing the sys module to access system-specific parameters and functions
import sys
# Importing the os module to interact with the operating system
import os

# This function checks if CUDA (GPU support) is available and installs necessary dependencies for fine-tuning
# If CUDA is not available, it warns the user to select a GPU runtime
# The function installs the following packages:
# - openpyxl: Library for reading and writing Excel files (xlsx/xlsm/xltx/xltm)
def install_dependencies():
    import torch
    if not torch.cuda.is_available():
      print("CUDA is not available. \nPick a GPU before running this notebook. \nGo to 'Runtime' -> 'Change runtime type' to do this. (Colab)")
      return
    %pip install bitsandbytes
    %pip install accelerate
    %pip install transformers
    %pip install datasets
    %pip install evaluate
    %pip install peft
    %pip install trl
    %pip install evaluate
    %pip install scikit-learn
    %pip install wandb
    return

In [None]:
def is_running_in_colab():
    """Return True when the `google.colab` module is loaded in this kernel."""
    return "google.colab" in sys.modules


def is_running_in_kaggle():
    """Return True when the `KAGGLE_KERNEL_RUN_TYPE` environment variable is set.

    NOTE(review): this relies on Kaggle kernels exporting that variable; confirm
    against the Kaggle image you target.
    """
    return "KAGGLE_KERNEL_RUN_TYPE" in os.environ


# Only hosted runtimes get the automatic dependency installation; elsewhere the
# environment is expected to be prepared by the user.
if is_running_in_colab() or is_running_in_kaggle():
    print("Running on Colab/Kaggle")
    install_dependencies()
else:
    print("Not running in Colab/Kaggle")

In [None]:
import transformers

# Single seed for the notebook: besides the call below it is passed to the
# train/test split and embedded in the training run name further down.
seed = 22 # Please set your own favorite seed!

# Seed the random number generators through transformers' helper
transformers.set_seed(seed)

In [None]:
def download_data(
    repo_url="https://github.com/UjjayiniDas/LLM-NLPOR-Work.git",
    data_dir="Data",
):
    """Fetch the repository's ``Data`` folder into ``data_dir``.

    The repository is cloned into a temporary directory, its ``Data`` folder is
    moved to ``data_dir`` and the rest of the checkout is discarded.

    Args:
        repo_url: Git URL of the repository that contains the ``Data`` folder.
        data_dir: Destination path for the data; defaults to ``./Data``.

    Returns:
        None.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits with an error.
    """
    import os
    import shutil
    import subprocess
    import tempfile

    # Re-running the cell must not fail or nest a second copy inside the first.
    if os.path.isdir(data_dir):
        print(f"'{data_dir}' already exists; skipping download.")
        return

    with tempfile.TemporaryDirectory() as checkout_dir:
        # check=True surfaces a failed clone here instead of as a missing-file error later.
        subprocess.run(
            ["git", "clone", "--depth", "1", repo_url, checkout_dir],
            check=True,
        )
        shutil.move(os.path.join(checkout_dir, "Data"), data_dir)
    return

# Fetch the data folder now; the next cell reads "Data/Processed_for_LLM/Wave12_processed.csv".
download_data()

In [None]:
# Import pandas for data manipulation and analysis
# This library provides data structures like DataFrames that make working with structured data easy
import pandas as pd

# Load the processed wave file into a DataFrame
df_snap_w12 = pd.read_csv("Data/Processed_for_LLM/Wave12_processed.csv")

# Keep only the rows whose `imputed_snap` flag is False
df_snap_nonimp_w12 = df_snap_w12.loc[df_snap_w12["imputed_snap"].eq(False)]

# Drop the identifier, the imputation flags and the lower-case snap columns in one call
df_snap_nonimp_w12_final = df_snap_nonimp_w12.drop(
    columns=["unique_ID", "snap_wave1", "imputed_snap_wave1", "snap", "imputed_snap"]
)

In [None]:
# Preview the first five rows of the non-imputed frame to check which columns remain
df_snap_nonimp_w12_final.head(5)

In [None]:
# Input columns. After the prompt/completion pairs are built, these columns
# (plus the label) are removed from the dataset again.
# NOTE(review): presumably these are exactly the columns left in the frame after
# the drops above, apart from the label; confirm against the CSV.
features = [
    # Demographics
    "wave ID",
    "marital status",
    "sex",
    "hispanic origin",
    "race",
    "education",
    "citizenship",
    "employment",
    "state",
    "age",
    "income",
    "SNAP_wave1"
]
label = "SNAP" # this is the snap coverage in wave 2 (dependent) in this dataset
## No missing values
# NOTE(review): the "no missing values" statement is the author's; no cell in this notebook checks it.

In [None]:
# Splitting nonimputed dataset into train/test
#%pip install scikit-learn datasets
#import sklearn
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

df_snap_nonimp_w12_train, df_snap_nonimp_w12_test = train_test_split(
    df_snap_nonimp_w12_final,  # The processed survey dataframe to split into train and test sets
    test_size=0.3,        # Allocate 30% of the data to the test set
    random_state=seed,    # Set a random seed for reproducibility of the split
)


# Wrap both splits in one DatasetDict; the pandas index is not kept as a column
dataset_snap_w12 = DatasetDict({
    "train": Dataset.from_pandas(df_snap_nonimp_w12_train, preserve_index=False),  # Convert training dataframe to a Hugging Face Dataset
    "test": Dataset.from_pandas(df_snap_nonimp_w12_test, preserve_index=False),    # Convert test dataframe to a Hugging Face Dataset
})

In [None]:
# Model Selection
# id of a model hosted on Hugging Face
model_id = "Qwen/Qwen2.5-0.5B-Instruct"

# Loading tokenizer
from transformers import AutoTokenizer

# NOTE(review): trust_remote_code=True lets code from the model repository run locally;
# confirm it is actually required for this model before keeping it.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,  # The ID of the model to load the tokenizer for
    revision= "2b01de6d1108f9b2b5e46a726aa678a359b6c03b", # NOTE: pin the revision so the experiment is reproducible
    trust_remote_code=True,  # Allow the tokenizer to execute remote code from the model repository
)

In [None]:
# Prompt Design
# System prompt shared by every example. The pieces are adjacent string literals,
# so each one ends with a space to keep the sentences separated once concatenated.
instruction = (
    "You are an expert text classifier. Please perform a classification task. The Supplemental Nutrition Assistance Program (SNAP) provides food-purchasing assistance to low-income individuals and families in the U.S. "
    "You will be given a respondent’s survey answers from wave 1 of the 2014 Survey of Income and Program Participation (SIPP), along with their demographic information, income, and their reported SNAP coverage in wave 1. "
    "Based on this information, classify if the person has coverage for SNAP in wave 2. "
    "Return only one label: either 'Yes' or 'No', without any other text.\n"
)

# Maps each dataset column name to the human-readable label used in the user prompt.
# Every column that appears in a row (other than the label) needs an entry here,
# because the prompt builder indexes this dict directly.
column_name_map= {
    "wave ID":"Wave of SIPP",
    "marital status":"Marital Status",
    "sex":"Sex",
    "hispanic origin":"Hispanic Origin",
    "race":"Race",
    "education":"Educational Attainment",
    "citizenship":"Citizenship",
    "employment":"Employment Status",
    "state":"State of Residence",
    "age":"Age",
    "income":"Income",
    "SNAP_wave1":"Coverage for SNAP in Wave 1",
    "SNAP":"Coverage for SNAP"
}

In [None]:
# Prompt Completion Pair
def build_prompt_completion(
    row: dict,
    system_prompt: str = instruction,
) -> dict[str, list[dict]]:
    """Turn one tabular row into a chat-style prompt/completion pair.

    Args:
        row: Mapping of column name to value for a single example; it must
            contain the label column.
        system_prompt: Text of the system message; defaults to the
            module-level ``instruction`` as it was when this function was defined.

    Returns:
        A dict with two keys: ``"prompt"`` (a system message followed by a user
        message) and ``"completion"`` (a single assistant message whose content
        is the row's label value).

    Raises:
        KeyError: if a non-label column in ``row`` has no entry in
            ``column_name_map``, or if ``row`` lacks the label column.
    """
    # One "<readable name>: <value>" line per column, leaving the label out of the prompt
    user_prompt = "\n".join(
        [f"{column_name_map[k]}: {v}" for k, v in row.items() if k != label]
    )
    # The label value is used as-is for the assistant turn
    assistant_prompt = row[label]
    return {
        "prompt": [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": user_prompt,
            },
        ],
        "completion": [
            {
                "role": "assistant",
                "content": assistant_prompt,
            },
        ],
    }


# Show the pair built from the first training row as a sanity check
build_prompt_completion(
    row=dataset_snap_w12["train"][0],
)

In [None]:
# Build a prompt/completion pair for every row of both splits, then drop the
# original tabular columns so only the two chat fields remain.
dataset_snap_w12_llm = (
    dataset_snap_w12
    .map(build_prompt_completion)
    .remove_columns([*features, label])
)
dataset_snap_w12_llm

In [None]:
# Reducing the memory requirements of the model using quantization.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# load model in 4bit
model = AutoModelForCausalLM.from_pretrained(
    model_id,  # The ID of the model to load
    # Same repository commit as the tokenizer, so weights and tokenizer cannot drift apart
    revision="2b01de6d1108f9b2b5e46a726aa678a359b6c03b",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,  # Enable 4-bit quantization to reduce memory usage
    ),
    trust_remote_code=True,  # Allow execution of remote code from the model repository
    device_map="auto",  # Automatically determine whether to use CPU or GPU
)

# Overview of the model architecture
model

In [None]:
from trl import SFTConfig, SFTTrainer
from datetime import datetime
from peft import LoraConfig, TaskType

# key training hyperparameters
learning_rate = 2e-5  # Learning rate for optimizer - controls how quickly model parameters are updated
batch_size = 8        # Number of samples processed in each training batch
epochs = 1            # Number of complete passes through the training dataset

now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
dataset_name = "sipp 2014 snap"
run_name = f"{model_id}_{dataset_name}_seed_{seed}_{now}" # The run name. TODO choose your own run name


# key LoRA hyperparameters
lora_rank = 8
lora_alpha = 8

lora_config = LoraConfig(
    r=lora_rank,  # Rank of the low-rank matrices
    lora_alpha=lora_alpha,  # Scaling factor for the LoRA contribution
    lora_dropout=0.05,  # Dropout probability applied to the LoRA layers
    bias="none",  # Leave bias parameters untouched
    task_type=TaskType.CAUSAL_LM,  # Specify that we're fine-tuning a causal language model
    target_modules="all-linear",  # Apply LoRA to all linear layers in the model
)

training_args = SFTConfig(
    # training parameters
    per_device_train_batch_size=batch_size,  # number of samples per batch on each device during training
    per_device_eval_batch_size=batch_size,   # number of samples per batch on each device during evaluation
    num_train_epochs=epochs,                 # total number of training epochs
    learning_rate=learning_rate,             # use the declared learning rate rather than the library default
    seed=seed,                               # train with the notebook seed that the run name advertises

    # evaluation settings
    do_eval=True,                            # whether to run evaluation
    eval_strategy="steps",                   # when to run evaluation (after certain steps)
    eval_steps=1 / 3,                        # evaluate after each third of an epoch

    # logging configuration
    logging_steps=10,                        # log metrics every 10 steps
    report_to="none",                        # send logs to desired location
    run_name=run_name,                       # name of the run for tracking

    output_dir="./results",                  # directory to save model checkpoints (and logs)
)

trainer = SFTTrainer(
    model=model,                             # The pre-trained model to fine-tune
    train_dataset=dataset_snap_w12_llm["train"],      # Training dataset with prompts and completions
    eval_dataset=dataset_snap_w12_llm["test"],        # Evaluation dataset for testing model performance
    args=training_args,                     # Training configuration settings
    peft_config=lora_config                 # LoRA configuration for parameter-efficient fine-tuning

)

In [None]:
trainer.train() # run fine-tuning; evaluation runs at the intermediate steps configured in training_args

In [None]:
trainer.evaluate() # evaluate on the test split; this runs after training, so it measures the fine-tuned model, not a zero-shot baseline

In [None]:
# Keep handles to the fine-tuned model and to the tokenizer the trainer used
model_finetuned = trainer.model
# `Trainer.tokenizer` is deprecated; `processing_class` is its replacement
tokenizer_finetuned = trainer.processing_class

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [None]:
# Render a chat message list as tagged plain text that is easy to split later
def format_chat_prompt(prompt_list):
    """Flatten chat messages into one string of ``<|role|>`` tagged segments.

    Each message with role ``system``, ``user`` or ``assistant`` becomes
    ``"<|role|>\\n<content>\\n"``; messages with any other role are skipped.
    A trailing ``"<|assistant|>\\n"`` is always appended so the model continues
    with the assistant's answer.

    NOTE(review): these tags are written by hand rather than produced by the
    tokenizer's chat template; confirm this matches the format used in training.

    Args:
        prompt_list: Iterable of dicts, each with ``"role"`` and ``"content"`` keys.

    Returns:
        The formatted prompt string.
    """
    tagged_roles = ("system", "user", "assistant")
    segments = []
    for message in prompt_list:
        speaker, text = message["role"], message["content"]
        if speaker in tagged_roles:
            segments.append(f"<|{speaker}|>\n{text}\n")
    segments.append("<|assistant|>\n")  # cue for the model to generate the response
    return "".join(segments)

In [None]:
# Constructing batches for generating predictions
from torch.utils.data import DataLoader
# Inference batch size. It has its own name so the training `batch_size`
# hyperparameter is not overwritten if the training cell is re-run later.
batch_size_pred = 32

formatted_prompts = []
ground_truths = []
for example in dataset_snap_w12_llm["test"]:
    # example["prompt"] is a list of {"role": ..., "content": ...} messages
    formatted_prompts.append(format_chat_prompt(example["prompt"]))
    ground_truths.append(example["completion"])


# Batches are plain lists of prompt strings; order is preserved (no shuffling requested)
dataloader = DataLoader(
    formatted_prompts,
    batch_size=batch_size_pred
)

In [None]:
# Cast every parameter that is still float32 down to bfloat16, printing each one touched
model_dtype = next(model_finetuned.parameters()).dtype  # dtype of the first parameter, recorded before any cast
print(model_dtype)
for name, param in model_finetuned.named_parameters():
    if param.dtype != torch.float32:
        continue
    param.data = param.data.to(torch.bfloat16)
    print(name, param.dtype)

In [None]:
# Pad on the left so that, within a batch, every prompt ends at the last position
# and the generated tokens follow it directly.
tokenizer_finetuned.padding_side = "left"

In [None]:
from tqdm.auto import tqdm

# Store predictions (generated continuation only, one string per test prompt)
all_predictions = []

for batch_prompts in tqdm(dataloader, desc="Generating"):
    inputs = tokenizer_finetuned(
        batch_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024
    )

    # Move every tensor to the model's device and keep the integer dtypes the
    # tokenizer produced (the attention mask is not cast to a float dtype).
    inputs = {k: v.to(device=model_finetuned.device) for k, v in inputs.items()}
    # With padding, all prompts in the batch share this length
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        output_ids = model_finetuned.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer_finetuned.pad_token_id,
            eos_token_id=tokenizer_finetuned.eos_token_id
        )

    # generate() returns prompt + continuation. Decode only the continuation, so an
    # empty answer cannot be parsed as the 'Yes'/'No' wording of the instruction.
    generated_ids = output_ids[:, prompt_length:]
    decoded_outputs = tokenizer_finetuned.batch_decode(generated_ids, skip_special_tokens=True)
    all_predictions.extend([out.strip() for out in decoded_outputs])

In [None]:
# Cleaning predicted values
import pandas as pd
import re

# Reduce each raw prediction string to "Yes", "No" or "Unknown"
label_pattern = re.compile(r"\bYes\b|\bNo\b", re.IGNORECASE)
cleaned_labels = []

for pred in all_predictions:
    # Keep what follows the first "<|assistant|>\n" marker (the whole text when the
    # marker is absent), then cut everything from "#1" onwards
    answer_text = pred.split("<|assistant|>\n", 1)[-1].split("#1")[0].strip()

    # First whole-word yes/no, in any letter case, normalised to title case;
    # "Unknown" is the fallback when nothing parses
    hit = label_pattern.search(answer_text)
    cleaned_labels.append(hit.group(0).title() if hit else "Unknown")

In [None]:
# Gold labels of the test split, stripped and title-cased like the predictions
references = []
for example in dataset_snap_w12_llm["test"]:
    gold_text = example["completion"][0]["content"]
    references.append(gold_text.strip().title())

In [None]:
# Pair each cleaned prediction with its gold label, row by row in test-set order
df_cleaned_pred = pd.DataFrame({
    "Predicted_Label": cleaned_labels,
    "Ground_Truth": references
})

# Save without the index column.
# NOTE(review): writing .xlsx presumably needs an Excel engine such as openpyxl to be installed; confirm in the target environment.
df_cleaned_pred.to_excel("finetuned_nonimputed_predictions_w12.xlsx", index=False)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions that could not be parsed ("Unknown") are left out of the metrics.
# Report how many, so the scores are not read as covering the whole test set.
valid = [(p, t) for p, t in zip(cleaned_labels, references) if p in ["Yes", "No"]]
n_unparsed = sum(p not in ("Yes", "No") for p in cleaned_labels)
print(f"Scored {len(valid)} of {len(cleaned_labels)} predictions; {n_unparsed} unparseable prediction(s) excluded.")

if valid:
    y_pred, y_true = zip(*valid)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, pos_label="Yes")  # F1 of the "Yes" class
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
else:
    print("No valid predictions to evaluate.")

In [None]:
# Second test set: the rows whose `imputed_snap` flag is True
df_snap_imp_w12 = df_snap_w12.loc[df_snap_w12["imputed_snap"].eq(True)]
# Same column clean-up as for the non-imputed rows, in a single call
df_snap_imp_w12_final = df_snap_imp_w12.drop(
    columns=["unique_ID", "snap_wave1", "imputed_snap_wave1", "snap", "imputed_snap"]
)

# "train" reuses the non-imputed training frame; "test" holds the imputed rows
dataset_snap_imp_w12 = DatasetDict({
    "train": Dataset.from_pandas(df_snap_nonimp_w12_train, preserve_index=False),
    "test": Dataset.from_pandas(df_snap_imp_w12_final, preserve_index=False),
})

# Build prompt/completion pairs and drop the original tabular columns
dataset_snap_imp_w12_llm = (
    dataset_snap_imp_w12
    .map(build_prompt_completion)
    .remove_columns([*features, label])
)

In [None]:
# Constructing batches for generating predictions
#from torch.utils.data import DataLoader
batch_size_pred = 32

# One formatted prompt string and one stored completion per imputed test example
formatted_prompts_imp, imputed_sipp = [], []
for record in dataset_snap_imp_w12_llm["test"]:
    # record["prompt"] is a list of {"role": ..., "content": ...} messages
    formatted_prompts_imp.append(format_chat_prompt(record["prompt"]))
    imputed_sipp.append(record["completion"])


# Batches are plain lists of prompt strings, in dataset order
dataloader_imp = DataLoader(formatted_prompts_imp, batch_size=batch_size_pred)

In [None]:
# Cast every parameter that is still float32 to bfloat16, printing each one touched.
# NOTE(review): this repeats the conversion cell that precedes the first prediction loop;
# if that cell already ran on this `model_finetuned`, no float32 parameters remain
# and the loop below prints nothing.
import torch

# dtype of the model's first parameter, printed for reference
model_dtype = next(model_finetuned.parameters()).dtype
print(model_dtype)
for name, param in model_finetuned.named_parameters():
    if param.dtype == torch.float32:
        param.data = param.data.to(torch.bfloat16)
        print(name, param.dtype)

In [None]:
# Pad on the left so that, within a batch, every prompt ends at the last position
# and the generated tokens follow it directly.
tokenizer_finetuned.padding_side = "left"


In [None]:
from tqdm.auto import tqdm
# Store predictions (generated continuation only, one string per imputed test prompt)
imputed_predictions = []

for batch_prompts in tqdm(dataloader_imp, desc="Generating"):
    inputs_imp = tokenizer_finetuned(
        batch_prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1024
    )

    # Move every tensor to the model's device; dtypes stay as the tokenizer produced them
    inputs_imp = {k: v.to(device=model_finetuned.device) for k, v in inputs_imp.items()}
    # With padding, all prompts in the batch share this length
    prompt_length_imp = inputs_imp["input_ids"].shape[1]

    with torch.no_grad():
        output_imp_ids = model_finetuned.generate(
            **inputs_imp,
            max_new_tokens=100,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer_finetuned.pad_token_id,
            eos_token_id=tokenizer_finetuned.eos_token_id
        )

    # generate() returns prompt + continuation. Decode only the continuation, so an
    # empty answer cannot be parsed as the 'Yes'/'No' wording of the instruction.
    generated_imp_ids = output_imp_ids[:, prompt_length_imp:]
    decoded_outputs_imp = tokenizer_finetuned.batch_decode(generated_imp_ids, skip_special_tokens=True)
    imputed_predictions.extend([out.strip() for out in decoded_outputs_imp])

In [None]:
# Cleaning predicted values for imputed set
#import pandas as pd
import re

# Reduce each raw prediction on the imputed set to "Yes", "No" or "Unknown"
label_pattern_imp = re.compile(r"\bYes\b|\bNo\b", re.IGNORECASE)
cleaned_labels_imp = []

for pred in imputed_predictions:
    # Keep what follows the first "<|assistant|>\n" marker (the whole text when the
    # marker is absent), then cut everything from "#1" onwards
    answer_text_imp = pred.split("<|assistant|>\n", 1)[-1].split("#1")[0].strip()

    # First whole-word yes/no, in any letter case, normalised to title case;
    # "Unknown" is the fallback when nothing parses
    hit_imp = label_pattern_imp.search(answer_text_imp)
    cleaned_labels_imp.append(hit_imp.group(0).title() if hit_imp else "Unknown")

# Stored labels of the imputed rows, stripped and title-cased like the predictions
references_imp = []
for example in dataset_snap_imp_w12_llm["test"]:
    stored_text = example["completion"][0]["content"]
    references_imp.append(stored_text.strip().title())

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions that could not be parsed ("Unknown") are left out of the metrics.
# Report how many, so the scores are not read as covering every imputed case.
valid_imp = [(p, t) for p, t in zip(cleaned_labels_imp, references_imp) if p in ["Yes", "No"]]
n_unparsed_imp = sum(p not in ("Yes", "No") for p in cleaned_labels_imp)
print(f"Scored {len(valid_imp)} of {len(cleaned_labels_imp)} predictions; {n_unparsed_imp} unparseable prediction(s) excluded.")

if valid_imp:
    y_pred_imp, y_true_imp = zip(*valid_imp)
    # Agreement is measured against the imputed labels stored in the dataset
    acc_imp = accuracy_score(y_true_imp, y_pred_imp)
    f1_imp = f1_score(y_true_imp, y_pred_imp, pos_label="Yes")  # F1 of the "Yes" class
    print(f"Accuracy: {acc_imp:.4f}")
    print(f"F1 Score: {f1_imp:.4f}")
else:
    print("No valid predictions to evaluate.")

In [None]:
# Writing to Excel: pair each cleaned prediction with the stored (imputed) label,
# row by row in dataset order

df_imputed_pred = pd.DataFrame({
    "Predicted_Label": cleaned_labels_imp,
    "Imputed_Label": references_imp
})

# Save without the index column.
# NOTE(review): writing .xlsx presumably needs an Excel engine such as openpyxl to be installed; confirm in the target environment.
df_imputed_pred.to_excel("finetuned_imputed_predictions_w12.xlsx", index=False)