In [1]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os
os.chdir('/content/drive/MyDrive/genaiproj/AddressCorrector')

# Reading data

In [4]:
import pandas as pd

# Load the Parquet file
df = pd.read_parquet("data/address_with_instructions.parquet")
df.head()


Unnamed: 0,OID_,AddNum_Pre,Add_Number,AddNum_Suf,AddNo_Full,St_PreMod,St_PreDir,St_PreTyp,St_PreSep,St_Name,...,SecondaryAddress,CityStateZip,FullAddress,FormattedFullAddress,task1_instruction,task1_groundtruth,task2_instruction,task2_groundtruth,noise_level,variant_idx
0,62241576,,1187.0,,1187.0,,north,,,pownal,...,,"pownal, vt, 5260","1187 north pownal road\n\npownal, vt, 5260","1187, north Pownal Road, Bennington County, Ve...",Parse the following address into a structured ...,"{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1187""...","Fix the formatting, structure, correct any exi...","{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1187""...",medium,0
1,19781905,,1124.0,,1124.0,,,,,judah bear,...,,"richmond, ky, 40475","1124 judah bear boulevard\n\nrichmond, ky, 40475","1124, Judah Bear Boulevard, Richmond, Madison ...",Parse the following address into a structured ...,"{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1124""...","Fix the formatting, structure, correct any exi...","{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1124""...",extreme,2
2,21790301,,27.0,,27.0,,,,,captain bellamy,...,,"ma, 2632","27 captain bellamy lane\n\nma, 2632","27, Captain Bellamy Lane, Centerville, Barnsta...",Parse the following address into a structured ...,"{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""27"",\...","Fix the formatting, structure, correct any exi...","{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""27"",\...",medium,0
3,29482843,,1676.0,,1676.0,,,,,englewood,...,,"mn, 55104-1113","1676 englewood avenue\n\nmn, 55104-1113","1676, Englewood Avenue, Ramsey County, Minneso...",Parse the following address into a structured ...,"{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1676""...","Fix the formatting, structure, correct any exi...","{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""1676""...",extreme,2
4,74441520,,3820.0,,3820.0,,,,,duckhorn,...,"apt 2104, apt 2104","sacramento, ca, 95834-1363","3820 duckhorn drive\napt 2104, apt 2104\nsacra...","3820, Duckhorn Drive, Unit apt 2104, Sacrament...",Parse the following address into a structured ...,"{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""3820""...","Fix the formatting, structure, correct any exi...","{\n ""AddNum_Pre"": """",\n ""Add_Number"": ""3820""...",medium,0


# SFT

 Step 1: Prepare Multi-Instruction Dataset

In [5]:
from datasets import load_dataset, Dataset

In [6]:
# Step 1: Create base instruction-output pairs
df_task1 = pd.DataFrame({
    "instruction": df["task1_instruction"],
    "output": "System: " + df["task1_groundtruth"].astype(str)
})

df_task2 = pd.DataFrame({
    "instruction": df["task2_instruction"],
    "output": "System: " + df["task2_groundtruth"].astype(str)
})

combined = pd.concat([df_task1, df_task2], ignore_index=True)

df_sample = combined.sample(n=20_000, random_state=42).reset_index(drop=True)

# Step 2: Convert to HF Dataset
dataset = Dataset.from_pandas(df_sample)

# Step 3: Convert to LLaMA-style messages
def format_llama_chat(example):
    return {
        "messages": [
            {"role": "user", "content": example["instruction"]},
            {"role": "assistant", "content": example["output"]}
        ]
    }

dataset = dataset.map(format_llama_chat)


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [7]:
dataset[0]

{'instruction': 'Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON\nAddress JSON: {\n  "AddNum_Pre": "",\n  "Add_Number": "386",\n  "AddNum_Suf": "",\n  "St_PreDir": "",\n  "St_Name": "barnaby",\n  "St_PosTyp": "street",\n  "St_PosDir": "southeast",\n  "Building": "",\n  "Floor": "",\n  "Unit": "",\n  "Room": "",\n  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",\n  "Inc_Muni": "bdtgh6fgtl4n",\n  "County": "qisprr8tt tog ptlpt6s3",\n  "State": "dc",\n  "Zip_Code": "20302"\n}',
 'output': 'System: {\n  "AddNum_Pre": "",\n  "Add_Number": "836",\n  "AddNum_Suf": "",\n  "St_PreDir": "",\n  "St_Name": "barnaby",\n  "St_PosTyp": "

In [8]:
print(dataset[0]["messages"][0]["content"])


Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "386",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",
  "Inc_Muni": "bdtgh6fgtl4n",
  "County": "qisprr8tt tog ptlpt6s3",
  "State": "dc",
  "Zip_Code": "20302"
}


In [13]:
from unsloth.chat_templates import get_chat_template

# Patch tokenizer with the correct template
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")

# Apply tokenizer formatting to build final text strings
def format_to_text(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,  # True if you want the model to generate
        )
    }

# Run it over your full dataset
dataset = dataset.map(format_to_text)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
print(dataset[0]["text"])


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "386",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",
  "Inc_Muni": "bdtgh6fgtl4n",
  "County": "qisprr8tt tog ptlpt6s3",
  "State": "dc",
  "Zip_Code": "20302"
}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

System: {
  "AddNum_Pre": "",
  "Add_Number": "836",
  "AddNum_Suf": "",

In [15]:
# ✅ Imports
import torch
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer

# ✅ Define chunking (still future-proof)
chunk_size = 100000
chunks = [dataset.select(range(i, min(i + chunk_size, len(dataset)))) for i in range(0, len(dataset), chunk_size)]

# ✅ Load model + tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# ✅ Add LoRA adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0.0,
    bias = "none",
    use_rslora = False,
    use_gradient_checkpointing = "unsloth",
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# ✅ Prepare model for training
model = FastLanguageModel.for_training(model, use_gradient_checkpointing=True)

# ✅ Training arguments
args = TrainingArguments(
    output_dir = "models/llama3_sft_sfttrainer",
    logging_dir = "outputs/logs",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 2,
    num_train_epochs = 1,
    logging_steps = 10,
    save_strategy = "epoch",
    save_total_limit = 2,
    learning_rate = 2e-4,
    fp16 = True,
    bf16 = False,
    optim = "adamw_8bit",
    report_to = "none",
    lr_scheduler_type = "linear",
    seed = 3407,
)

# ✅ Training loop using SFTTrainer
for i, chunk in enumerate(chunks):
    print(f"\n🚀 Training chunk {i+1}/{len(chunks)}...")

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = chunk,
        dataset_text_field = "text",          # <- Correct field for LLaMA 3
        max_seq_length = 2048,
        packing = False,
        args = args,
    )

    trainer.train()

print("\n✅ All chunks trained with SFTTrainer!")

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

🚀 Training chunk 1/1...


Unsloth: Tokenizing ["text"] (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1 | Total steps = 2,500
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.6529
20,0.5165
30,0.4127
40,0.3828
50,0.3979
60,0.3496
70,0.3394
80,0.34
90,0.2931
100,0.3138



✅ All chunks trained with SFTTrainer!


Inference

In [17]:
# same session inference - for parsing(task 1)
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template

# Re-apply tokenizer template if not already done
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")

# Task 1 Prompt
messages = [
    {
        "role": "user",
        "content": '''Parse the following address into a structured JSON with these fields: AddNum_Pre, Add_Number, AddNum_Suf, St_PreDir, St_Name, St_PosTyp, St_PosDir, Building, Floor, Unit, Room, Uninc_Comm, Inc_Muni, County, State, Zip_Code.
Address: 13th Street 47 W 13th St, New York, NY 10011, USA'''
    }
]

# Format prompt
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

_ = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)


System: {
  "AddNum_Pre": "",
  "Add_Number": "13th",
  "AddNum_Suf": "47 w 13th st",
  "St_PreDir": "",
  "St_Name": "th",
  "St_PosTyp": "street",
  "St_PosDir": "",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "new york",
  "Inc_Muni": "new york",
  "County": "dutchess",
  "State": "ny",
  "Zip_Code": "10011"
}


In [16]:
# same session inference - for entity generation and rewriting(task 2)

from unsloth.chat_templates import get_chat_template
from transformers import TextStreamer

# ✅ Patch tokenizer for LLaMA 3 chat format
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")

# ✅ Define your instruction or address JSON
messages = [
    {
        "role": "user",
        "content": '''Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "386",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",
  "Inc_Muni": "bdtgh6fgtl4n",
  "County": "qisprr8tt tog ptlpt6s3",
  "State": "dc",
  "Zip_Code": "20302"
}'''
    }
]

# ✅ Apply chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # important for inference
)

# ✅ Tokenize and move to device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# ✅ Stream response
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# ✅ Generate
_ = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)


System: {
  "AddNum_Pre": "",
  "Add_Number": "638",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "travertine heights",
  "Inc_Muni": "washington",
  "County": "district of columbia",
  "State": "dc",
  "Zip_Code": "20023"
}


20023 is the actual pincode in DC. The input that was actually given 20302 is not a valid pincode. The model learned it during training and corrected it

Inference on new runtime

In [None]:
# Load the fine tuned model and tokenizer
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch

# Path to your checkpoint (replace if different)
checkpoint_path = "models/llama3_sft_sfttrainer/checkpoint-2500"

# Load fine-tuned model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = checkpoint_path,
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
    device_map = "auto"
)

model.eval()


In [None]:
# formatting the inference prompt

from unsloth.chat_templates import get_chat_template

# Apply correct chat formatting for inference
tokenizer = get_chat_template(tokenizer, chat_template="llama-3")

# Example: Instruction-only chat (no assistant content)
messages = [
    {"role": "user", "content": '''Fix the formatting, structure, correct any existing entities, or predict/add new values to the appropriate entities of this Address JSON. Expand common abbreviations (like st→street, ave→avenue), correct obvious errors (like leading zeros in numbers), generate new values to the appropriate entities, and standardize capitalization. Keep empty fields as empty strings. Do not return anything other than corrected Address JSON
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "386",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",
  "Inc_Muni": "bdtgh6fgtl4n",
  "County": "qisprr8tt tog ptlpt6s3",
  "State": "dc",
  "Zip_Code": "20302"
}'''}
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True  # important for inference
)


In [None]:
# generate output
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

_ = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    use_cache=True
)


In [None]:
from transformers import AutoTokenizer
from unsloth import FastLanguageModel
from peft import PeftModel
import torch

# === Load Base Model ===
base_model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = 2048,
    dtype = torch.float16,
    load_in_4bit = True,
)

# === Load LoRA Weights ===
lora_path = "models/llama3_sft_sfttrainer/checkpoint-2500"
model = PeftModel.from_pretrained(model, lora_path)

model.eval()
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# === Build Inference Prompt ===
prompt = """Fix the formatting, correct any errors, and predict missing values for this address JSON:
Address JSON: {
  "AddNum_Pre": "",
  "Add_Number": "386",
  "AddNum_Suf": "",
  "St_PreDir": "",
  "St_Name": "barnaby",
  "St_PosTyp": "street",
  "St_PosDir": "southeast",
  "Building": "",
  "Floor": "",
  "Unit": "",
  "Room": "",
  "Uninc_Comm": "trvsnv3ngk9tj hwmueu7s",
  "Inc_Muni": "bdtgh6fgtl4n",
  "County": "qisprr8tt tog ptlpt6s3",
  "State": "dc",
  "Zip_Code": "20302"
}
System:"""  # Important: ends with System: to cue generation

# === Tokenize & Generate ===
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        do_sample=False,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )

# === Decode Output ===
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
response = decoded.split("System:")[-1].strip()

print("\n📦 Model Output:\n", response)


In [5]:
# run only for new sessions

from unsloth import FastLanguageModel
from transformers import AutoTokenizer, pipeline
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = 512,
    dtype = torch.float16,
    load_in_4bit = True,
)

# Re-attach your trained LoRA adapter
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none",
    use_rslora=False,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

# Load weights (from latest checkpoint if needed)
model.load_adapter("models/llama3_sft_sfttrainer/checkpoint-5000")  # Or your specific folder

model.eval()


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothOnlineDPOTrainer: No module named 'UnslothOnlineDPOTrainer'. Using tempfile instead!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 0 MLP layers.


TypeError: PeftModel.load_adapter() missing 1 required positional argument: 'adapter_name'

In [37]:
from transformers import GenerationConfig

def infer_address(prompt: str, model, tokenizer, max_new_tokens=256):
    import torch

    # Ensure prompt ends correctly
    if not prompt.strip().endswith("System:"):
        prompt += "\nSystem:"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode and trim
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    parsed_output = response.split("System:")[-1].strip()

    print("🧠 Prompt:\n", prompt)
    print("\n📦 Model Output:\n", parsed_output)
    return parsed_output



In [38]:
prompt = """Parse the following address into a structured JSON with these fields: AddNum_Pre, Add_Number, AddNum_Suf, St_PreDir, St_Name, St_PosTyp, St_PosDir, Building, Floor, Unit, Room, Uninc_Comm, Inc_Muni, County, State, Zip_Code.
Address: 1187, north Pownal Road, Bennington County, Vermont, 05260"""

infer_address(prompt, model, tokenizer)


🧠 Prompt:
 Parse the following address into a structured JSON with these fields: AddNum_Pre, Add_Number, AddNum_Suf, St_PreDir, St_Name, St_PosTyp, St_PosDir, Building, Floor, Unit, Room, Uninc_Comm, Inc_Muni, County, State, Zip_Code.
Address: 1187, north Pownal Road, Bennington County, Vermont, 05260
System:

📦 Model Output:
 Bennington County, Vermont
Address Type: street
Address Number Pre: 1175
Address Number Suf: 0
Street Previous To: 1175
Street Name: pownal road
Street Position Typical: road
Building: 1187
Floor: 1
Unit: unit 101
Room: 101
Enclosed Space: building
Bennington County, Vermont
Bennington County is located in central Vermont. It was formed from several counties and has been known as Bennington County since its creation in 1792. The county is situated in the heart of New England and is home to many colleges and universities, including Bennington College and Middlebury


'Bennington County, Vermont\nAddress Type: street\nAddress Number Pre: 1175\nAddress Number Suf: 0\nStreet Previous To: 1175\nStreet Name: pownal road\nStreet Position Typical: road\nBuilding: 1187\nFloor: 1\nUnit: unit 101\nRoom: 101\nEnclosed Space: building\nBennington County, Vermont\nBennington County is located in central Vermont. It was formed from several counties and has been known as Bennington County since its creation in 1792. The county is situated in the heart of New England and is home to many colleges and universities, including Bennington College and Middlebury'

remove

In [None]:
# US AddrLLM: Supervised Fine-Tuning

# 📁 Step 2: Load and Explore NAD Dataset
import pandas as pd

df = pd.read_csv("data/sample_NAD.csv")
df.head()

# 📐 Step 3: Preprocess into Instruction Format for SFT (Three Tasks)

sft_parse, sft_predict, sft_rewrite = [], [], []

for _, row in df.iterrows():
    number = str(row.get("AddNo_Full", ""))
    street = row.get("StNam_Full", "")
    city = row.get("Post_City", "")
    state = row.get("State", "")
    zip_code = str(row.get("Zip_Code", ""))
    hierarchy = "[Number, Street, City, State, ZIP]"

    # Raw address (simulate slightly noisy input)
    raw_address = f"{number} {row.get('St_Name', '')}, {city}, {state}"
    noisy_input = raw_address.replace("St", "Street").replace("Rd", "Road")
    parsed_output = f"{{'Number': '{number}', 'Street': '{street}', 'City': '{city}', 'State': '{state}', 'ZIP': '{zip_code}'}}"

    # Task 1: Address Parsing
    sft_parse.append({
        "instruction": f"You are an address parsing bot, please parse the following address according to standard address hierarchy:\nAddress: {raw_address}\nAddress Hierarchy: {hierarchy}",
        "input": "",
        "output": f"System: {parsed_output}"
    })

    # Task 2: Address Entity Prediction
    partial_input = f"{number}, {state}"
    missing_fields = f"Street: {street}, City: {city}, ZIP: {zip_code}"
    sft_predict.append({
        "instruction": f"You are an Address Entity Prediction bot, please predict missing address entity in the following address:\nAddress: {partial_input}\nAddress Hierarchy: {hierarchy}",
        "input": "",
        "output": f"System: {missing_fields}"
    })

    # Task 3: Address Rewriting
    corrected_output = f"{number} {street}, {city}, {state} {zip_code}"
    rewriting_examples = "- 123 Main St → 123 Main Street\n- 45 W Elm Rd → 45 West Elm Road"
    sft_rewrite.append({
        "instruction": f"You are an address rewriting bot, please rewrite the following address according to standard address hierarchy:\nAddress: {noisy_input}\nAddress Hierarchy: {hierarchy}\nExamples:\n{rewriting_examples}",
        "input": "",
        "output": f"System: {corrected_output}"
    })


In [None]:
# 🧠 Save Training JSONs
import json
with open("outputs/sft_parse.json", "w") as f: json.dump(sft_parse, f, indent=2)
with open("outputs/sft_predict.json", "w") as f: json.dump(sft_predict, f, indent=2)
with open("outputs/sft_rewrite.json", "w") as f: json.dump(sft_rewrite, f, indent=2)

# 🔁 Step 4: Fine-Tune on All Tasks with LoRA
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

def load_json_dataset(path):
    with open(path) as f:
        data = json.load(f)
    return Dataset.from_list(data)

datasets_combined = Dataset.from_list(sft_parse + sft_predict + sft_rewrite)

model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = get_peft_model(model, lora_config)

# Tokenize
def tokenize(example):
    return tokenizer(
        example["instruction"],
        text_target=example["output"],
        truncation=True, padding="max_length", max_length=512
    )

tokenized_dataset = datasets_combined.map(tokenize)

# Training args
training_args = TrainingArguments(
    output_dir="models/addrllm-sft",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    do_eval=False,
   # evaluation_strategy="no",
    logging_dir="outputs/logs",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

# ✅ Your fine-tuned (LoRA) model will be saved in '/content/drive/MyDrive/genaiproj/AddressCorrector/models/addrllm-sft'


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvaishnavee-sm[0m ([33mvaishnavee-sm-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,0.0
1000,0.0


TrainOutput(global_step=1125, training_loss=0.0, metrics={'train_runtime': 195.3378, 'train_samples_per_second': 46.074, 'train_steps_per_second': 5.759, 'total_flos': 1682527223808000.0, 'train_loss': 0.0, 'epoch': 3.0})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load model from last checkpoint
model_path = "models/addrllm-sft/checkpoint-1125"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.eval()

# 🔍 Sample prompt (Parsing)
prompt = """
You are an address parsing bot, please parse the following address according to standard address hierarchy:
Address: 1234 W Main St, Springfield, IL
Address Hierarchy: [Number, Street, City, State, ZIP]
"""

# Generate response
inputs = tokenizer(prompt.strip(), return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("🔁 Model Output:")
print(response)


🔁 Model Output:
1234 W Main St, Springfield, IL


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load your fine-tuned checkpoint
model_path = "models/addrllm-sft/checkpoint-1125"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
model.eval()

# 🔍 Prompt for Address Entity Prediction
prompt = """
You are an address rewriting bot, please rewrite the following address according to standard address hierarchy:
Address: 456 E Elm St, Chicago, IL
Address Hierarchy: [Number, Street, City, State, ZIP]
Examples:
- 123 Main St → 123 Main Street
- 45 W Elm Rd → 45 West Elm Road
System:
"""


# Run inference
inputs = tokenizer(prompt.strip(), return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=100)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("🔁 Model Output:")
print(response)

🔁 Model Output:
[Number, Street, City, State, ZIP]


In [None]:
import os
print(os.listdir("models/addrllm-sft"))


['checkpoint-750', 'checkpoint-1125']


# RAG

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Build address text corpus
address_texts = df.apply(lambda r: f"{r['AddNo_Full']} {r['StNam_Full']}, {r['Post_City']}, {r['State']} {r['Zip_Code']}", axis=1)
address_states = df['State'].tolist()
address_zips = df['Zip_Code'].astype(str).tolist()

# Load embedding model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
address_embeddings = embed_model.encode(address_texts.tolist(), show_progress_bar=True)

# Build FAISS index
index = faiss.IndexFlatL2(address_embeddings.shape[1])
index.add(np.array(address_embeddings))

# Define advanced RAG function

def retrieve_similar_addresses(query, query_state=None, query_zip=None, top_k=5):
    query_embedding = embed_model.encode([query])
    D, I = index.search(query_embedding, k=top_k * 2)

    results = []
    for idx in I[0]:
        state_match = query_state is None or address_states[idx] == query_state
        zip_match = query_zip is None or address_zips[idx] == query_zip
        if state_match or zip_match:
            results.append(address_texts[idx])
        if len(results) == top_k:
            break
    return results

# 🔍 Example query with filtering
query = "1234 W Elm Rd, IL"
retrieved = retrieve_similar_addresses(query, query_state="IL")

# Construct structured RAG-style prompt
prompt = f"""
You are an address rewriting bot, please rewrite the following address according to standard address hierarchy:
Address: {query}
Address Hierarchy: [Number, Street, City, State, ZIP]
Examples:
- {retrieved[0]}
- {retrieved[1]}
- {retrieved[2] if len(retrieved) > 2 else ''}
System:
"""

print("\n📥 Final Prompt for SFT Model:")
print(prompt)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]


📥 Final Prompt for SFT Model:

You are an address rewriting bot, please rewrite the following address according to standard address hierarchy:
Address: 1234 W Elm Rd, IL
Address Hierarchy: [Number, Street, City, State, ZIP]
Examples:
- 225.0 West JACKSON Street, Elmhurst, IL 60126.0
- 4019.0 North 2125TH Street, Oblong, IL 62449.0
- 33373.0 HALU Avenue, SOLDOTNA, AK 99669.0
System:

