In [None]:
import os

# Unsloth asks to be imported before the libraries it patches (see the warning
# this cell emitted), so it goes ahead of transformers / sentence_transformers.
from unsloth import FastLanguageModel

import pandas as pd
import torch
import transformers
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
Unsloth: We'll be using `/tmp/unsloth_compiled_cache` for temporary Unsloth patches.
Standard import failed for UnslothBCOTrainer: No module named 'UnslothBCOTrainer'. Using tempfile instead!


In [None]:
# Provide Pinecone settings only when the environment does not already define
# them, so a real key exported in the shell is never overwritten by the
# placeholder below. Do not commit a real API key into this notebook.
os.environ.setdefault('PINECONE_API_KEY', "add-api-key")
os.environ.setdefault('PINECONE_ENVIRONMENT', "us-east-1")

In [None]:
# Configuration for the retriever and generator models

# --- Retriever: path of the fine-tuned sentence-embedding model ---
FINETUNED_RETRIEVER_PATH = 'output/finetuned-all-distilroberta-v1-2025-04-22_15-26-27'

# --- Generator: fine-tuned checkpoint and the options used when loading it ---
FINETUNED_GENERATOR_PATH = "llama3_sft_sfttrainer_MA/checkpoint-2500"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True

# --- Pinecone: key is read from the environment, with a placeholder fallback ---
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "YOUR_API_KEY")
PINECONE_INDEX_NAME = 'address-data-index'

# --- Retrieval: number of relevant addresses to fetch as context ---
TOP_K_RETRIEVED = 5

# --- Device: use CUDA when torch reports a GPU, otherwise fall back to CPU ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available. Using CUDA."
    if device.type == "cuda"
    else "GPU not available. Using CPU."
)

GPU is available. Using CUDA.


In [None]:
# Initializing Pinecone connection

print("\nInitializing Pinecone connection...")

pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Fail fast when the index is missing instead of continuing on to open it and
# report a successful connection.
if PINECONE_INDEX_NAME not in pinecone.list_indexes().names():
    raise RuntimeError(
        f"Error: Index '{PINECONE_INDEX_NAME}' does not exist in Pinecone."
    )

index = pinecone.Index(PINECONE_INDEX_NAME)
print(f"Connected to Pinecone index '{PINECONE_INDEX_NAME}'.")
# Print the index statistics as a quick sanity check of what we connected to.
print(index.describe_index_stats())


Initializing Pinecone connection...
Connected to Pinecone index 'address-data-index'.
{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}


In [None]:
# Load the fine-tuned retriever (SentenceTransformer) onto the selected device
# and report where it ended up.
retriever_model = SentenceTransformer(
    model_name_or_path=FINETUNED_RETRIEVER_PATH,
    device=str(device),
)
print(f"Retriever model loaded successfully to {retriever_model.device}.")

Retriever model loaded successfully to cuda:0.


In [None]:
# Load the generator LLM (Unsloth Llama 3.2) together with its tokenizer,
# using the checkpoint path and options from the configuration cell.
generator_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=FINETUNED_GENERATOR_PATH,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=LOAD_IN_4BIT,
    dtype=None,
)
print("Generator LLM loaded successfully.")

# Fall back to the EOS token for padding when the tokenizer defines no pad
# token (common for Llama models).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


Generator LLM loaded successfully.


In [None]:
# Enable Unsloth inference optimizations
# The return value is not used, so the call is presumably applied to
# generator_model in place — TODO confirm against the Unsloth docs.
FastLanguageModel.for_inference(generator_model)
print("Unsloth inference optimizations enabled.")

Unsloth inference optimizations enabled.


In [None]:

# Define Retrieval Function

def get_relevant_addresses(query_address: str, top_k: int = TOP_K_RETRIEVED) -> list:
    """Look up stored addresses that are most similar to ``query_address``.

    The query is embedded with ``retriever_model`` and sent to the Pinecone
    ``index``. For every returned match the id and score are printed, and the
    ``address_text`` metadata field is collected when present. The final list
    is printed as well.

    Args:
        query_address: Address text to embed and search for.
        top_k: Number of matches to request from the index.

    Returns:
        The ``address_text`` values of the matches that carry one, in the
        order the index returned them (empty when there are no matches).
    """
    # Embed the query; the result is converted to a plain list for the query call.
    embedding = retriever_model.encode(
        query_address,
        convert_to_tensor=False,
        device=str(device),
    )

    response = index.query(
        vector=embedding.tolist(),
        top_k=top_k,
        include_metadata=True,
    )

    # Treat a missing/empty match collection as "no hits".
    hits = response.matches or []

    # Every hit is reported, whether or not it carries address metadata.
    for hit in hits:
        print(f"  - Retrieved ID: {hit.id}, Score: {hit.score:.4f}")

    context_addresses = [
        hit.metadata['address_text']
        for hit in hits
        if hit.metadata and 'address_text' in hit.metadata
    ]

    print(f"Retrieved {len(context_addresses)} context addresses.")
    print(context_addresses)
    return context_addresses

In [None]:
# Define Prediction Function

# Prompt template for the generator model. predict_address fills in the
# {query_address} and {context_list} placeholders via str.format.
# NOTE(review): "onea single" below looks like a typo for "one single". The
# text is left untouched because it is runtime prompt text; if the generator
# was fine-tuned on this exact wording, editing it may change model output —
# confirm before fixing.
llama3_2_instruct_template = """

You are an address rewriting bot, please rewrite the following address according to standard address hierarchy and
related addresses to onea single correct address. Related addresses are possibly geographically close to the address to be rewritten.
If no rewrite is needed, output the original address.

Address to be rewritten: {query_address}

Address Hierarchy: [Number, Street, City, State, ZIP]

Examples: 6, Jade Street, Methuen, Essex County, Massachusetts, 01844

Related Address: {context_list}

The generated result should be one line containing only one corrected address as below
#Corrected Address: (generated address) #

"""


def predict_address(query_address: str, context_addresses: list) -> str:
    """Ask the generator model to rewrite ``query_address``.

    Builds a prompt from ``llama3_2_instruct_template`` using the query and a
    bulleted list of ``context_addresses`` ("None provided." when the list is
    empty), runs generation with sampling disabled, and decodes only the
    tokens produced after the prompt. The prompt and progress messages are
    printed along the way.

    Args:
        query_address: The address that should be rewritten.
        context_addresses: Related addresses to include in the prompt.

    Returns:
        The stripped, decoded model continuation. If generation or decoding
        raises, an "Error: Could not generate prediction (...)" string is
        returned instead of propagating the exception.
    """
    print("Constructing prompt and generating prediction...")

    # One "- address" bullet per context entry, or a fixed placeholder.
    context_str = (
        "\n".join(f"- {addr}" for addr in context_addresses)
        if context_addresses
        else "None provided."
    )

    prompt = llama3_2_instruct_template.format(
        context_list=context_str,
        query_address=query_address,
    )
    print(f'Prompt:{prompt}')

    # Tokenize and move the encoded prompt to the configured device.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generation parameters (adjust as needed); EOS doubles as the pad id.
    generation_params = dict(
        max_new_tokens=1000,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    try:
        print("Generating response...")
        outputs = generator_model.generate(**inputs, **generation_params)
        # Skip the prompt tokens so that only the continuation is decoded.
        prompt_length = inputs['input_ids'].shape[1]
        continuation_ids = outputs[0, prompt_length:]
        result = tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()
        print("Generation complete.")
        return result
    except Exception as e:
        print(f"Error during generation: {e}")
        return f"Error: Could not generate prediction ({e})"


In [None]:

if __name__ == "__main__":
    # Example query address to run through the retrieve-then-generate pipeline.
    query = "32, Clarkwood Street testing, Unit 3 KUHIUHBK, Mattapan, 02126"
    print(f"\n--- Running RAG for Query: '{query}' ---")

    # Step 1: retrieve related addresses; step 2: generate the rewrite from them.
    context = get_relevant_addresses(query)
    predicted_address = predict_address(query, context)

    # Report the outcome, one entry per line.
    print(
        "\n--- Prediction Result ---",
        f"Original Query: {query}",
        f"Predicted/Rewritten Address: {predicted_address}",
        "-------------------------",
        sep="\n",
    )


--- Running RAG for Query: '32, Clarkwood Street testing, Unit 3 KUHIUHBK, Mattapan, 02126' ---
  - Retrieved ID: 24428299, Score: 0.9584
  - Retrieved ID: 24563225, Score: 0.9577
  - Retrieved ID: 24464669, Score: 0.9572
  - Retrieved ID: 24548964, Score: 0.9571
  - Retrieved ID: 24429973, Score: 0.9567
Retrieved 5 context addresses.
['1000, Harvard Street, Unit 12, Mattapan, Suffolk County, Massachusetts, 02126', '40, Fairlawn Avenue, Unit c7, Mattapan, Suffolk County, Massachusetts, 02126', '219, Delhi Street, Unit 11, Mattapan, Suffolk County, Massachusetts, 02126', '49, Woolson Street, Unit 3, Mattapan, Suffolk County, Massachusetts, 02126', '32, Clarkwood Street, Unit 3, Mattapan, Suffolk County, Massachusetts, 02126']
Constructing prompt and generating prediction...
Prompt:

You are an address rewriting bot, please rewrite the following address according to standard address hierarchy and
related addresses to onea single correct address. Related addresses are possibly geographic