In [1]:
# Imports
import re
import os
import numpy as np
import pandas as pd
import json
from google import genai
from google.genai import types

In [None]:
# Configuration
# Gemini client; the API key is looked up in the GEMINI_API_KEY environment variable.
gemini_api_key = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=gemini_api_key)

# Model identifier and run parameters.
model_name = "gemini-3-flash-preview"
chunk_size = 10
max_iterations = 10

# Master catalog CSV and the columns read from it.
master_file_path = "/home/wrishav/Desktop/Workspace/SKU-Retail/redundant/data_master.csv"
m_columns = [
    'itemcode', 'catcode', 'category', 'subcat', 'ssubcat', 'company', 'mbrand', 'brand',
    'sku', 'packtype', 'base_pack', 'flavor', 'color', 'wght', 'uom', 'mrp',
]

# Transaction CSV and the columns read from it.
transaction_file_path = "/home/wrishav/Desktop/Workspace/SKU-Retail/redundant/data_transaction.csv"
t_columns = [
    'CATEGORY', 'MANUFACTURE', 'BRAND', 'ITEMDESC', 'MRP', 'PACKSIZE', 'PACKTYPE',
]

In [3]:
# Data loader
def _load_indexed_rows(csv_path, columns):
    """Read the selected columns of a CSV and return ``(frame, rows keyed by index label)``."""
    frame = pd.read_csv(csv_path, usecols=columns)
    return frame, frame.to_dict(orient='index')


# Each call reads the file first and converts it second, master before transaction.
master_file, master_dict = _load_indexed_rows(master_file_path, m_columns)
transaction_file, transaction_dict = _load_indexed_rows(transaction_file_path, t_columns)

In [4]:
# Format data for prompt generation
def format_master(master_row):
    """Render one master-catalog row as a labelled, one-field-per-line text block.

    Args:
        master_row: Mapping that provides every row key listed in ``labelled_keys``.

    Returns:
        A string that opens with a newline, lists each ``Label: value`` pair on
        its own line, and ends with four spaces after the final newline.

    Raises:
        KeyError: If ``master_row`` is a dict lacking one of the expected keys.
    """
    # (prompt label, row key) pairs, in the order they appear in the prompt.
    labelled_keys = (
        ("Item Code", "itemcode"),
        ("Category Code", "catcode"),
        ("Category", "category"),
        ("Subcategory", "subcat"),
        ("Sub-Subcategory", "ssubcat"),
        ("Company", "company"),
        ("Main Brand", "mbrand"),
        ("Brand", "brand"),
        ("Pack Type", "packtype"),
        ("Pack Size", "base_pack"),
        ("Flavor", "flavor"),
        ("Color", "color"),
        ("Unit of Measure", "uom"),
        ("MRP", "mrp"),
    )
    rendered = [f"{label}: {master_row[key]}\n" for label, key in labelled_keys]
    return "\n" + "".join(rendered) + "    "

def _split_pack_size(raw_pack_size):
    """Split a pack-size value such as ``"200 ML"`` into ``(size, unit)`` strings.

    Missing values (``None`` or float NaN) give ``("", "")``. Text that does not
    start with a number is returned whole as the size with an empty unit, so no
    information is dropped from the prompt.
    """
    # NaN is the only float that is not equal to itself.
    if raw_pack_size is None or (isinstance(raw_pack_size, float) and raw_pack_size != raw_pack_size):
        return "", ""
    # str() also covers columns that were parsed as plain numbers.
    text = str(raw_pack_size).strip()
    # Optional decimal part so that "1.5 L" yields ("1.5", "L") rather than ("1", ".5 L").
    match = re.match(r'(\d+(?:\.\d+)?)\s*(.*)', text)
    if match is None:
        return text, ""
    return match.group(1), match.group(2)


def format_transaction(transaction_row):
    """Render one transaction row as a labelled, one-field-per-line text block.

    The ``PACKSIZE`` value is split into a numeric pack size and a unit of
    measure. The split is done once, outside the f-string, which also keeps the
    function importable on Python versions that reject backslashes inside
    f-string expressions.

    Args:
        transaction_row: Mapping with the keys ``CATEGORY``, ``MANUFACTURE``,
            ``BRAND``, ``ITEMDESC``, ``MRP``, ``PACKSIZE`` and ``PACKTYPE``.

    Returns:
        A string that opens with a newline, lists each ``Label: value`` pair on
        its own line, and ends with four spaces after the final newline.

    Raises:
        KeyError: If ``transaction_row`` is a dict lacking one of the expected keys.
    """
    pack_size, unit_of_measure = _split_pack_size(transaction_row['PACKSIZE'])
    return f"""
Category Code: {transaction_row['CATEGORY']}
Company: {transaction_row['MANUFACTURE']}
Brand: {transaction_row['BRAND']}
Item Description: {transaction_row['ITEMDESC']}
MRP: {transaction_row['MRP']}
Pack Size: {pack_size}
Unit of Measure: {unit_of_measure}
Pack Type: {transaction_row['PACKTYPE']}
    """

# Format context data for LLM prompt
def format_context_matches(master_dictionary) -> str:
    """Build the catalog section of the prompt, one labelled entry per master row.

    Each entry is ``Context item <key>:`` followed by the ``format_master``
    rendering of that row; the entries are joined with newlines.
    """
    return "\n".join(
        f"Context item {row_key}:{format_master(row)}"
        for row_key, row in master_dictionary.items()
    )

# Format query data for LLM prompt
def format_query_match(transaction_dictionary) -> str:
    """Build the query section of the prompt, one labelled entry per transaction row.

    Each entry is ``Transaction item <key>:`` followed by the
    ``format_transaction`` rendering of that row; the entries are joined with
    newlines.
    """
    return "\n".join(
        f"Transaction item {row_key}:{format_transaction(row)}"
        for row_key, row in transaction_dictionary.items()
    )

In [5]:
# Generate prompt
def generate_prompt(context_rows, transaction_row):
    """Build the matching prompt for one chunk of catalog rows and one query.

    Args:
        context_rows: Mapping of index -> master row, rendered with
            ``format_context_matches``.
        transaction_row: Mapping of index -> transaction row, rendered with
            ``format_query_match``.

    Returns:
        The full prompt text. The response-format example it contains is valid
        JSON in shape (no trailing comma) and shows ``score`` as a bare number,
        so the model is not shown a malformed template while being asked for
        strict JSON.
    """
    return f"""
You are an expert product matching AI.
You have been given a transaction item and a set of context items from the master catalog.
Determine which context item best matches the transaction item.

Here are the context items from the master catalog:
\n{format_context_matches(context_rows)}

Here is the transaction item to match:
\n{format_query_match(transaction_row)}

Matching criteria (Priority Order):
1. Category code alignment
2. EXACT company match
3. EXACT brand match
4. EXACT pack size and pack type match
5. MRP/price similarity

Instructions:
1. Compare transaction item with each context item carefully
2. Prioritize exact matches in category code, company, brand, pack size, and pack type
3. Consider MRP similarity as a secondary factor

Based on the attributes provided, identify the best matching context item for the transaction item.
Respond strictly with a JSON object in the following format:
{{
  "context_item": "<The context item number, for example, 0 if 'Context item 0'>",
  "score": <The confidence score as a number between 0 and 1, with 1 being a perfect match>
}}
  """

In [6]:
# Call LLM
def call_llm(prompt):
    """Send ``prompt`` to the configured model and return the SDK response object.

    Uses the module-level ``client`` and ``model_name``; no generation config
    is passed.
    """
    return client.models.generate_content(model=model_name, contents=prompt)

In [7]:
# Match transaction row 0 against the master catalog, one chunk of rows per LLM call.
chunksize = chunk_size  # reuse the configured value instead of a second hard-coded 10
context_len = len(master_dict)
responses = []

# The query is identical for every chunk, so build it once outside the loop.
transaction_row = {0: transaction_dict[0]}

for start in range(0, context_len, chunksize):
    end = min(start + chunksize, context_len)
    print(f"Processing {start} to {end-1}\nChunk start index: {start}\n")
    # Keys are the global row indices, so the "context_item" reported by the
    # model can be looked up directly in master_dict.
    context_rows = {i: master_dict[i] for i in range(start, end)}
    prompt = generate_prompt(context_rows, transaction_row)
    response = call_llm(prompt)
    responses.append(response)
    print(f"LLM Response:\n{response}\n")

Processing 0 to 9
Chunk start index: 0

LLM Response:
sdk_http_response=HttpResponse(
  headers=<dict len=11>
) candidates=[Candidate(
  content=Content(
    parts=[
      Part(
        text="""```json
{
  "context_item": "8",
  "score": 0.7
}
```""",
        thought_signature=b'\x12\xa3\xf3\x01\n\x9f\xf3\x01\x01r\xc8\xda|\x93\xaf\\\xba\xf2\xf2\x01+!\xcf\x1d\xa1\xdc\xc6o\x15\x9a\xbe9\xa9hUv\xd9\xea\xfd\xcf\xa2\'7\x04Y\xffa\xc0\x02\xcc\xdd\x89/\xa9pL<\xc8\x9394g\xd3\x1b\xf9\xff\xfb\x0b\xa2:\x83\xc5B\x0ba\x03\x89\xaf\xfdp\xfcoQ?}\x87\xb0"\x05\x91\xccJ\xc6EC\xfb ...'
      ),
    ],
    role='model'
  ),
  finish_reason=<FinishReason.STOP: 'STOP'>,
  index=0
)] create_time=None model_version='gemini-3-flash-preview' prompt_feedback=None response_id='W6hkadHhDIiqjuMP3p3o-AI' usage_metadata=GenerateContentResponseUsageMetadata(
  candidates_token_count=27,
  prompt_token_count=1416,
  prompt_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_co

In [8]:
# Inspect the raw response objects collected in `responses`.
responses

[GenerateContentResponse(
   automatic_function_calling_history=[],
   candidates=[
     Candidate(
       content=Content(
         parts=[
           Part(
             text="""```json
 {
   "context_item": "8",
   "score": 0.7
 }
 ```""",
             thought_signature=b'\x12\xa3\xf3\x01\n\x9f\xf3\x01\x01r\xc8\xda|\x93\xaf\\\xba\xf2\xf2\x01+!\xcf\x1d\xa1\xdc\xc6o\x15\x9a\xbe9\xa9hUv\xd9\xea\xfd\xcf\xa2\'7\x04Y\xffa\xc0\x02\xcc\xdd\x89/\xa9pL<\xc8\x9394g\xd3\x1b\xf9\xff\xfb\x0b\xa2:\x83\xc5B\x0ba\x03\x89\xaf\xfdp\xfcoQ?}\x87\xb0"\x05\x91\xccJ\xc6EC\xfb ...'
           ),
         ],
         role='model'
       ),
       finish_reason=<FinishReason.STOP: 'STOP'>,
       index=0
     ),
   ],
   model_version='gemini-3-flash-preview',
   response_id='W6hkadHhDIiqjuMP3p3o-AI',
   sdk_http_response=HttpResponse(
     headers=<dict len=11>
   ),
   usage_metadata=GenerateContentResponseUsageMetadata(
     candidates_token_count=27,
     prompt_token_count=1416,
     prompt_tokens_details

In [9]:
# Extract the JSON object from the second chunk's reply; the reply text may wrap
# it in a Markdown code fence, so only the outermost {...} span is parsed.
text = responses[1].text or ""
json_start = text.find('{')
json_end = text.rfind('}') + 1
# find/rfind return -1 when no brace exists; fail with a clear message instead
# of handing json.loads an empty or truncated slice.
if json_start == -1 or json_end <= json_start:
    raise ValueError(f"No JSON object found in LLM response: {text!r}")
json_response = json.loads(text[json_start:json_end])
json_response

{'context_item': '10', 'score': 0.98}

In [10]:
# Text of the first part of the first candidate in the first collected response.
responses[0].candidates[0].content.parts[0].text

'```json\n{\n  "context_item": "8",\n  "score": 0.7\n}\n```'

In [11]:
# Call LLM
def call_llm(prompt):
    """Ask the configured model for a match result constrained to a JSON schema.

    Redefines ``call_llm`` with a generation config: JSON MIME type, a response
    schema, a low temperature and a system instruction. Uses the module-level
    ``client`` and ``model_name``.

    Args:
        prompt: Prompt text passed as ``contents``.

    Returns:
        The response object returned by ``client.models.generate_content``.
    """
    # Shape every reply must follow: a string item id plus a numeric score.
    match_schema = {
        "type": "object",
        "properties": {
            "context_item": {"type": "string"},
            "score": {"type": "number"}
        },
        "required": ["context_item", "score"]
    }
    generation_config = types.GenerateContentConfig(
        # Requests a zero thinking budget.
        # NOTE(review): confirm the selected model honours this setting.
        thinking_config=types.ThinkingConfig(thinking_budget=0),
        system_instruction="""
                You are an expert product matching AI.
                Return ONLY a valid JSON object that strictly adheres to the specified schema.
                """,
        temperature=0.1,
        response_mime_type="application/json",
        response_schema=match_schema,
    )
    return client.models.generate_content(
        model=model_name,
        contents=prompt,
        config=generation_config,
    )

In [12]:
# Match transaction row 0 against the master catalog, one chunk of rows per LLM call.
chunksize = chunk_size  # reuse the configured value instead of a second hard-coded 10
context_len = len(master_dict)
responses = []

# The query is identical for every chunk, so build it once outside the loop.
transaction_row = {0: transaction_dict[0]}

for start in range(0, context_len, chunksize):
    end = min(start + chunksize, context_len)
    print(f"Processing {start} to {end-1}\nChunk start index: {start}\n")
    # Keys are the global row indices, so the "context_item" reported by the
    # model can be looked up directly in master_dict.
    context_rows = {i: master_dict[i] for i in range(start, end)}
    prompt = generate_prompt(context_rows, transaction_row)
    response = call_llm(prompt)
    responses.append(response)
    print(f"LLM Response:\n{response}\n")

Processing 0 to 9
Chunk start index: 0

LLM Response:
sdk_http_response=HttpResponse(
  headers=<dict len=11>
) candidates=[Candidate(
  content=Content(
    parts=[
      Part(
        text='{"context_item": "9", "score": 0.1}',
        thought_signature=b'\x124\n2\x01r\xc8\xda|\xa1\xba\x9b\x1e\xd6\x8d\x97\xbe\x9c\xb4\x03\xb0\xe1\x02A\x06\x0f\xd9\xca\xc3\x8d\xa2\xbd0\x94\x07}\x0e\x9d$\x9c(l\x82\x1e\xc0}\xe1\xc7\xcd\xe9\x97\xef\xf6\xb1'
      ),
    ],
    role='model'
  ),
  finish_reason=<FinishReason.STOP: 'STOP'>,
  index=0
)] create_time=None model_version='gemini-3-flash-preview' prompt_feedback=None response_id='YqhkafPxE-uJ4-EPwM6W8QI' usage_metadata=GenerateContentResponseUsageMetadata(
  candidates_token_count=16,
  prompt_token_count=1446,
  prompt_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=1446
    ),
  ],
  total_token_count=1462
) automatic_function_calling_history=[] parsed={'context_item': '9', 'score': 0.1}



In [13]:
# Inspect the raw response objects collected in `responses`.
responses

[GenerateContentResponse(
   automatic_function_calling_history=[],
   candidates=[
     Candidate(
       content=Content(
         parts=[
           Part(
             text='{"context_item": "9", "score": 0.1}',
             thought_signature=b'\x124\n2\x01r\xc8\xda|\xa1\xba\x9b\x1e\xd6\x8d\x97\xbe\x9c\xb4\x03\xb0\xe1\x02A\x06\x0f\xd9\xca\xc3\x8d\xa2\xbd0\x94\x07}\x0e\x9d$\x9c(l\x82\x1e\xc0}\xe1\xc7\xcd\xe9\x97\xef\xf6\xb1'
           ),
         ],
         role='model'
       ),
       finish_reason=<FinishReason.STOP: 'STOP'>,
       index=0
     ),
   ],
   model_version='gemini-3-flash-preview',
   parsed={
     'context_item': '9',
     'score': 0.1
   },
   response_id='YqhkafPxE-uJ4-EPwM6W8QI',
   sdk_http_response=HttpResponse(
     headers=<dict len=11>
   ),
   usage_metadata=GenerateContentResponseUsageMetadata(
     candidates_token_count=16,
     prompt_token_count=1446,
     prompt_tokens_details=[
       ModalityTokenCount(
         modality=<MediaModality.TEXT: 'TEX

In [14]:
# Parse the JSON object contained in the second chunk's reply text.
text = responses[1].text or ""
json_start = text.find('{')
json_end = text.rfind('}') + 1
# find/rfind return -1 when no brace exists; fail with a clear message instead
# of handing json.loads an empty or truncated slice.
if json_start == -1 or json_end <= json_start:
    raise ValueError(f"No JSON object found in LLM response: {text!r}")
json_response = json.loads(text[json_start:json_end])
json_response

{'context_item': '10', 'score': 0.95}

In [15]:
# Text of the first part of the first candidate in the first collected response.
responses[0].candidates[0].content.parts[0].text

'{"context_item": "9", "score": 0.1}'

In [16]:
# Show the `parsed` payloads of the first two responses, one after the other.
display(responses[0].parsed, responses[1].parsed)

{'context_item': '9', 'score': 0.1}

{'context_item': '10', 'score': 0.95}