In [None]:
def classify_category_prompt_template(category: str) -> str:
    """
    Build the LLM prompt that asks for an "E-commerce" vs. "Classified" verdict
    on a single category.

    This function does not call a model and does not classify anything itself;
    it only interpolates ``category`` into a fixed block of instructions.

    Parameters:
    category (str): The product or service category to embed in the prompt.

    Returns:
    str: The complete prompt text. It instructs the model to answer with a JSON
    object whose "Classified" and "E-commerce" keys each hold "YES / NO".
    """

    # This is an f-string: the doubled braces ({{ and }}) around the output
    # format are escapes that render as literal JSON braces, while {category}
    # is the only real substitution.
    PROMPT_TEMPLATE = f"""
    Role: Act as an expert marketplace analyst with specialized knowledge in category management for platforms like BDstall.com.

    Objective: Analyze the provided list of product and service categories. For each item, classify it as either "E-commerce" or "Classified" based on the detailed criteria below.

    Primary Classification Mandate:
    The classification must be based on the item's physical size, typical purchase frequency (Lifetime Value - LTV), and the nature of the transaction (direct purchase vs. service/negotiation-based).

    Classification Criteria:

    1. Classify as "Classified" if the item meets ANY of the following conditions:
    * Nature: It is a service, rental, support, real estate, or a job posting.
    * Size & Value: It is a large, high-value item that typically requires physical inspection, delivery logistics, or installation (e.g., heavy machinery, vehicles, large appliances).
    * LTV (Lifetime Value): It is a one-time or very infrequent purchase for the average customer (LTV is approximately 1).
    * Keywords: Rent, Service, Repair, Land, Flat, Apartment, Vehicle, Car, Generator, Lift, Elevator, Installation, Used (for large items), Job, Tour.
    * Examples: Refrigerator, Air Conditioner, Generator, Elevator, Car, Land, Apartment for Rent, AC Repair Service, Web Design Service, Used Furniture.

    2. Classify as "E-commerce" if the item meets ALL of the following conditions:
    * Nature: It is a tangible product that can be easily packaged and shipped.
    * Size & Value: It is typically small to medium in size and suitable for a standard "add-to-cart" online shopping experience.
    * LTV (Lifetime Value): It has the potential for repeat purchases or a higher customer lifetime value (e.g., fashion, accessories, consumables, gadgets).
    * Keywords: Mobile, Laptop, Watch, Trimmer, Cosmetics, Shirt, Belt, Accessories, RAM, Keyboard, Mouse, Headphone, Book.
    * Examples: Smartwatch, Trimmer, Cosmetics, T-Shirt, Belt, Laptop Accessories, RAM, Mobile Phone, Office Chair.

    EXCEPTION: 
    1. For items that are used like "Used Mobile" or "Used Laptop", classify them as "E-commerce" due to their smaller size and higher likelihood of repeat purchases. Don't classify by the word "Used" alone, consider the item's overall characteristics.
    2. Services and supports are always "Classified", For example: Repair service like "AC Repair", "TV Repair Service"; support like "Network Support"; appointment like "Doctor Appointment"; ticket booking like "Train Ticket", "Air Ticket" etc. 
    3. The software and digital products are always "Classified", For example: "Antivirus Software", "Website design and development", "Video Editing Software", "E-book", "Online Course", "Digital Marketing service" etc.
    
    Input Format:
    You will receive a single string as category input.
    Now evaluate whether it is "Classified" or "E-commerce" based on the above criteria.

    Output Format (Strict JSON):
    {{
        "Classified": "YES / NO", 
        "E-commerce": "YES / NO"
    }}

    Task:
    Now, process the category: "{category}"
    Don't return any explanation, only provide the JSON output as specified.
    """

    return PROMPT_TEMPLATE

In [35]:
import pandas as pd

# Absolute location of the item-category CSV that the rest of the notebook works on.
input_file_location = "D:\\My Learning\\Machine-Learning-AI\\Full Stack AI Engineering\\Day 03\\bdstall_item_categories.csv"

# Load the sheet into the notebook-wide DataFrame read by the later cells.
df = pd.read_csv(filepath_or_buffer=input_file_location)

In [17]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 575 entries, 0 to 574
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Serial No.     575 non-null    int64  
 1   Category Name  70 non-null     object 
 2   Item List      565 non-null    object 
 3   Classified     0 non-null      float64
 4   E-commerce     0 non-null      float64
dtypes: float64(2), int64(1), object(2)
memory usage: 22.6+ KB


In [3]:
# Some raw headers carry surrounding whitespace (e.g. 'Item List '); trim them
# so columns can be addressed by their clean names.
trimmed_headers = df.columns.str.strip()
df.columns = trimmed_headers

# Keep a handle on the column holding the items to classify.
all_item_list = df.loc[:, 'Item List']

## Configure the LLM (Groq)

In [None]:
import json
import re
from groq import Groq


def clean_and_parse_json(response_text: str):
    """
    Extract and parse the first JSON object embedded in an LLM response.

    The model may wrap its JSON in prose or Markdown fences, and that prose can
    itself contain braces. Instead of assuming everything between the first
    "{" and the last "}" is JSON, each "{" is tried in turn as the start of an
    object and the first one that decodes is returned.

    Parameters:
    response_text (str): Raw text returned by the model.

    Returns:
    The first JSON object (dict) that decodes successfully, or None when the
    text contains braces but no decodable object. In the None case the first
    decode error and the brace-delimited text are printed.

    Raises:
    ValueError: If the response is empty/None, or contains no "{...}" span.
    """
    if not response_text:
        raise ValueError("Empty or None response received from AI model")

    # Step 1: Make sure there is at least one brace-delimited span to look at
    json_match = re.search(r'\{[\s\S]*\}', response_text)
    if not json_match:
        raise ValueError("No JSON object found in response")

    # Step 2: Decode starting at each "{". raw_decode stops at the end of the
    # object that begins there, so text after the JSON cannot break parsing.
    decoder = json.JSONDecoder()
    first_error = None
    start = json_match.start()
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError as e:
            # Remember only the earliest failure; it is the most useful one to report.
            if first_error is None:
                first_error = e
        else:
            return parsed
        start = response_text.find("{", start + 1)

    print("JSON parsing error:", first_error)
    print("Attempted JSON string:", json_match.group(0).strip())
    return None


def get_ai_classification(prompt) -> str:
    """
    Stream one single-turn chat completion from Groq and return its full text.

    Parameters:
    prompt: Text sent as the only "user" message of the conversation.

    Returns:
    str: Every streamed content fragment, concatenated in arrival order.
    """
    # NOTE(review): Groq() is created without arguments on every call;
    # presumably it reads its credentials from the environment - confirm.
    stream = Groq().chat.completions.create(
        model="meta-llama/llama-4-scout-17b-16e-instruct",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=1,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
        stop=None
    )

    # A chunk's delta may carry no content (None); treat that as empty text.
    return "".join(chunk.choices[0].delta.content or "" for chunk in stream)


In [31]:
# Trial run: classify only the first five non-empty entries of 'Item List'
def iterate_all_items():
    """
    Send the first five non-empty 'Item List' values through the LLM and print
    each parsed answer, followed by the number of items processed.
    """
    def _trim(value):
        # Only strings can be stripped; any other value passes through untouched.
        return value.strip() if isinstance(value, str) else value

    cleaned = df['Item List'].dropna().apply(_trim)
    candidates = cleaned[cleaned != '']

    cnt = 0  # stays 0 when there is nothing to process
    for cnt, item in enumerate(candidates.head(5), start=1):
        print("\n" + item)

        # Build the prompt, query the model, then pull the JSON out of the reply.
        parsed = clean_and_parse_json(
            get_ai_classification(classify_category_prompt_template(item))
        )

        print("✅ Parsed JSON: \n\n", parsed)

    print(f"Total generated and parsed: {cnt}")


iterate_all_items()


Laptop
✅ Parsed JSON: 

 {'Classified': 'NO', 'E-commerce': 'YES'}

Used laptop
✅ Parsed JSON: 

 {'Classified': 'NO', 'E-commerce': 'YES'}

Desktop PC
✅ Parsed JSON: 

 {'Classified': 'NO', 'E-commerce': 'YES'}

Mini PC
✅ Parsed JSON: 

 {'Classified': 'NO', 'E-commerce': 'YES'}

Graphics Tablet
✅ Parsed JSON: 

 {'Classified': 'NO', 'E-commerce': 'YES'}
Total generated and parsed: 5


In [42]:
# Display the current column labels; in the output below some still carry a trailing space.
df.columns

Index(['Serial No.', 'Category Name', 'Item List ', 'Classified',
       'E-commerce '],
      dtype='object')

## Final: Saving the results to a CSV file

### Use only this script for the final run

In [None]:
import json
import re
import pandas as pd
from groq import Groq


def clean_and_parse_json(response_text: str):
    """
    Extract and parse the first JSON object embedded in an LLM response.

    The reply may surround its JSON with prose or Markdown fences, and that
    prose can contain braces of its own. Rather than treating everything from
    the first "{" to the last "}" as JSON, every "{" is tried as the start of
    an object and the first one that decodes is returned.

    Parameters:
    response_text (str): Raw text returned by the model.

    Returns:
    The first JSON object (dict) that decodes successfully, or None when the
    text contains braces but no decodable object. In the None case the first
    decode error and the brace-delimited text are printed.

    Raises:
    ValueError: If the response is empty/None, or contains no "{...}" span.
    """
    if not response_text:
        raise ValueError("Empty or None response received from AI model")

    # Make sure there is at least one brace-delimited span to look at
    json_match = re.search(r'\{[\s\S]*\}', response_text)
    if not json_match:
        raise ValueError("No JSON object found in response")

    # raw_decode stops at the end of the object that starts at the given
    # offset, so anything the model wrote after the JSON cannot break parsing.
    decoder = json.JSONDecoder()
    first_error = None
    start = json_match.start()
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError as e:
            # Remember only the earliest failure; it is the most useful one to report.
            if first_error is None:
                first_error = e
        else:
            return parsed
        start = response_text.find("{", start + 1)

    print("⚠️ JSON parsing error:", first_error)
    print("Attempted JSON string:", json_match.group(0).strip())
    return None

def get_ai_classification(prompt) -> str:
    """
    Ask the Groq-hosted model for a completion of ``prompt`` and return the
    streamed reply as one string.

    Parameters:
    prompt: Text sent as the only "user" message of the conversation.

    Returns:
    str: Every streamed content fragment, concatenated in arrival order.
    """
    request = {
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 1,
        "max_completion_tokens": 1024,
        "top_p": 1,
        "stream": True,
        "stop": None,
    }

    client = Groq()
    stream = client.chat.completions.create(**request)

    pieces = []
    for chunk in stream:
        # A delta without content arrives as None; record it as empty text.
        pieces.append(chunk.choices[0].delta.content or "")

    return "".join(pieces)


def _pick_category(parsed) -> str:
    """Reduce the model's {"Classified": ..., "E-commerce": ...} answer to one label."""
    def said_yes(key):
        # str() keeps a non-string value (e.g. a JSON boolean) from crashing
        # on .strip(); such a value simply does not count as "YES".
        return str(parsed.get(key, "")).strip().upper() == "YES"

    if said_yes("E-commerce"):
        return "E-commerce"
    if said_yes("Classified"):
        return "Classified"
    return "Unknown"


def iterate_all_items(output_path="badstall_categories_ai_analyzed.csv"):
    """
    Classify every non-empty entry of the "Item List" column with the LLM and
    save the resulting labels to a CSV file.

    Items whose response is empty, contains no JSON, or cannot be parsed are
    reported and skipped. Whatever has been collected is written to disk even
    if the run is interrupted by an error (for example a failing API call),
    after which that error is re-raised.

    Parameters:
    output_path (str): Destination CSV file. Defaults to the file name this
    script has always written.

    Raises:
    KeyError: If the DataFrame has no "Item List" column.
    """
    # The raw CSV header is 'Item List ' (trailing space), but it becomes
    # 'Item List' once the columns have been stripped. Match on the stripped
    # name so the function works in either state.
    item_column = next(
        (col for col in df.columns if str(col).strip() == "Item List"), None
    )
    if item_column is None:
        raise KeyError("Item List")

    non_empty_items = df[item_column].dropna().apply(lambda x: x.strip() if isinstance(x, str) else x)
    non_empty_items = non_empty_items[non_empty_items != '']

    results = []  # to store (item, category)

    try:
        for idx, item in enumerate(non_empty_items, 1):
            print(f"\n🔹 Processing ({idx}): {item}")

            prompt = classify_category_prompt_template(item)
            result = get_ai_classification(prompt)

            # The parser raises ValueError for an empty reply or one without
            # JSON; one bad reply must not abort the whole batch.
            try:
                parsed = clean_and_parse_json(result)
            except ValueError as e:
                print(f"⚠️ Skipped: {e}")
                continue

            if not parsed:
                print("⚠️ Skipped due to invalid JSON.")
                continue

            # Determine category based on parsed JSON
            category = _pick_category(parsed)

            print(f"✅ Parsed JSON: {parsed}")
            print(f"➡️ Final Category: {category}")

            results.append({"Item name": item, "Category": category})
    finally:
        # Save in a finally block so that a crash part-way through a long,
        # paid run does not throw away the items already classified.
        output_df = pd.DataFrame(results)
        output_df.to_csv(output_path, index=False, encoding="utf-8-sig")

        print(f"\n✅ Total saved: {len(results)}")
        # Report the path actually written, not a separately hard-coded name.
        print(f"📁 File saved as: {output_path}")


iterate_all_items()