In [2]:
pip install pandas transformers sentence-transformers chromadb peft fastapi uvicorn pydantic redis torch datasets



In [10]:
import pandas as pd
df = pd.read_csv('/content/flipkart_com-ecommerce_sample.csv.csv')  # Adjust filename
print(df.columns.tolist())

['category_1', 'category_2', 'category_3', 'title', 'product_rating', 'selling_price', 'mrp', 'seller_name', 'seller_rating', 'description', 'highlights', 'image_links']


In [9]:
print(df.dtypes)

category_1         object
category_2         object
category_3         object
title              object
product_rating    float64
selling_price      object
mrp                object
seller_name        object
seller_rating     float64
description        object
highlights         object
image_links        object
dtype: object


In [24]:
import pandas as pd
import hashlib

# Load CSV (adjust filename if different)
df = pd.read_csv('/content/flipkart_com-ecommerce_sample.csv.csv')

# Log columns for verification
print("Original columns:", df.columns.tolist())
print(f"Original rows: {len(df)}")

# Subsample (1K for prototype; comment for full) - Moved before dropna
df = df.head(1000)
print(f"Rows after subsampling: {len(df)}")

# Standardize columns
df = df.rename(columns={
    'title': 'title',
    'description': 'description',
    'selling_price': 'price',
    'category_1': 'category_1',
    'category_2': 'category_2',
    'category_3': 'category_3'
})

# Merge description and highlights
df['description'] = df.apply(
    lambda row: (str(row['description']) + ' ' + str(row['highlights']) if pd.notna(row['description']) and pd.notna(row['highlights'])
                 else str(row['description']) if pd.notna(row['description'])
                 else str(row['highlights']) if pd.notna(row['highlights'])
                 else ''),
    axis=1
)
print(f"Rows after creating description: {len(df)}")
print(df[['description']].head())


# Category: Combine into path
df['category'] = df.apply(
    lambda row: ' > '.join([c for c in [row['category_1'], row['category_2'], row['category_3']] if pd.notna(c)]),
    axis=1
)
print(f"Rows after creating category: {len(df)}")
print(df[['category']].head())


# Clean price: Use selling_price, fallback to mrp
df['price'] = pd.to_numeric(df['price'], errors='coerce')
df['price'] = df.apply(lambda row: pd.to_numeric(row['mrp'], errors='coerce') if pd.isna(row['price']) else row['price'], axis=1)
print(f"Rows after cleaning price: {len(df)}")
print(df[['price']].head())


# Brand: Extract from title (heuristic)
common_brands = ['Alisha', 'Samsung', 'Bose', 'FabHomeDecor']
def extract_brand(title):
    if pd.isna(title):
        return 'Unknown'
    words = title.split()
    for brand in common_brands:
        if brand.lower() in ' '.join(words[:2]).lower():
            return brand
    return words[0] if words else 'Unknown'

df['brand'] = df['title'].apply(extract_brand)

# ASIN: Generate from title + category
df['asin'] = df.apply(lambda row: hashlib.sha256((str(row['title']) + str(row['category'])).encode()).hexdigest()[:10], axis=1)


# Drop rows with nulls in key fields - Now applied after subsampling
# Temporarily removed 'price' from subset to see if other data is retained
df = df.dropna(subset=['title', 'description', 'category'])
print(f"Rows after dropping nulls (excluding price): {len(df)}")


# Save cleaned
df.to_csv('cleaned_products.csv', index=False)

print(f"Cleaned: {len(df)} rows")
print(df[['title', 'description', 'category', 'price', 'brand', 'asin']].head())

Original columns: ['category_1', 'category_2', 'category_3', 'title', 'product_rating', 'selling_price', 'mrp', 'seller_name', 'seller_rating', 'description', 'highlights', 'image_links']
Original rows: 12041
Rows after subsampling: 1000
Rows after creating description: 1000
                                         description
0  Cricket Practice Net NYLON HDPE Material W x H...
1  10 X  10 GREEN CRICKET NET HDPE NYLON. Cricket...
2                  Cricket Rubber Ball Weight: 110 g
3               Cricket Synthetic Ball Weight: 110 g
4  The Ceat Poplar Willow Cricket Bat has been de...
Rows after creating category: 1000
                                     category
0  Sports, Books and More > Sports > Cricket 
1  Sports, Books and More > Sports > Cricket 
2  Sports, Books and More > Sports > Cricket 
3  Sports, Books and More > Sports > Cricket 
4  Sports, Books and More > Sports > Cricket 
Rows after cleaning price: 1000
   price
0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Rows afte

In [26]:
import chromadb
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load cleaned
df = pd.read_csv('/content/cleaned_products.csv')

# Embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed descriptions
# Ensure all descriptions are strings to prevent TypeError
descriptions = df['description'].astype(str).tolist()
embeddings = embedder.encode(descriptions, batch_size=32, show_progress_bar=True)

# Chroma setup
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="products")

# Upsert
for i in range(len(df)):
    row = df.iloc[i]
    # Ensure metadata values are compatible with ChromaDB
    metadata = {
        'title': str(row['title']),
        'description': str(row['description']),
        'category': str(row['category']),
        'price': float(row['price']) if pd.notna(row['price']) else None, # Handle NaN price
        'brand': str(row['brand']),
        'asin': str(row['asin'])
    }
    collection.upsert(
        ids=[str(row['asin'])],
        embeddings=[embeddings[i].tolist()],
        metadatas=[metadata]
    )

print(f"Ingested {collection.count()} products")

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Ingested 992 products


In [28]:
import pandas as pd
import random
from datasets import Dataset

# Load cleaned
df = pd.read_csv('cleaned_products.csv')

def generate_example(row):
    brand = str(row['brand'])
    category_last = str(row['category']).split(' > ')[-1].strip() if ' > ' in str(row['category']) else str(row['category'])
    price = row['price']
    desc_words = str(row['description']).split()
    keyword = random.choice(desc_words) if desc_words else 'features'

    # Handle potential NaN price
    price_str = f"{price:.2f}" if pd.notna(price) else "N/A"
    max_price_json = price * 0.8 if pd.notna(price) else None


    intents = [
        f"find affordable {brand} {category_last} under {price_str} with {keyword}",
        f"show budget {category_last} from {brand} below {price_str} featuring {keyword}"
    ]
    intent = random.choice(intents)

    json_target = {
        "keywords": f"{keyword} {category_last}",
        "brand": brand,
        "max_price": max_price_json,
        "category_contains": category_last
    }

    return {"input": f"Parse this search intent to JSON: {intent}", "output": str(json_target)}

# Generate 500
examples = [generate_example(row) for _, row in df.sample(min(500, len(df))).iterrows()]

# Dataset
train_dataset = Dataset.from_list(examples).train_test_split(test_size=0.1)
train_dataset.save_to_disk('intent_training_data')

print("Training data saved")

Saving the dataset (0/1 shards):   0%|          | 0/450 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50 [00:00<?, ? examples/s]

Training data saved


In [30]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_from_disk

# Load data
dataset = load_from_disk('/content/intent_training_data')

# Model/tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# PEFT LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q", "v"]
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Preprocess
def preprocess(examples):
    inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=128)
    labels = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=128)
    inputs['labels'] = labels['input_ids']
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)

# Args
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    eval_strategy="epoch", # Corrected argument name
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test']
)

trainer.train()

# Save
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("Fine-tuning done")

trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbsse1414[0m ([33mbsse1414-university-of-dhaka[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,No log,30.725758
2,No log,29.035202




Epoch,Training Loss,Validation Loss
1,No log,30.725758
2,No log,29.035202
3,No log,24.734243


Fine-tuning done
