In [1]:
import warnings
warnings.filterwarnings('ignore')
import ast
import json
import os
import re
import time

import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer, AutoModel
import torch
from llmlingua import PromptCompressor
import faiss
import gradio as gr
import google.generativeai as genai


In [2]:
# Load the shoes dataset; the path is relative to the notebook's working directory.
df=pd.read_csv(r"updated_shoes_dataset.csv")

In [3]:
def safe_literal_eval(val):
    """Parse `val` as a Python literal, falling back to `val` itself on failure.

    Args:
        val: Usually the string form of a literal (e.g. "[7, 8.5, 9]"). Values
            that are not strings are not parseable and are returned unchanged.

    Returns:
        The evaluated literal, or `val` unchanged when it cannot be evaluated.
    """
    try:
        return ast.literal_eval(val)
    # literal_eval is documented to raise any of these on malformed input;
    # e.g. "{[1]: 2}" parses fine but fails with TypeError (unhashable key).
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return val

# Parse every 'size' cell with safe_literal_eval; cells it cannot parse are kept as they are.
df['size'] = df['size'].apply(safe_literal_eval)

In [4]:
# Convert each element of every 'size' value to float.
# NOTE(review): this assumes every cell is already an iterable of numeric values. A cell that was
# left unparsed (a plain string, NaN, ...) would raise here or be split per character — confirm
# the column is clean before relying on this.
df['size'] = df['size'].apply(lambda x: list(map(float, x)))

In [6]:
def preprocess_query(query):
    """Return the word tokens of `query` after lower-casing it.

    Tokenisation is delegated to NLTK's `word_tokenize`.
    """
    return word_tokenize(query.lower())
# Load a pre-trained spaCy model for entity recognition.
# The 'en_core_web_sm' pipeline has to be installed separately
# (e.g. `python -m spacy download en_core_web_sm`).
nlp = spacy.load('en_core_web_sm')

def _query_mentions(query, keywords):
    """Return True when any of `keywords` occurs in `query` as a whole word (case-insensitive)."""
    if not keywords:
        return False
    # Escape each keyword so punctuation inside it is matched literally.
    pattern = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'
    return re.search(pattern, query, re.IGNORECASE) is not None


# Function to recognize entities and map them to dataset categories
def parse_query(query,df):
    """Extract structured shopping constraints from a free-text query.

    Args:
        query: The user's request as plain text.
        df: DataFrame with a 'brand' column; its values are the known brands.

    Returns:
        dict containing only the keys that were detected:
          'price'            int, number taken from a spaCy MONEY entity
          'size'             list of floats, whole-number CARDINAL entities in 1..15
          'brand'            list of lower-cased query tokens that equal a known brand
          'gender'           'male' or 'female'
          'occasion'         one of the occasion categories below
          'description'      space-separated description categories (each at most once)
          'price_range'      (low, high) from "price [range] between A and|to B"
          'cprice'           int, ceiling from "under N"
          'offer'            float, from "N% off|discount"
          'rating_threshold' float, from "rating above|greater than|over N"
          'rating'           5 when the query contains "top rated"

    Notes:
        * For gender and occasion the LAST matching category wins, so a query
          that matches several categories (e.g. 'office' is listed under both
          'formal' and 'work') resolves to the one defined later.
        * Brands are matched token by token, so only single-word brand names
          can be recognised.
    """
    doc = nlp(query)
    entities = {}

    # Expanded dictionary of synonyms for occasions and genders
    occasion_synonyms = {
        'formal': ['formal', 'office', 'business', 'professional'],
        'casual': ['casual', 'everyday', 'daily', 'regular', 'weekend'],
        'ethnic': ['ethnic', 'traditional', 'cultural', 'heritage'],
        'party': ['party', 'fancy', 'evening', 'event', 'birthday', 'celebration', 'festive'],
        'riding': ['riding', 'biking', 'motorcycle', 'cycling'],
        'sports': ['sports', 'athletic', 'gym', 'training', 'running', 'soccer', 'football', 'basketball', 'tennis', 'hiking', 'fitness'],
        'wedding': ['wedding', 'bride', 'groom', 'marriage', 'nuptial'],
        'work': ['work', 'standing', 'office', 'workplace', 'job'],
        'summer': ['summer', 'sandals', 'beach', 'vacation'],
        'winter': ['winter', 'cold', 'snow', 'boots'],
    }

    gender_synonyms = {
        'male': [
            'male', 'men', 'man', 'boy', 'son', 'gentleman',
            'guy', 'lad', 'brother', 'father', 'husband',
            'he', 'him', 'his', 'masculine', 'dude',
            'mr', 'sir', 'uncle', 'nephew', 'grandfather',
            'dad', 'papa', 'pa', 'fella'
        ],
        'female': [
            'female', 'women', 'woman', 'girl', 'wife', 'daughter',
            'lady', 'miss', 'mrs', 'ms', 'sister',
            'mother', 'aunt', 'niece', 'grandmother',
            'she', 'her', 'hers', 'feminine', 'madam',
            'ma\'am', 'gal', 'mom', 'mama', 'mum',
            'mummy', 'mommy', 'granny', 'lass', 'queen',
            'princess', 'dame'
        ],
    }

    description_keywords = {
        'comfortable': ['comfortable', 'cozy', 'snug', 'comfy', 'relaxed'],
        'stylish': ['stylish', 'fashionable', 'trendy', 'chic', 'elegant'],
        'durable': ['durable', 'long-lasting', 'sturdy', 'tough', 'hardy'],
        'lightweight': ['lightweight', 'light', 'easy', 'flexible'],
        'unique': ['unique', 'special', 'different', 'distinctive'],
        'support': ['support', 'good support', 'arch support', 'cushioning'],
        'sparkle': ['sparkle', 'glitter', 'shiny', 'dazzling'],
    }

    # Extracting entities from the query
    for ent in doc.ents:
        if ent.label_ == "MONEY":
            # Allow thousands separators so "1,500" is read as 1500 rather than 1.
            price_match = re.search(r'\d[\d,]*', ent.text)
            if price_match:
                entities['price'] = int(price_match.group().replace(',', ''))
        elif ent.label_ == "CARDINAL" and ent.text.isdecimal():
            number = int(ent.text)
            if 1 <= number <= 15:  # Common shoe sizes range
                entities.setdefault('size', []).append(float(number))

    # Brand detection: a set gives O(1) lookups, and dropna() keeps missing
    # brand cells from crashing the lower-casing.
    known_brands = {str(brand).lower() for brand in df['brand'].dropna().unique()}
    brands = [token for token in preprocess_query(query) if token in known_brands]
    if brands:
        entities['brand']=brands

    # Directly check the query for gender-related terms (last matching category wins)
    for gender, keywords in gender_synonyms.items():
        if _query_mentions(query, keywords):
            entities['gender'] = gender

    # Enhanced handling of occasions (last matching category wins)
    for occasion, keywords in occasion_synonyms.items():
        if _query_mentions(query, keywords):
            entities['occasion'] = occasion

    # Enhanced handling of descriptions: each category is reported once, even
    # when several of its synonyms appear in the query.
    matched_descriptions = [
        description
        for description, keywords in description_keywords.items()
        if _query_mentions(query, keywords)
    ]
    if matched_descriptions:
        entities['description'] = ' '.join(matched_descriptions)

    # Handling price ranges
    price_range_match = re.search(r'price (range )?between (\d+) (and|to) (\d+)', query, re.IGNORECASE)
    if price_range_match:
        entities['price_range'] = (int(price_range_match.group(2)), int(price_range_match.group(4)))

    price_limit_match = re.search(r'under (\d+)', query, re.IGNORECASE)
    if price_limit_match:
        entities['cprice'] = int(price_limit_match.group(1))

    # Handling discount or offer
    offer_match = re.search(r'(\d+)% (off|discount)', query, re.IGNORECASE)
    if offer_match:
        entities['offer'] = float(offer_match.group(1))

    # Handling ratings
    rating_match = re.search(r'rating (above|greater than|over) (\d+(\.\d+)?)', query, re.IGNORECASE)
    if rating_match:
        entities['rating_threshold'] = float(rating_match.group(2))

    # Handling top rated requests
    if "top rated" in query.lower():
        entities['rating'] = 5

    return entities


In [8]:
def filter_shoes(shoes_data, parsed_query):
    """Narrow `shoes_data` down to the rows that satisfy the parsed constraints.

    Args:
        shoes_data: DataFrame with the columns 'gender', 'brand', 'ocassion'
            (sic), 'rating', 'cprice' and 'size' (a collection of sizes per row).
            Only the columns needed by the supplied constraints are accessed.
        parsed_query: dict of constraints. Recognised keys:
            'gender'            str, compared case-insensitively
            'brand'             list of lower-case brand names
            'occasion'          str, compared case-insensitively
            'min_rating' /
            'rating_threshold'  minimum rating, inclusive ('min_rating' wins
                                when both are present)
            'price_range'       (low, high), inclusive on both ends
            'cprice'            maximum price, inclusive
            'size'              list of sizes; only the first one is used

    Returns:
        The filtered DataFrame (original index labels preserved).
    """
    # Filter based on gender
    if 'gender' in parsed_query:
        shoes_data = shoes_data[shoes_data['gender'].str.lower() == parsed_query['gender'].lower()]

    # Filter based on brand
    if 'brand' in parsed_query:
        shoes_data = shoes_data[shoes_data['brand'].str.lower().isin(parsed_query["brand"])]

    # Filter based on occasion
    if 'occasion' in parsed_query:
        shoes_data = shoes_data[shoes_data['ocassion'].str.lower() == parsed_query['occasion'].lower()]

    # Filter based on minimum rating. The query parser reports this constraint
    # as 'rating_threshold', so accept that key as well as 'min_rating'.
    min_rating = parsed_query.get('min_rating', parsed_query.get('rating_threshold'))
    if min_rating is not None:
        shoes_data = shoes_data[shoes_data['rating'] >= min_rating]

    # Filter based on price range
    if 'price_range' in parsed_query:
        min_price, max_price = parsed_query['price_range']
        shoes_data = shoes_data[(shoes_data['cprice'] >= min_price) & (shoes_data['cprice'] <= max_price)]

    # Filter based on the price ceiling parsed from "under N"
    if 'cprice' in parsed_query:
        shoes_data = shoes_data[shoes_data['cprice'] <= parsed_query['cprice']]

    # Filter based on size
    if parsed_query.get('size'):
        desired_size = parsed_query['size'][0]
        # astype(bool) keeps the mask boolean even when no rows are left,
        # so it is always treated as a row filter.
        size_mask = shoes_data['size'].apply(lambda sizes: desired_size in sizes).astype(bool)
        shoes_data = shoes_data[size_mask]

    return shoes_data


In [9]:
# Load the MiniLM tokenizer and model (used by the embedding functions in this notebook)
model_name = 'sentence-transformers/all-MiniLM-l12-v2'
#model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Run the model on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()  # Set the model to evaluation mode

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [10]:
# Sanity check: report whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [11]:
def generate_batch_embeddings(text_list, batch_size=32):
    """Embed a list of texts in mini-batches with attention-masked mean pooling.

    Padding tokens are excluded from the mean, so a text gets the same
    embedding no matter which other texts share its batch.

    Args:
        text_list: list of strings to embed.
        batch_size: number of texts sent through the model at once.

    Returns:
        numpy array with one row per input text and one column per hidden
        dimension of the model. An empty input yields an array with zero rows.
    """
    embeddings_list = []
    for start in range(0, len(text_list), batch_size):
        batch = text_list[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)

        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
            # Zero out padded positions, then divide by the number of real tokens.
            mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

        embeddings_list.append(embeddings.cpu().numpy())

    if not embeddings_list:
        # np.concatenate rejects an empty list; return a correctly shaped empty matrix instead.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.concatenate(embeddings_list, axis=0)

# Embed every value of the description column once and report the elapsed wall-clock time.
# NOTE(review): the column is addressed as 'descrption' (sic) — presumably that is the header
# used in the CSV; confirm before renaming.
text_list = df["descrption"].tolist()
s=time.time()
embeddings = generate_batch_embeddings(text_list)
l=time.time()
print("Batch embeddings shape:", embeddings.shape,"total time taken:",l-s)


Batch embeddings shape: (46879, 384) total time taken: 47.75185942649841


In [12]:
# Wrap the embedding matrix in a DataFrame: one row per embedded description.
embedding_df=pd.DataFrame(embeddings.tolist())
#filterd_embedding_df=embedding_df.iloc[filtered_df.index]

In [13]:
def generate_text_embedding(text):
    """Embed `text` with attention-masked mean pooling.

    Args:
        text: a single string, or a list of strings.

    Returns:
        numpy array with one row per input text. For a single string this is
        the plain mean over its tokens; for a list, padded positions are
        excluded from each text's mean.
    """
    # Tokenize and move the inputs to the model's device
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(device)

    with torch.no_grad():  # No need to compute gradients
        token_embeddings = model(**inputs).last_hidden_state
        # Mean pooling over real tokens only (padding is masked out)
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    return embeddings.cpu().numpy()  # Move embeddings back to CPU for further processing

In [18]:
def silirity_search(embeddings_df,query,k=10):
    """Find the rows of `embeddings_df` closest (L2 distance) to the embedded `query`.

    Args:
        embeddings_df: DataFrame with one embedding per row.
        query: text to embed with `generate_text_embedding` and search for.
        k: maximum number of neighbours to return (default 10).

    Returns:
        (distances, indices): two arrays of shape (1, m) with m = min(k, number
        of rows). `indices` are row positions within `embeddings_df`. Both are
        empty (shape (1, 0)) when there is nothing to search.
    """
    # FAISS works on contiguous float32 matrices.
    embedding = np.ascontiguousarray(embeddings_df.to_numpy(), dtype=np.float32)

    # Never ask for more neighbours than there are vectors: FAISS pads missing
    # results with index -1, which would select the wrong row via .iloc.
    k = min(k, embedding.shape[0])
    if k <= 0:
        return np.empty((1, 0), dtype=np.float32), np.empty((1, 0), dtype=np.int64)

    d = embedding.shape[1]
    index = faiss.IndexFlatL2(d)  # L2 distance
    # Use a single GPU when this FAISS build supports it and one is present;
    # otherwise search on the CPU.
    if hasattr(faiss, 'StandardGpuResources') and getattr(faiss, 'get_num_gpus', lambda: 0)() > 0:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)  # Move index to GPU
    index.add(embedding)

    query_embedding = np.ascontiguousarray(generate_text_embedding(query), dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_embedding, k)
    return distances, indices

In [14]:
def boost_results(result_df, rating_weight=0.6, review_weight=0.4, top_k=5):
    """Re-rank search hits by similarity boosted with rating and review count.

    boosted_score = 1 / (1 + distance)
                    + rating_weight * (rating / 5)
                    + review_weight * (log1p(reviews) / log1p(1000))

    Args:
        result_df: DataFrame with 'distance', 'rating' and 'reviews' columns.
            It is not modified.
        rating_weight: weight of the rating term (rating assumed to be out of 5).
        review_weight: weight of the review-count term (normalised against an
            assumed maximum of 1000 reviews).
        top_k: number of best rows to return; pass None to return all rows.

    Returns:
        A new DataFrame with an added 'boosted_score' column, sorted from the
        highest to the lowest score and truncated to `top_k` rows.
    """
    # Work on a copy so the caller's frame (possibly a slice) is left untouched.
    scored = result_df.copy()

    similarity_score = 1 / (1 + scored["distance"].astype(float))  # Convert distance to similarity
    rating_score = scored["rating"].astype(float) / 5.0
    review_score = np.log1p(scored["reviews"].astype(float)) / np.log1p(1000)

    scored["boosted_score"] = similarity_score + (rating_weight * rating_score) + (review_weight * review_score)
    # sorting from higher to lower
    scored = scored.sort_values(by='boosted_score', ascending=False)
    if top_k is not None:
        scored = scored.head(top_k)
    return scored

In [15]:

# LLMLingua-2 prompt compressor. Use the first CUDA device when one is available and fall back
# to the CPU otherwise, so this cell also runs on machines without a GPU.
llm_lingua = PromptCompressor(
    model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
    model_config={"revision": "main"},
    use_llmlingua2=True,
    device_map="cuda:0" if torch.cuda.is_available() else "cpu",
)

# Function definition
def compress_query_prompt(query):
    """Compress a retrieval prompt with `llm_lingua` and return the result as JSON text.

    Args:
        query: dict with the keys 'demonstration_str' (context, split into one
            entry per line), 'instruction' and 'question'.

    Returns:
        The value returned by `llm_lingua.compress_prompt`, serialised with
        `json.dumps(..., indent=4)`.
    """
    context, instruction, question = (
        query['demonstration_str'],
        query['instruction'],
        query['question'],
    )
    compression_settings = {
        "instruction": instruction,
        "question": question,
        "target_token": 300,
        "rank_method": "longllmlingua",
        "context_budget": "+100",
        "dynamic_context_compression_ratio": 0.4,
        "reorder_context": "sort",
    }
    compression_result = llm_lingua.compress_prompt(context.split("\n"), **compression_settings)
    return json.dumps(compression_result, indent=4)

In [16]:
# Configure the Gemini client. The key is read from the GEMINI_API_KEY environment variable so
# it does not have to be stored in the notebook; the original placeholder remains the fallback.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "GEMINI-API-KEY"))

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 200,
  "response_mime_type": "text/plain",
}

model_gen = genai.GenerativeModel(
  model_name="gemini-1.5-flash",
  generation_config=generation_config,
  system_instruction="your  a shoe recommendation system you will answer user query using search results",
)

def handle_user_query(query,compressed_prompt):
    """Ask the Gemini model to answer `query` using `compressed_prompt` as context.

    Returns:
        The text of the model's response.
    """
    response = model_gen.generate_content(
        f"You are an shoe recommendation system. Answer this user query: '{query}' with the following context:\n{compressed_prompt}"
    )
    return response.text

In [20]:
def pipeline(message, history):
    """Answer one chat message with a shoe recommendation.

    Steps: parse the query into constraints, filter the catalogue, run a
    nearest-neighbour search over the filtered embeddings, boost the best hits
    by rating and review count, compress the resulting table into a prompt and
    let the generative model write the answer.

    Args:
        message: the user's latest message.
        history: earlier chat turns; accepted but not used.

    Returns:
        The answer text, or a short notice when no shoe matches the filters.
    """
    query = message
    # query parser: separate entities from the query
    parsed_query = parse_query(query, df)
    # filter data using the entities
    filtered_data = filter_shoes(df, parsed_query)
    if filtered_data.empty:
        # Nothing to search: an empty index cannot produce valid neighbours.
        return "Sorry, I could not find any shoes matching those requirements. Please try relaxing them."

    # Select the embeddings of the filtered rows straight from the precomputed
    # matrix. Labels are translated to row positions explicitly instead of
    # assuming they coincide, and the full matrix is no longer rebuilt per query.
    positions = df.index.get_indexer(filtered_data.index)
    filtered_embedding_df = pd.DataFrame(embeddings[positions])

    # similarity search
    distance, indices = silirity_search(filtered_embedding_df, query)
    # FAISS reports missing neighbours as -1; drop them so .iloc cannot pick the last row.
    found = indices[0] >= 0
    result_df = filtered_data.iloc[indices[0][found]].copy()
    result_df['distance'] = distance[0][found].tolist()

    # adding boosted values
    result = boost_results(result_df.head())
    result = result.drop(["combined_description","UID","distance","boosted_score"], axis=1, errors="ignore")
    query_info = {
     'demonstration_str': result.to_string(),  # Results from information retrieval process
     'instruction': "Write a high-quality answer for the given question using only the provided search results.",
     'question': query
    }
    # prompt engineering and prompt compression
    res = compress_query_prompt(query_info)
    data = json.loads(res)
    compressed_prompt = data["compressed_prompt"]
    # using gemini as the generative model
    ans = handle_user_query(query, compressed_prompt)
    return ans

# Build the Gradio chat UI around `pipeline` and launch it.
# NOTE(review): share=True asks Gradio for a public share link — confirm that exposing this app
# outside the local machine is intended.
# NOTE(review): retry_btn / undo_btn / clear_btn may not be accepted by every Gradio release —
# confirm against the installed version.
gr.ChatInterface(
    pipeline,
    chatbot=gr.Chatbot(height=300),
    textbox=gr.Textbox(placeholder="Type here..", container=False, scale=7),
    title="shoe recommendation system",
    description="Please specify your requirement like gender,ocassion,brand,size,price and rating",
    theme="soft",
    examples=["Adidas men shoes for walking or running "],
    cache_examples=True,
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
).launch(share=True)

Using cache from 'C:\Users\Suresh.K\gradio_cached_examples\15' directory. If method or examples have changed since last caching, delete this folder to clear cache.

Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


