In [1]:
from elasticsearch import Elasticsearch
import os

# Connection settings come from the environment when set, so real credentials
# never need to be committed with the notebook. The fallbacks reproduce the
# original local development setup.
#   ES_URL       - cluster endpoint
#   ES_USERNAME  - basic-auth user
#   ES_PASSWORD  - basic-auth password
#   ES_CA_CERTS  - path to the cluster's HTTP CA certificate
# os.path.join builds the default certificate path with the separator of the
# current OS (the original literal used Windows backslashes only).
es = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD", "elastic"),
    ),
    ca_certs=os.environ.get(
        "ES_CA_CERTS",
        os.path.join("elasticsearch-8.14.1", "config", "certs", "http_ca.crt"),
    ),
)
# Last expression of the cell: displays whether the cluster answered the ping.
es.ping()

True

In [2]:
# Close the Elasticsearch client created in the first cell.
# NOTE(review): later cells in this notebook still call `es.indices.*`,
# `es.index` and `es.search` on this same object. Confirm the client is still
# usable after close(), or move this cell to the end of the notebook.
es.close()

In [3]:
import pandas as pd
# Read the product table from disk and preview its first rows.
csv_path = "data2.csv"
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [49]:
# Show the missing-value pattern BEFORE filling. Only the last expression of a
# cell is displayed automatically, so this needs an explicit print.
print(df.isna().value_counts())

# Replace every missing cell with the literal string "None". Rebinding `df`
# (instead of inplace=True) keeps the cell safe to re-run.
# NOTE(review): the fill applies to all columns; if a numeric column ever
# contains NaN it will receive the string "None" - confirm that is intended.
df = df.fillna("None")

In [50]:
# Re-run the missing-value check after the fill: a single all-False row in the
# result means no column has missing cells left.
df.isna().value_counts()

ProductID  ProductName  ProductBrand  Gender  Price (INR)  NumImages  Description  PrimaryColor
False      False        False         False   False        False      False        False           12491
Name: count, dtype: int64

In [3]:
from sentence_transformers import SentenceTransformer
# Load the "all-mpnet-base-v2" sentence-embedding model once. Later cells call
# `model.encode` to embed product descriptions, product names and the search
# keyword.
# NOTE(review): the index mapping declares dims=768 for both vector fields -
# confirm this matches the embedding size this model produces.
model = SentenceTransformer("all-mpnet-base-v2")

  from tqdm.autonotebook import tqdm, trange


In [52]:
# Embed every description with ONE batched encode call instead of one
# model.encode call per row; the model batches the texts internally, which is
# far faster for a frame of this size. encode() returns a 2-D array with one row
# per text; list(...) splits it into per-row vectors so each cell of the new
# column holds a single 1-D embedding, as before.
df["DescriptionVector"] = list(model.encode(df["Description"].tolist()))

In [53]:
# Embed every product name with ONE batched encode call instead of one
# model.encode call per row. encode() returns a 2-D array with one row per
# text; list(...) splits it into per-row vectors so each cell of the new column
# holds a single 1-D embedding, as before.
df["ProductNameVector"] = list(model.encode(df["ProductName"].tolist()))

In [4]:
def _analyzed_text(analyzer_name):
    """Return a `text` sub-field mapping bound to the named analyzer."""
    return {"type": "text", "analyzer": analyzer_name}


def _embedding_mapping():
    """Return the mapping used by both indexed 768-dimension vector fields."""
    return {"type": "dense_vector", "dims": 768, "index": True, "similarity": "l2_norm"}


# Analysis section: three named analyzers plus the custom token filter that
# the "trigram_analyzer" refers to by the name "shingle".
_analysis = {
    "analyzer": {
        # Standard analyzer with the built-in English stop-word list.
        "english_analyzer": {
            "type": "standard",
            "stopwords": "_english_"
        },
        # Standard tokenizer, then lowercasing and ASCII folding.
        "hindi_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "asciifolding"]
        },
        # Standard tokenizer, lowercasing, then the shingle filter below
        # (2- and 3-token shingles, single tokens kept as well).
        "trigram_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "shingle"]
        }
    },
    "filter": {
        "shingle": {
            "type": "shingle",
            "min_shingle_size": 2,
            "max_shingle_size": 3,
            "output_unigrams": True
        }
    }
}

# Field mappings, in the same order as the original definition.
_properties = {
    "ProductID": {"type": "long"},
    "ProductName": {
        "type": "text",
        "fields": {
            "english": _analyzed_text("english_analyzer"),
            "hindi": _analyzed_text("hindi_analyzer"),
            "trigram": _analyzed_text("trigram_analyzer")
        }
    },
    # Completion-suggester field; it is filled with a copy of ProductName
    # when the documents are indexed.
    "ProductName_suggest": {"type": "completion"},
    "ProductBrand": {"type": "text"},
    "Gender": {"type": "text"},
    "Price (INR)": {"type": "long"},
    "NumImages": {"type": "long"},
    "Description": {
        "type": "text",
        "fields": {
            "english": _analyzed_text("english_analyzer"),
            "hindi": _analyzed_text("hindi_analyzer")
        }
    },
    "PrimaryColor": {"type": "text"},
    "DescriptionVector": _embedding_mapping(),
    "ProductNameVector": _embedding_mapping()
}

# Complete index definition handed to the index-creation call.
index_settings = {
    "settings": {"analysis": _analysis},
    "mappings": {"properties": _properties}
}

In [57]:
# Start from a clean slate: drop the target index when it is already present
# so the create call in the next cell does not collide with it.
index_name = "all_products"
index_already_present = es.indices.exists(index=index_name)
if index_already_present:
    es.indices.delete(index=index_name)

In [58]:
# Create the index, passing the analysis settings and the field mappings as
# the two top-level sections of the request.
es.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_products'})

In [59]:
from elasticsearch import helpers

# One plain dict per DataFrame row. The product name is copied into
# "ProductName_suggest", the completion-suggester field of the mapping.
record_list = df.to_dict("records")
for record in record_list:
    record["ProductName_suggest"] = record["ProductName"]

# Index through the bulk helper instead of issuing one HTTP request per
# document. ProductID is reused as the document id, so re-running the cell
# overwrites documents rather than duplicating them.
bulk_actions = (
    {"_index": "all_products", "_id": record["ProductID"], "_source": record}
    for record in record_list
)

# raise_on_error / raise_on_exception are disabled to keep the original
# best-effort behaviour: failures are collected and printed, not raised.
# chunk_size is kept modest because every document carries two embeddings.
indexed_count, bulk_errors = helpers.bulk(
    es,
    bulk_actions,
    chunk_size=200,
    raise_on_error=False,
    raise_on_exception=False,
)
for error in bulk_errors:
    print(error)
print(f"Indexed {indexed_count} of {len(record_list)} documents")

In [5]:
import numpy as np
def normalize_vector(vector):
    """Scale `vector` to unit Euclidean length.

    Args:
        vector: A NumPy array or any array-like (for example a plain list of
            floats, which is how vectors come back from a JSON response).

    Returns:
        A NumPy array pointing in the same direction with L2 norm 1. A
        zero-length vector cannot be scaled and is returned unchanged (as an
        array); an input that already is an array is returned as the same
        object in that case.
    """
    # asarray is a no-op for arrays and lets plain lists be divided below.
    vec = np.asarray(vector)
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

In [7]:
input_keyword = "jeans "

# Spell check / autocomplete: ask the completion suggester on
# "ProductName_suggest" for fuzzy matches of the typed prefix.
spell_check_query = {
    "suggest": {
        "product-name-suggest": {
            "prefix": input_keyword,
            "completion": {
                "field": "ProductName_suggest",
                "fuzzy": {
                    "fuzziness": "auto"
                }
            }
        }
    }
}

spell_check_res = es.search(index="all_products", body=spell_check_query)

# The suggester's entry list can be missing or empty, so guard before taking
# the first entry (indexing [0] on an empty list would raise IndexError).
suggest_entries = spell_check_res.get('suggest', {}).get('product-name-suggest', [])
suggestions = suggest_entries[0].get('options', []) if suggest_entries else []

# Use the top suggestion as the search text when there is one; otherwise fall
# back to the keyword exactly as typed.
if suggestions:
    corrected_keyword = suggestions[0]['text']
    print(f"Corrected keyword: {corrected_keyword}")
else:
    corrected_keyword = input_keyword
vector_of_input_keyword = model.encode(corrected_keyword)

# Autofill suggestions (the same options returned by the query above)
for suggestion in suggestions:
    print(f"Suggestion: {suggestion['text']}")

# k-NN query for DescriptionVector
knn_query_description = {
    "field": "DescriptionVector",
    "query_vector": vector_of_input_keyword,
    "k": 10,
    "num_candidates": 10000
}

# k-NN query for ProductNameVector
knn_query_productname = {
    "field": "ProductNameVector",
    "query_vector": vector_of_input_keyword,
    "k": 10,
    "num_candidates": 10000
}

# Fetch BOTH embeddings with every hit. The re-ranking step averages the
# query's similarity to the description vector and to the product-name vector;
# when each request returned only its own vector, that average could never be
# computed and every hit was scored on a single vector.
knn_source_fields = ["ProductName", "Description", "DescriptionVector", "ProductNameVector"]

# Perform k-NN search
res_description = es.search(index="all_products", knn=knn_query_description, _source=knn_source_fields)
res_productname = es.search(index="all_products", knn=knn_query_productname, _source=knn_source_fields)

# Combine results
combined_hits = res_description["hits"]["hits"] + res_productname["hits"]["hits"]

# Remove duplicate hits based on the document ID. When a document appears in
# both result lists, the later (product-name) hit and its _score win.
combined_hits = list({hit["_id"]: hit for hit in combined_hits}.values())

# Calculate cosine similarity
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1, vec2: Equal-length 1-D array-likes (NumPy arrays or lists).

    Returns:
        dot(vec1, vec2) / (|vec1| * |vec2|). When either vector has zero
        length the cosine is undefined; 0.0 is returned instead of NaN so the
        value can still be used as a sort key.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

# Re-rank the de-duplicated k-NN hits: score each one by its cosine similarity
# to the query embedding, then order by an equal-weight blend of that
# similarity and the score Elasticsearch assigned to the hit.
def _mean_query_similarity(source):
    """Average the query's cosine similarity over the vectors `source` carries.

    Uses "DescriptionVector" and "ProductNameVector" when present; returns 0
    when the document exposes neither.
    """
    scores = [
        cosine_similarity(vector_of_input_keyword, source[field])
        for field in ("DescriptionVector", "ProductNameVector")
        if source.get(field) is not None
    ]
    return sum(scores) / len(scores) if scores else 0


def _blended_score(hit):
    """Sort key: half recomputed similarity, half Elasticsearch score."""
    return 0.5 * hit["_source"]["similarity_score"] + 0.5 * hit["_score"]


def _show_hit(hit):
    """Print a hit's product name, description and Elasticsearch score."""
    print(f"ProductName: {hit['_source']['ProductName']}, Description: {hit['_source']['Description']}, Score: {hit['_score']}")


# Attach the similarity to every hit's _source, then sort best-first.
filtered_hits = list(combined_hits)
for hit in filtered_hits:
    hit["_source"]["similarity_score"] = _mean_query_similarity(hit["_source"])
filtered_hits.sort(key=_blended_score, reverse=True)

# Keyword-specific filter: when the search text mentions "shoes", keep only
# products whose name mentions it as well; other searches pass through.
searching_for_shoes = 'shoes' in corrected_keyword.lower()
final_hits = [
    hit for hit in filtered_hits
    if not searching_for_shoes or 'shoes' in hit['_source']['ProductName'].lower()
]

# Display the top results
for hit in final_hits[:20]:
    _show_hit(hit)

# Multi-language query: full-text match of the corrected keyword across the
# base fields and their analyzer-specific sub-fields (ProductName boosted x3).
multi_lang_query = {
    "multi_match": {
        "query": corrected_keyword,
        "fields": ["ProductName^3", "ProductName.english", "ProductName.hindi", "Description", "Description.english", "Description.hindi"]
    }
}

multi_lang_res = es.search(index="all_products", body={"query": multi_lang_query})
for hit in multi_lang_res["hits"]["hits"]:
    _show_hit(hit)

ProductName: Slub Blue Washed Slim Jeans, Description: A pair of blue 5-pocket mid-rise jeans, lightly washed, has whiskers, a zip fly with a button closure, and a waistband with belt loops, Score: 0.62424034
ProductName: Slub Blue Washed Slim Jeans, Description: A pair of blue 5-pocket mid-rise jeans, lightly washed and lightly distressed, has whiskers and chevrons, a button fly closure, and a waistband with belt loopsOur stylist has paired these jeans with a beltThese jeans will not come with a belt, Score: 0.62424034
ProductName: Calvin Klein Jeans Men Black Slim Fit Light Fade Stretchable Jeans, Description:   Dark light fade black jeans   Slim fit,  mid-rise   Clean look   Stretchable   5 pocket   Length: regular  , Score: 0.60754806
ProductName: Pepe Jeans Men Blue Heavy Fade Stretchable Jeans, Description:   Dark heavy fade blue jeans   Regular fit,  mid-rise   Clean look   Stretchable   5 pocket   Length: regular  , Score: 0.60743755
ProductName: Calvin Klein Jeans Men Grey Sol