In [None]:
import pandas as pd
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", "123456")  # Replace with your actual credentials
)

# Load your CSV file.
# `binary_hash` contains digit-only strings (e.g. "0101..."). Pin it to text
# so pandas' dtype inference can never parse it as a number and silently
# drop leading zeros; the per-character conversion below needs every bit.
final_df = pd.read_csv('Dataset-final.csv', dtype={'binary_hash': str})

# Function to convert binary hash to a list of integers
def binary_hash_to_list(binary_hash):
    """Convert a binary hash string such as ``"0101"`` into ``[0, 1, 0, 1]``.

    Leading and trailing whitespace is stripped before conversion, the same
    way ``process_binary_hashes`` treats this column, so a stray space or
    newline in the CSV cell does not raise ``ValueError``.

    Args:
        binary_hash (str): String made of digit characters.

    Returns:
        list[int]: One integer per character, in order.

    Raises:
        ValueError: If a non-digit character remains after stripping.
    """
    return [int(bit) for bit in binary_hash.strip()]

# Expand each binary hash string into a list of integers, one per character
final_df['hashed_vector'] = final_df['binary_hash'].map(binary_hash_to_list)

# Synonym rules: every entry is one comma-separated group of equivalent terms
synonyms = [
    "laptop, notebook",
    "tv, television",
    "cellphone, smartphone, mobile phone",
    "headphones, headset",
    "camera, camcorder",
    # Add more synonyms as needed
]

# Name of the index that is dropped and rebuilt below
index_name = 'products2'

# Assemble the index definition section by section
index_settings = {"settings": {"analysis": {}}, "mappings": {}}

# Token filters referenced by the custom analyzer
index_settings["settings"]["analysis"]["filter"] = {
    "english_stop": {"type": "stop", "stopwords": "_english_"},
    "english_stemmer": {"type": "stemmer", "language": "english"},
    "english_possessive_stemmer": {
        "type": "stemmer",
        "language": "possessive_english",
    },
    "synonym_filter": {"type": "synonym", "synonyms": synonyms},
}

# Analyzer applied to the product text fields; filters run in the listed order
index_settings["settings"]["analysis"]["analyzer"] = {
    "custom_english_analyzer": {
        "tokenizer": "standard",
        "filter": [
            "lowercase",
            "english_possessive_stemmer",
            "english_stop",
            "english_stemmer",
            "synonym_filter",
        ],
    }
}

# Field mappings for the indexed product documents
index_settings["mappings"]["properties"] = {
    "product_id": {"type": "keyword"},
    # `dims` must equal the length of every `hashed_vector` list;
    # change 64 if your hashes have a different number of bits.
    "hashed_vector": {"type": "dense_vector", "dims": 64},
    "product_url": {"type": "keyword"},
    "product_description": {
        "type": "text",
        "analyzer": "custom_english_analyzer",
    },
    "product_name": {
        "type": "text",
        "analyzer": "custom_english_analyzer",
    },
    "images": {"type": "keyword"},
    "price": {"type": "float"},
}

# Start from a clean slate: remove any previous copy of the index ...
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# ... then create it with the analyzers and mappings defined above
es.indices.create(index=index_name, body=index_settings)

# Function to index documents
def index_documents(df, index_name):
    """Index every row of ``df`` into the Elasticsearch index ``index_name``.

    Each row is sent as one document through the module-level ``es`` client.
    A failure on one row is printed and counted instead of raised, so a
    single bad row does not abort the run.

    Args:
        df (pandas.DataFrame): Rows to index. Reads the columns
            ``product_id``, ``hashed_vector``, ``Producturl``,
            ``description``, ``name``, ``images`` and ``price``.
        index_name (str): Target index.

    Returns:
        None. Progress and the final success/error counts are printed.
    """
    def _none_if_nan(value):
        # pandas marks an empty CSV cell with float NaN, which the JSON
        # encoder would emit as the bare token NaN (not valid JSON) and the
        # server would reject. NaN is the only float that differs from itself.
        return None if isinstance(value, float) and value != value else value

    print(f"Starting the indexing process for index: {index_name}")
    success_count = 0
    error_count = 0
    for _, row in df.iterrows():
        # Check if 'images' is a string and split if possible, else set to an empty list
        images = row['images'].split(' | ') if isinstance(row['images'], str) else []

        document = {
            "product_id": row['product_id'],
            "hashed_vector": row['hashed_vector'],  # This should be a list of integers
            "product_url": _none_if_nan(row['Producturl']),
            "product_description": _none_if_nan(row['description']),
            "product_name": _none_if_nan(row['name']),
            "images": images,  # Use the split images or empty list
            "price": _none_if_nan(row['price'])
        }
        try:
            # `document=` is the named parameter of the index API in the
            # client generation that also provides `basic_auth`.
            es.index(index=index_name, document=document)
            success_count += 1
        except Exception as e:
            # Deliberately broad: this is a best-effort bulk load, so report
            # the failing row and keep going.
            print(f"Error indexing document {row['product_id']}: {e}")
            error_count += 1

    print(f"Indexing completed. Successfully indexed {success_count} documents. {error_count} errors occurred.")

# Index the documents: send every row of final_df to the index named by
# index_name (the index was dropped and recreated earlier in this cell)
index_documents(final_df, index_name)

In [None]:
import json
from elasticsearch import Elasticsearch
import numpy as np
from sklearn.decomposition import PCA
import joblib
import pandas as pd
import faiss

# Connect to Elasticsearch
es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", "123456")  # Replace with your actual credentials
)

# 1. Load the dataset.
# `binary_hash` contains digit-only strings (e.g. "0101..."). Pin it to text
# so pandas' dtype inference can never parse it as a number and silently
# drop leading zeros; the hash parsing below needs every bit.
df = pd.read_csv('Dataset-final.csv', dtype={'binary_hash': str})
# 2. Prepare the hash codes
def process_binary_hashes(binary_hash_str):
    """Turn a string of digit characters into a 1-D ``uint8`` array.

    Leading and trailing whitespace is ignored; every remaining character
    is converted with ``int`` and stored in order.

    Args:
        binary_hash_str (str): Hash text such as ``"0110"``.

    Returns:
        numpy.ndarray: Array of dtype ``uint8`` with one element per character.
    """
    trimmed = binary_hash_str.strip()
    return np.fromiter((int(ch) for ch in trimmed), dtype=np.uint8)

# Parse every hash string into its own uint8 array
df['binary_code'] = df['binary_hash'].map(process_binary_hashes)

# Stack the per-row arrays into a single matrix with one row per product
binary_codes = np.vstack(df['binary_code'].tolist())

def pack_binary_codes(binary_codes):
    """Pack each row of a 0/1 matrix into bytes, eight bits per byte.

    Packing runs along axis 1, so every input row yields one output row.
    ``numpy.packbits`` zero-pads a row whose length is not a multiple of 8.

    Args:
        binary_codes (numpy.ndarray): 2-D array of bits.

    Returns:
        numpy.ndarray: ``uint8`` array holding the packed rows.
    """
    return np.packbits(binary_codes, axis=1)

# 3. Build the binary index
# The FAISS binary index is sized by the number of bits per code ...
num_bits = binary_codes.shape[1]
index = faiss.IndexBinaryFlat(num_bits)

# ... and is fed the byte-packed form of the codes
binary_codes_packed = pack_binary_codes(binary_codes)
index.add(binary_codes_packed)

# Elasticsearch index queried by the keyword search
index_name = 'products2'  # Your existing index name

# Saved artifacts used to hash a query.
# NOTE: joblib.load unpickles the file, so only load files you trust.
rotation_matrix = np.load('rotation_matrix.npy')
embeddings_mean = np.load('embeddings_mean.npy')
pca = joblib.load('pca_model.joblib')

# Sentence embedding model (replace with your actual model)
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('all-mpnet-base-v2')

def query_to_hashed_vector(query):
    """
    Hash a text query into a list of 0/1 integers.

    The pipeline relies on the module-level ``embedding_model``, ``pca``,
    ``embeddings_mean`` and ``rotation_matrix``:
    embed -> PCA transform -> subtract ``embeddings_mean`` ->
    multiply by ``rotation_matrix`` -> 1 where the value is > 0, else 0.

    Args:
        query (str): The search query text.

    Returns:
        list: The binarized code as Python integers (a flat list, assuming
        the loaded artifacts produce a 1-D result for a single query).
    """
    # Embed the text and project it with the fitted PCA; transform() expects
    # a batch, so wrap the single embedding and unwrap the single result.
    embedding = embedding_model.encode(query)
    reduced = pca.transform([embedding])[0]

    # Centre with the stored mean, then apply the stored rotation
    rotated = np.dot(reduced - embeddings_mean, rotation_matrix)

    # Sign-threshold each component and hand back plain Python ints
    return (rotated > 0).astype(int).tolist()

def keyword_search(query_text, index_name):
    """Run a ``best_fields`` ``multi_match`` query through the module-level ``es`` client.

    The query searches ``product_name`` (no boost) and
    ``product_description`` (boost 3). No ``analyzer`` override is sent.

    NOTE(review): the previous inline comment said ``product_name`` was the
    boosted field, but the ``^3`` boost has always been on
    ``product_description`` - confirm which field is meant to carry it.

    Args:
        query_text (str): Text to search for.
        index_name (str): Index to search.

    Returns:
        The response of ``es.search``, unmodified.
    """
    searched_fields = ["product_name", "product_description^3"]
    multi_match = {
        "query": query_text,
        "fields": searched_fields,
        "type": "best_fields",
    }
    return es.search(index=index_name, body={"query": {"multi_match": multi_match}})



def semantic_search_hash(query, index, df, k=10):
    """Return the rows of ``df`` whose hash codes are nearest to the query's.

    The query is hashed with ``query_to_hashed_vector``, packed into bytes
    and searched in ``index``. The positions returned by the index are used
    to select rows of ``df`` by position, so ``df`` must be in the same row
    order in which the codes were added to ``index``.

    Args:
        query (str): Search text.
        index: FAISS binary index; must provide ``search`` and ``d``
            (code length in bits).
        df (pandas.DataFrame): Rows aligned with the vectors in ``index``.
        k (int): Number of neighbours to request.

    Returns:
        pandas.DataFrame: A copy of the matched rows with two extra columns,
        ``hamming_distance`` and ``similarity`` (``1 - distance / bits``).
        Holds fewer than ``k`` rows when the index has fewer than ``k``
        vectors.
    """
    # Hash the query, then pack it the same way as the dataset codes and add
    # the leading batch axis that search() expects.
    query_bits = np.array(query_to_hashed_vector(query), dtype=np.uint8)
    query_hash_packed = np.expand_dims(np.packbits(query_bits), axis=0)

    # Perform the search
    distances, indices = index.search(query_hash_packed, k)

    # FAISS fills unused result slots with label -1 when fewer than k vectors
    # are available. df.iloc[-1] would silently return the LAST row, so drop
    # those slots before touching the DataFrame.
    found = indices[0] >= 0
    results = df.iloc[indices[0][found]].copy()
    results['hamming_distance'] = distances[0][found]

    # Normalise by the code length of the index that was actually searched
    # rather than by a module-level constant.
    results['similarity'] = 1 - (results['hamming_distance'] / index.d)

    return results

def hybrid_search_hash(query, index, df, k=10, alpha=0.7):
    """Combine hash-based semantic search with Elasticsearch keyword search.

    Semantic hits come from ``semantic_search_hash``; keyword hits come from
    ``keyword_search`` against the module-level ``index_name``. Both result
    sets are outer-merged on ``product_id`` and ranked by
    ``alpha * semantic_score + (1 - alpha) * keyword_score``, where a missing
    score counts as 0.

    Columns present on both sides (``product_url``, ``product_name``,
    ``product_price``, ``product_description``, ``product_images``) receive
    the pandas merge suffixes: ``_x`` for the semantic side, ``_y`` for the
    keyword side.

    NOTE(review): ``semantic_score`` is ``1 - hamming / bits`` while
    ``keyword_score`` is the raw Elasticsearch ``_score``; the two are not on
    a common scale, so ``alpha`` does not weight them evenly - confirm this
    is intended.

    Args:
        query (str): Search text.
        index: FAISS binary index passed through to ``semantic_search_hash``.
        df (pandas.DataFrame): Dataset rows aligned with ``index``.
        k (int): Number of semantic neighbours requested and of rows returned.
        alpha (float): Weight of the semantic score.

    Returns:
        pandas.DataFrame: The top ``k`` merged rows, best score first.
    """
    # Perform the semantic search
    semantic_results = semantic_search_hash(query, index, df, k)
    semantic_results['semantic_score'] = semantic_results['similarity']

    # Rename the dataset columns to the names used for the keyword hits
    semantic_results = semantic_results.rename(columns={
        'Producturl': 'product_url',
        'name': 'product_name',
        'price': 'product_price',
        'description': 'product_description',
        'images': 'product_images',
    })

    # Perform the keyword search and flatten the hits into records
    keyword_results = keyword_search(query, index_name)
    keyword_hits = keyword_results['hits']['hits']
    keyword_data = [
        {
            'product_id': hit['_source']['product_id'],
            'product_price': hit['_source'].get('price'),
            'product_url': hit['_source'].get('product_url'),
            'product_name': hit['_source'].get('product_name', ''),
            'product_description': hit['_source'].get('product_description', ''),
            'product_images': hit['_source'].get('images', []),  # Include images
            'keyword_score': hit['_score']
        }
        for hit in keyword_hits
    ]

    # Build the frame with explicit columns: with zero hits a bare
    # pd.DataFrame([]) has no 'product_id' column and the merge below would
    # raise KeyError.
    keyword_columns = [
        'product_id', 'product_price', 'product_url', 'product_name',
        'product_description', 'product_images', 'keyword_score',
    ]
    keyword_df = pd.DataFrame(keyword_data, columns=keyword_columns)
    if keyword_df.empty:
        # An empty frame defaults to object dtype; align the merge key and
        # the score with the other side so the merge and the arithmetic
        # below behave exactly as they do for non-empty results.
        keyword_df = keyword_df.astype({
            'product_id': semantic_results['product_id'].dtype,
            'keyword_score': float,
        })

    # Merge the semantic and keyword results by product_id
    combined = pd.merge(semantic_results, keyword_df, on='product_id', how='outer')

    # A product found by only one of the searches scores 0 on the other
    combined['semantic_score'] = combined['semantic_score'].fillna(0)
    combined['keyword_score'] = combined['keyword_score'].fillna(0)

    # Calculate the combined score
    combined['combined_score'] = alpha * combined['semantic_score'] + (1 - alpha) * combined['keyword_score']

    # Sort by combined score and return top k results
    combined = combined.sort_values(by='combined_score', ascending=False)

    return combined.head(k)





query = "I want some gym clothes"
results = hybrid_search_hash(query, index, df, k=10, alpha=0.7)

# Columns to report; the `_y` suffix selects the values that came from the
# keyword (Elasticsearch) side of the merge.
columns_to_keep = [
    "product_id", "product_url_y", "product_name_y", "product_description_y",
    "product_images_y", "keyword_score", "combined_score","product_price_y"
]

# Filter the results DataFrame to only include the columns you want
filtered_results = results[columns_to_keep]

# Convert the filtered DataFrame to a list of records.
# Rows found only by the semantic search have NaN in every `_y` column, and
# json.dumps would print NaN as the bare token `NaN`, which is not valid
# JSON. Map NaN (the only float that differs from itself) to None so it is
# emitted as `null`.
filtered_results_dict = [
    {
        column: (None if isinstance(value, float) and value != value else value)
        for column, value in record.items()
    }
    for record in filtered_results.to_dict(orient='records')
]

# Convert the records to a JSON string
filtered_results_json = json.dumps(filtered_results_dict, indent=4)

# Output the filtered JSON string
print(filtered_results_json)