In [None]:
import pandas as pd
import ast
import re
import spacy
import nltk

# Fetch the NLTK corpora used below: the stop-word list, WordNet, and the
# Open Multilingual WordNet data.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Touch the WordNet corpus right away so that a missing corpus is reported
# as a printed hint at this point.
try:
    from nltk.corpus import wordnet
    wordnet.ensure_loaded()
except LookupError:
    print("WordNet not found, please run nltk.download('wordnet') in your environment.")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# The 'stopwords' and 'wordnet' corpora are already downloaded at the top of
# this cell, so they are not requested a second time here.

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load the CSV file into a DataFrame
csv_file_path = '/content/Dataset.csv'  # Update this with the actual file path
df = pd.read_csv(csv_file_path)

# Fail fast with a readable message if the file lacks a column that the rest
# of this notebook reads ('Description', 'name', 'product_id').
required_columns = ('product_id', 'name', 'Description')
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"{csv_file_path} is missing required column(s): {missing_columns}")

# Parse the Description field into a single plain-text string
def parse_description(desc_str):
    """Flatten a raw ``Description`` cell into one plain-text string.

    A cell may hold plain text, or free text followed by a Python-literal
    list (typically a list of dictionaries). When such a list is present,
    the text in front of it and every value found in the list are joined
    with single spaces.

    Args:
        desc_str: Raw cell value. Anything that is not a ``str`` (for
            example the float NaN pandas uses for empty cells) is treated
            as a missing description.

    Returns:
        str: The flattened description. Missing input yields ``""``. When
        the text starting at the first ``[`` is not a valid Python literal,
        the stripped raw description is returned unchanged so that its text
        is not discarded.
    """
    if not isinstance(desc_str, str):
        return ""

    # Handle possible leading/trailing whitespace
    desc_str = desc_str.strip()

    list_start = desc_str.find('[')
    if list_start == -1:
        # No list found, return the description as is
        return desc_str

    pre_list_text = desc_str[:list_start].strip()
    list_str = desc_str[list_start:]

    try:
        # literal_eval only accepts Python literals, so no code is executed.
        desc_list = ast.literal_eval(list_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # The '[' did not open a literal (e.g. "Size [large] shirt"):
        # report it, but keep the raw text instead of dropping it.
        print(f"Error parsing Description: {e}")
        print(f"Raw Description Value: {desc_str}")  # Debug print
        return desc_str

    texts = []
    for item in desc_list:
        if isinstance(item, dict):
            texts.extend(str(value) for value in item.values())
        else:
            # Plain items (strings, numbers, ...) contribute their own text.
            texts.append(str(item))

    # Combine all texts into a single string
    return ' '.join([pre_list_text] + texts).strip()

# Flatten every raw Description cell into plain text
df['combined_description'] = df['Description'].map(parse_description)

# Text preprocessing: lowercase, strip punctuation, drop stop words, lemmatize
def preprocess_text(text):
    """Normalize free text into a space-separated string of cleaned tokens.

    The text is lowercased, bracket characters are removed, punctuation
    other than hyphens and slashes is replaced by spaces, and the result is
    tokenized with the module-level spaCy pipeline ``nlp``. Tokens found in
    ``stop_words`` are dropped; the remaining alphabetic or number-like
    tokens are lemmatized with ``lemmatizer``.

    Args:
        text: Text to clean. Any value that is not a ``str`` (for example
            the float NaN pandas uses for empty cells, or ``None``) is
            treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` for
        non-string input.
    """
    # Non-string values have no .lower(); treat them as empty text, the same
    # way parse_description treats a missing description.
    if not isinstance(text, str):
        return ""

    # 1. Lowercasing
    text = text.lower()

    # 2. Remove brackets but retain content
    text = re.sub(r'[\(\)\[\]\{\}]', '', text)

    # 3. Replace punctuation with spaces, keeping hyphens and slashes
    text = re.sub(r'[^\w\s\-\/]', ' ', text)

    # 4. Tokenization using spaCy
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.text in stop_words:
            continue
        # Keep digit-led compounds joined by a hyphen or slash (e.g.
        # "12-inch") verbatim. The pattern requires the '-' or '/', so a
        # token such as "500ml" is not handled by this branch.
        # NOTE(review): this only fires if the tokenizer emits the compound
        # as a single token - confirm against the spaCy model in use.
        if re.match(r'^\d+[-/]\w+$', token.text):
            tokens.append(token.text)
        elif token.is_alpha or token.like_num:
            # Lemmatize the token
            tokens.append(lemmatizer.lemmatize(token.text))

    # 5. Join tokens back into a string
    return ' '.join(tokens)

# Clean the flattened description first, then the product name, with the
# same preprocessing routine.
for source_col, target_col in (
    ('combined_description', 'preprocessed_description'),
    ('name', 'preprocessed_name'),
):
    df[target_col] = df[source_col].apply(preprocess_text)

# Preview the cleaned columns
preview_cols = ['product_id', 'preprocessed_name', 'preprocessed_description']
print("\nProcessed Results:")
print(df[preview_cols].head())

In [None]:
# Install required library
!pip install -U sentence-transformers

import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm

# This cell expects the DataFrame 'df' from the preprocessing cell, with the
# 'preprocessed_name' and 'preprocessed_description' columns already filled.

# Build the text to embed: name, a single space, then description
df['text_for_vectorization'] = (
    df['preprocessed_name'].add(' ').add(df['preprocessed_description'])
)

# Load the pre-trained sentence-embedding model
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

# Function to encode texts in batches
def encode_texts_in_batches(texts, model, batch_size=64):
    """Encode ``texts`` with ``model`` chunk by chunk, showing a progress bar.

    Args:
        texts: Sliceable sequence of strings (for example a list).
        model: Object exposing
            ``encode(sentences, batch_size=..., show_progress_bar=...)``,
            such as a ``SentenceTransformer``.
        batch_size (int): Number of texts handed to each ``encode`` call.
            It is also forwarded to ``encode`` so that the encoder processes
            each chunk as one batch instead of applying its own default.

    Returns:
        list: One embedding per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no
    # embeddings at all, so reject it up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        batch_embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            # The outer tqdm loop already reports progress.
            show_progress_bar=False,
        )
        embeddings.extend(batch_embeddings)
    return embeddings

# Turn the combined text column into one embedding per row
texts = df['text_for_vectorization'].tolist()
vector_embeddings = encode_texts_in_batches(texts, model, batch_size=64)

# Keep each embedding next to the row it was computed from
df['vector_embeddings'] = vector_embeddings

# Report how many embeddings were produced and their dimensionality
print(f"Number of embeddings: {len(vector_embeddings)}")
print(f"Embedding dimension: {len(vector_embeddings[0])}")

# Persist the embedding matrix together with the product ids (optional)
np.savez_compressed(
    'product_embeddings.npz',
    embeddings=np.vstack(vector_embeddings),
    product_ids=df['product_id'].values,
)

In [None]:
!pip install faiss
import faiss
import numpy as np
from sklearn.decomposition import PCA
import joblib  # Import joblib to save the PCA model

# Load the vector embeddings (assuming they are already in df['vector_embeddings'])
embeddings = np.vstack(df['vector_embeddings'].values)

# 1. Apply PCA to reduce dimensionality (optional, helps in reducing the vector size)
pca_dims = 64  # Choose appropriate dimensions for PCA
# A fixed random_state keeps the fitted projection reproducible when
# scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=pca_dims, random_state=42)
embeddings_pca = pca.fit_transform(embeddings)

# 2. ITQ Implementation
def iterative_quantization(embeddings, num_bits, num_iterations=50, random_state=None):
    """
    Learn an ITQ (iterative quantization) rotation and binarize embeddings.

    The data is zero-centered, an orthogonal rotation is initialised at
    random and then refined by alternating between (a) binarizing the
    rotated data and (b) solving for the orthogonal matrix that best maps
    the centered data onto those codes (via SVD).

    Args:
        embeddings (np.array): 2-D array of shape (n_samples, n_dims),
            expected to be PCA-reduced.
        num_bits (int): Length of each binary code. Must satisfy
            1 <= num_bits <= n_dims. Only the first ``num_bits`` columns of
            ``embeddings`` are used (for PCA output these are presumably the
            highest-variance components - confirm against the PCA in use).
        num_iterations (int): Number of refinement iterations.
        random_state (int | None): Seed for the initial rotation. ``None``
            draws from NumPy's global random state, as before.

    Returns:
        tuple: ``(binary_codes, rotation_matrix, embeddings_mean)`` where
            binary_codes (np.array): (n_samples, num_bits) array of 0/1 ints.
            rotation_matrix (np.array): (num_bits, num_bits) learned rotation.
            embeddings_mean (np.array): (num_bits,) mean removed before
                rotating; needed to encode new vectors the same way.

    Raises:
        ValueError: If ``embeddings`` is not a non-empty 2-D array or
            ``num_bits`` is outside 1..n_dims.
    """
    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array")

    n_dims = embeddings.shape[1]
    if not 1 <= num_bits <= n_dims:
        raise ValueError(
            f"num_bits must be between 1 and the embedding dimension ({n_dims}), got {num_bits}"
        )

    # One bit per retained dimension: honour num_bits by keeping the leading columns.
    embeddings = embeddings[:, :num_bits]

    # Step 1: Zero-center the data
    embeddings_mean = np.mean(embeddings, axis=0)
    centered_embeddings = embeddings - embeddings_mean

    # Step 2: Initialize a random orthogonal rotation (Q factor of a Gaussian matrix)
    if random_state is None:
        initial_matrix = np.random.randn(num_bits, num_bits)
    else:
        initial_matrix = np.random.default_rng(random_state).standard_normal((num_bits, num_bits))
    rotation_matrix, _ = np.linalg.qr(initial_matrix)

    # Step 3: Alternate between binarizing and re-fitting the rotation
    for _iteration in range(num_iterations):
        # Step 3.1: Rotate the embeddings
        rotated_embeddings = np.dot(centered_embeddings, rotation_matrix)

        # Step 3.2: Binarize to +/-1. Exact zeros map to -1 so the codes agree
        # with the final "> 0" threshold (np.sign would leave them at 0).
        signed_codes = np.where(rotated_embeddings > 0, 1.0, -1.0)

        # Step 3.3: Best orthogonal map from the data onto the codes
        C = np.dot(centered_embeddings.T, signed_codes)
        U, _, Vt = np.linalg.svd(C)
        rotation_matrix = np.dot(U, Vt)

    # Step 4: Final binary codes
    final_rotated_embeddings = np.dot(centered_embeddings, rotation_matrix)
    binary_codes = (final_rotated_embeddings > 0).astype(int)

    return binary_codes, rotation_matrix, embeddings_mean

# Choose the number of bits for hashing (e.g., 64-bit binary codes)
num_bits = 64

# Apply ITQ to the embeddings
binary_codes, rotation_matrix, embeddings_mean = iterative_quantization(embeddings_pca, num_bits)

# Save PCA model
joblib.dump(pca, 'pca_model.joblib')

# Save rotation matrix and embeddings mean
np.save('rotation_matrix.npy', rotation_matrix)
np.save('embeddings_mean.npy', embeddings_mean)

# Convert binary codes to string or integer representation (optional)
binary_strings = [''.join(map(str, binary_code)) for binary_code in binary_codes]
binary_integers = [int(b_str, 2) for b_str in binary_strings]

# Add binary codes to the DataFrame
df['binary_hash'] = binary_strings
df['binary_int'] = binary_integers

# Save binary codes to file (optional)
df[['product_id', 'binary_hash']].to_csv('binary_hashes.csv', index=False)

# The import below was fused onto the end of the to_csv(...) line above,
# which is a syntax error; it now stands on its own line.
import pandas as pd

# Load the first CSV (the one with the binary hash codes)
embeddings_csv_path = 'binary_hashes.csv'  # Change this to the path of your first CSV
# Read 'binary_hash' as text: it is a string of 0/1 digits, and letting
# read_csv infer a numeric dtype would drop leading zeros.
embeddings_df = pd.read_csv(embeddings_csv_path, dtype={'binary_hash': str})

# Load the second CSV (the one with product details)
product_details_csv_path = 'Dataset.csv'  # Change this to the path of your second CSV
product_details_df = pd.read_csv(product_details_csv_path)

# Join the hash codes onto the product details by 'product_id'. An inner
# join keeps only the products present in both files.
final_df = pd.merge(embeddings_df, product_details_df, on='product_id', how='inner')

# Save the final DataFrame to a new CSV file
final_dataset_path = 'final_dataset.csv'
final_df.to_csv(final_dataset_path, index=False)

print(f"Final dataset saved to {final_dataset_path}")