In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be opened by ordinary file paths in later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from scipy.spatial.distance import cityblock, jaccard
from google.colab import drive

# Mount Google Drive (repeated here so this cell can run on its own; when the
# drive is already mounted the call only prints a notice)
drive.mount('/content/drive')

# Load the dataset from Drive; the CSV is decoded as latin-1
file_path = "/content/drive/MyDrive/NLP/Project /ecommerce_complete_Rank_Crowd_AGG_Descriptions_ALL_ROWS.csv"
data = pd.read_csv(file_path, encoding='latin-1')

# Quick sanity check: first rows and the available column names
print(data.head())
print(data.columns)

# Extract product descriptions, dropping rows whose description is missing.
# The resulting Series keeps the index labels of `data`, so it may be shorter
# than `data` and its labels are what link a description back to its row.
descriptions = data['product_description'].dropna()

# Build bag-of-words count vectors: English stop words are removed and the
# vocabulary is capped at 1000 terms (max_features)
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
bow_matrix = vectorizer.fit_transform(descriptions)
# Dense array with one row per entry of `descriptions`, one column per vocabulary term
product_vectors = bow_matrix.toarray()

# Free-text search queries to evaluate. Each one is later transformed with the
# fitted `vectorizer` and compared against every product description vector.
queries = [
    "Wireless Bluetooth Headphones",
    "Stainless Steel Water Bottle",
    "Laptop Carrying Case",
    "Gaming Mouse with RGB Lighting",
    "Organic Cotton Bedsheets",
    "Action Camera Waterproof 4K",
    "Electric Standing Desk Adjustable Height",
    "Smartphone Screen Protector Tempered Glass",
    "Noise-Canceling Earbuds for Travel",
    "Portable Mini Projector for Home Theater"
]

# Rank every product description against each query under four measures and
# print the best matches per measure.
#
# Euclidean, Manhattan and Jaccard are distances (lower = closer match);
# cosine is a similarity (higher = closer match).
TOP_K = 50  # number of matches printed per metric

# --- Loop-invariant work: done once, not once per query / per product ---
# Titles aligned row-for-row with `product_vectors`. `descriptions.index` holds
# index *labels* of `data`, so the lookup must be label-based (`.loc`);
# positional `.iloc` only hits the same rows while `data` has a default
# RangeIndex.
titles = data.loc[descriptions.index, 'product_title'].to_numpy()
# Term present / absent per product, and number of distinct terms per product
product_presence = product_vectors > 0
product_term_counts = product_presence.sum(axis=1)

for query in queries:
    print(f"\n--- Results for Query: '{query}' ---\n")

    # Transform the query into BoW using the vocabulary fitted on the descriptions
    query_vector = vectorizer.transform([query]).toarray()[0]
    query_presence = query_vector > 0
    if not query_presence.any():
        # An all-zero query vector makes every metric independent of the query
        print("Warning: no query term is in the vectorizer vocabulary; the rankings below are not meaningful.")

    # Distances and similarities between the query and every product
    euclidean = euclidean_distances(product_vectors, [query_vector]).flatten()
    cosine = cosine_similarity(product_vectors, [query_vector]).flatten()
    # Manhattan (L1) distance, vectorized over all products at once
    manhattan = np.abs(product_vectors - query_vector).sum(axis=1)

    # Jaccard dissimilarity on term presence: (|A ∪ B| - |A ∩ B|) / |A ∪ B|.
    # Defined as 0 when both vectors are empty, so no NaN can reach the sort.
    shared_terms = product_presence[:, query_presence].sum(axis=1)
    union_terms = product_term_counts + query_presence.sum() - shared_terms
    jaccard_scores = np.divide(
        union_terms - shared_terms,
        union_terms,
        out=np.zeros(len(union_terms), dtype=float),
        where=union_terms > 0,
    )

    # Combine results for ranking (one record per product description)
    results = [
        {'title': title, 'euclidean': e, 'cosine': c, 'manhattan': m, 'jaccard': j}
        for title, e, c, m, j in zip(titles, euclidean, cosine, manhattan, jaccard_scores)
    ]

    # Display top matches for each metric
    metrics = ['euclidean', 'cosine', 'manhattan', 'jaccard']
    for metric in metrics:
        print(f"\nSorted by {metric.capitalize()}:\n")
        # Only cosine is ranked descending; `sorted` is stable in both
        # directions, so ties keep their original row order.
        best_first = sorted(results, key=lambda row: row[metric], reverse=(metric == 'cosine'))
        for match in best_first[:TOP_K]:
            print(f"Title: {match['title']}, {metric.capitalize()}: {match[metric]}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
    _unit_id  relevance  relevance:variance  \
0  711158459       3.67               0.471   
1  711158460       4.00               0.000   
2  711158461       4.00               0.000   
3  711158462       3.67               0.471   
4  711158463       3.33               0.471   

                                       product_image  \
0  http://thumbs2.ebaystatic.com/d/l225/m/mzvzEUI...   
1  http://thumbs3.ebaystatic.com/d/l225/m/mJNDmSy...   
2  http://thumbs4.ebaystatic.com/d/l225/m/m10NZXA...   
3  http://thumbs2.ebaystatic.com/d/l225/m/mZZXTmA...   
4  http://thumbs3.ebaystatic.com/d/l225/m/mzvzEUI...   

                                        product_link  \
0  http://www.ebay.com/itm/Sony-PlayStation-4-PS4...   
1  http://www.ebay.com/itm/Sony-PlayStation-4-Lat...   
2  http://www.ebay.com/itm/Sony-PlayStation-4-PS4...   
3  http://www.ebay.com/itm/

*Average Scores*

* Euclidean: 2.5/50 (second-lowest average score)
* Cosine: 24.5/50
* Jaccard: 16.7/50
* Manhattan: 0/50 (lowest average score, indicating that no matches were found for most queries)

*Top Two Algorithms*

Based on the average scores, the top two distance algorithms are:

1. *Cosine*: with an average score of 24.5/50, indicating a moderate level of similarity between the queries and the product descriptions.
2. *Jaccard*: with an average score of 16.7/50, indicating a relatively low level of similarity between the queries and the product descriptions.

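These results suggest that cosine similarity is the most effective of the four measures at matching queries to product descriptions, while the Jaccard measure is more sensitive to exact keyword matches. Because all four measures operate on bag-of-words vectors, none of them captures meaning beyond shared terms; cosine similarity has the advantage of normalizing for description length, so long descriptions are not penalized simply for containing more words.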
