In [1]:
import json
import os
import re
from io import StringIO
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset: download the CSV from Google Drive (files endpoint with
# alt=media) and parse it into a DataFrame.
file_id = '1KgGlozneNOpYSpQA8DRl78ovpDNRDGEo'
# NOTE(security): an API key committed in a notebook is visible to every reader.
# Supply it through the GOOGLE_API_KEY environment variable instead; the literal
# is kept only as a fallback so existing runs keep working, and should be
# rotated and then removed.
api_key = os.environ.get('GOOGLE_API_KEY', 'AIzaSyCPAiXOoDOc2FXJ35dgcchnmruCOq6j7P4')
url = f'https://www.googleapis.com/drive/v3/files/{file_id}?alt=media&key={api_key}'
# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx instead of feeding an error payload into read_csv.
response.raise_for_status()
df = pd.read_csv(StringIO(response.text))

In [3]:
def clean_text(text):
    """Strip every character that is neither a word character nor whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all characters matching ``[^\\w\\s]`` deleted. Letters,
        digits, underscores and whitespace are kept as-is.
    """
    unwanted_chars = re.compile(r"[^\w\s]")
    return unwanted_chars.sub("", text)

In [4]:
# Function to retrieve the product URL from web scraping
def get_product_url(product_name):
    """Search Google for a product name and return the first result link.

    Starts a Chrome session through Selenium, loads the Google results page
    for ``product_name`` and scans the ``div.yuRUbf > a`` anchors in the order
    Selenium returns them.

    Args:
        product_name: Product name to search for. It is converted to ``str``
            and URL-encoded before being placed in the query string.

    Returns:
        The ``href`` of the first matching anchor whose text is non-empty, or
        ``None`` when no such anchor is found.
    """
    driver = webdriver.Chrome()
    try:
        # Encode the name: a raw '&', '#' or '+' (e.g. "Grey & Black ...")
        # would otherwise cut off or alter the search query.
        search_url = f'https://www.google.com/search?q={quote_plus(str(product_name))}'
        driver.get(search_url)
        for link in driver.find_elements(By.CSS_SELECTOR, "div.yuRUbf > a"):
            if link.text != '':
                return link.get_attribute('href')
        return None
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # process running.
        driver.quit()

In [5]:
# Preprocess the text data.
# Missing descriptions are replaced with an empty string first: astype(str)
# alone would turn NaN into the literal text "nan", which would then be
# indexed as a real token by the vectorizer.
df['description'] = (
    df['description']
    .fillna('')         # Missing value -> empty document
    .astype(str)        # Convert to string type
    .str.lower()        # Convert to lowercase
    .apply(clean_text)  # Drop punctuation / special characters
)

In [6]:
# Similarity measurement
def compute_similarity(input_text, vectorizer, feature_matrix=None):
    """Compute cosine similarity between a query text and the corpus features.

    Args:
        input_text: Query string; it is passed to ``vectorizer.transform`` as a
            single-element list.
        vectorizer: A fitted vectorizer exposing ``transform``.
        feature_matrix: Corpus feature matrix to compare against. When omitted,
            the module-level ``features`` variable is used, so that variable
            must already be defined at call time.

    Returns:
        The result of ``cosine_similarity(query_features, feature_matrix)``;
        callers in this notebook read row ``0`` as the scores for the query.
    """
    if feature_matrix is None:
        # Backward-compatible default: fall back to the notebook-level global.
        feature_matrix = features
    input_features = vectorizer.transform([input_text])
    return cosine_similarity(input_features, feature_matrix)

In [7]:
# Function to retrieve similar items
def get_top_similar_items(input_text, vectorizer, n=5):
    """Return the ``n`` catalogue items most similar to ``input_text`` as JSON.

    Args:
        input_text: Query string describing the desired product.
        vectorizer: Fitted vectorizer forwarded to ``compute_similarity``.
        n: Maximum number of items to return (default 5).

    Returns:
        A JSON string encoding a list of objects with the keys ``name``,
        ``url`` (result of ``get_product_url`` for the item's name) and
        ``similarity``, ordered from most to least similar.
    """
    similarities = compute_similarity(input_text, vectorizer)
    scores = similarities[0]  # similarity of the single query against each row
    # argsort is ascending, so reverse it and keep the first n positions.
    top_positions = np.argsort(scores)[::-1][:n]
    result = []
    for position in top_positions:
        item = df.iloc[position]
        result.append({
            'name': item['name'],
            'url': get_product_url(item['name']),
            # Index the scores by row *position*. The Series attribute
            # `item.name` is the DataFrame index *label*, which only matches
            # the position when df has a default RangeIndex.
            # float() keeps json.dumps independent of the numpy dtype.
            'similarity': float(scores[position]),
        })
    return json.dumps(result)

In [9]:
# Fit the TF-IDF vectorizer on the description column and keep the resulting
# document-term matrix. `features` must be bound before the query below,
# because compute_similarity looks it up as a global at call time.
vectorizer = TfidfVectorizer()
features = vectorizer.fit_transform(df['description'])
# Run a sample query; the result is a JSON string listing the top matches
# with their name, url and similarity.
x = get_top_similar_items("Black printed sweatshirt has long sleeves", vectorizer)
print(x)

[{"name": "Killer Men Black Printed Sweatshirt", "url": "https://www.flipkart.com/mens-sweatshirts/killer~brand/pr?sid=clo%2Cqvw%2C64a%2Cvui", "similarity": 0.8747141077008388}, {"name": "Bossini Men Black Printed Hooded Sweatshirt", "url": "https://www.flipkart.com/mens-sweatshirts/bossini~brand/pr?sid=clo%2Cqvw%2C64a%2Cvui", "similarity": 0.8584353189882535}, {"name": "Indian Terrain Boys Black Printed Sweatshirt", "url": "https://www.indianterrain.com/boys-sweatshirts", "similarity": 0.8584353189882535}, {"name": "Genius18 Men Grey & Black Printed Sweatshirt", "url": "https://www.flipkart.com/genius18-self-design-men-grey-track-pants/p/itmfg8dfdnuqm9zk", "similarity": 0.7928853130529803}, {"name": "Calvin Klein Jeans Men Black Printed Sweatshirt", "url": "https://www.flipkart.com/mens-sweatshirts/calvin-klein-jeans~brand/pr?sid=clo%2Cqvw%2C64a%2Cvui", "similarity": 0.7891953909584304}]
