In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import json
from pathlib import Path

In [None]:
# Location of the product catalogue JSON that is passed to getProductDetails.
# NOTE(review): this is an absolute, machine-specific Windows path; point it at
# your own copy of product_info.json before running the notebook.
PRODUCT_INFO_DATASET = r'C:\SEM6\BMC_Hackathon\Datasets\TakeHome-Data Scientist\Data\product_info.json' # Replace with your file path 

In [5]:
def getProductDetails(file_path):
    """Load the product catalogue JSON into a DataFrame.

    The file must contain a ``schema`` object whose ``fields`` list names the
    columns (each entry has a ``name`` key) and a ``data`` list holding the
    row records.

    Args:
        file_path: Path (``str`` or ``Path``) to the JSON file.

    Returns:
        A ``pandas.DataFrame`` with one column per schema field, or ``None``
        when the file cannot be read or does not have the expected layout.
        In that case the error is printed rather than raised, so callers
        should check the result before using it.
    """
    file_path = Path(file_path).resolve()

    try:
        # JSON is UTF-8 by specification; do not rely on the platform default
        # encoding (e.g. cp1252 on Windows), which mis-decodes non-ASCII text.
        with open(file_path, 'r', encoding='utf-8') as jsonFile:
            data = json.load(jsonFile)

        column_names = [field['name'] for field in data['schema']['fields']]

        return pd.DataFrame(data=data['data'], columns=column_names)

    except Exception as e:
        # Best-effort loader: report what went wrong (with the file involved)
        # and signal failure explicitly instead of falling off the end.
        print(f"Failed to load product data from {file_path}: {e!r}")
        return None

In [7]:
product_df = getProductDetails(PRODUCT_INFO_DATASET)

# getProductDetails prints the error and returns None when loading fails.
# Stop here with a clear message instead of letting a later cell fail with an
# unrelated "'NoneType' object is not subscriptable".
if product_df is None:
    raise RuntimeError(
        f"Could not load product data from {PRODUCT_INFO_DATASET!r}; "
        "check the PRODUCT_INFO_DATASET path."
    )

In [9]:
# Build the free-text field that the TF-IDF model is trained on.
# Missing values are blanked first: with plain `+`, a single NaN in any of the
# three columns turns the whole string into NaN, which TfidfVectorizer rejects
# as an invalid document. Rows without missing values get the same
# "Category Product Name Sub-Category" text as before.
product_df["Product_info"] = (
    product_df[["Category", "Product Name", "Sub-Category"]]
    .fillna("")
    .astype(str)
    .agg(" ".join, axis=1)
)


In [10]:
# TF-IDF model over the combined product text. English stop words are dropped
# and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
# Learn the vocabulary from the Product_info column and vectorise it: one row
# per row of product_df. getRelatedProducts compares queries against this matrix.
product_vectors = vectorizer.fit_transform(product_df["Product_info"])


def getRelatedProducts(keyword, top_n=5):
    """Return the products whose text is most similar to ``keyword``.

    Uses the module-level ``vectorizer``, ``product_vectors`` and
    ``product_df`` built in the earlier cells.

    Args:
        keyword: Free-text query; it is vectorised with the fitted TF-IDF model.
        top_n: Maximum number of products to return. Values <= 0 yield ``[]``.

    Returns:
        A list of ``(Product ID, Product Name, similarity)`` tuples ordered by
        descending cosine similarity. The list is empty (and a warning is
        printed) when no term of ``keyword`` is in the vocabulary. Products
        with zero similarity are never returned, so the list can be shorter
        than ``top_n``.
    """
    keyword_vector = vectorizer.transform([keyword])

    if keyword_vector.sum() == 0:
        print("Warning: Keyword not found in vocabulary!")
        return []

    similarities = cosine_similarity(keyword_vector, product_vectors).flatten()

    # Clamp so a negative top_n cannot slice from the wrong end of the ranking.
    top_indices = similarities.argsort()[::-1][:max(top_n, 0)]

    results = []
    for i in top_indices:
        score = similarities[i]
        # Scores are sorted in descending order, so once one is zero every
        # remaining product shares no term with the query and is not "related".
        if score <= 0:
            break
        row = product_df.iloc[i]  # fetch the row once instead of per column
        results.append((row["Product ID"], row["Product Name"], score))

    return results


In [11]:
# Example lookup: up to five (the default top_n) products most similar to "printer".
query = "printer"
recommended_products = getRelatedProducts(query)
# Each entry is a (Product ID, Product Name, similarity) tuple.
print(recommended_products)

[('TEC-MA-10003230', 'Okidata C610n Printer', np.float64(0.39668129355799125)), ('TEC-MA-10001856', 'Okidata C610n Printer', np.float64(0.39668129355799125)), ('TEC-MA-10000984', 'Okidata MB760 Printer', np.float64(0.38848764331467606)), ('TEC-MA-10003337', 'Okidata B401 Printer', np.float64(0.38848764331467606)), ('TEC-MA-10003176', 'Okidata B400 Printer', np.float64(0.38848764331467606))]
