In [1]:
import pandas as pd

df = pd.read_csv('product_new.csv')

In [2]:
df['combined'] = df['product name'] + ' ' + df['product description']

In [4]:
import pandas as pd
import re
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, float):  
        return ""
    text = str(text).lower() 
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df = pd.read_csv('product_new.csv')

df['combined'] = df['product name'].fillna('') + ' ' + df['product description'].fillna('')

df['cleaned'] = df['combined'].apply(preprocess_text)

print(df['cleaned'].head())


0    green lays 20 great flavour perfect crunch 20 ...
1    green lays pag mat mild amcor haridwar cpcb re...
2    green lays 10014064000435 mkt address scan bar...
3    green lays sico india holdings 27 dlf qutab en...
4    green lays vour takable cooked cously perfecti...
Name: cleaned, dtype: object


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(df['cleaned'])


In [6]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(X, X)


In [7]:
import pickle

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
    
with open('cosine_similarity.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)


In [8]:
import pickle
from sklearn.metrics.pairwise import cosine_similarity

with open('vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

with open('cosine_similarity.pkl', 'rb') as f:
    loaded_cosine_sim = pickle.load(f)

new_data = ["NOUSTRES LTD, PLIT NIE SECTIONE PRATNAGAR, UDKAN SINGH MAGAL RORAPUR SI, UTTARAKHAND Lic. No. 10012012000222, DA SUPER SNACKS PVT LTD, BIA MEERUT HAD ABANUTTAR PRADES HANNA INDUSTRIES LTD, PLOT NO C3, C4 TO C-11, EPIP HAJIPUR INDUSTRIAL AREA, DIST WASHALL BAJPURI, B NDUSTRIES LTD, PLOT NO 621, INDUSTRIAL ESTATE, EMADE KHORCHA ORISSA Lic. No. 2SCURT CO PVT LTD, PUB-BAGAN RACH DET ERICE, PERUNDURA ASSAM Lic. e. 10012470000791 BRITANNIA INDUSTRIES LTD, IS TAVATILA ROAD, KOLKATA TOOL WEST BENGAL Lic. No. 10012001000044-CBHAGWAT PENDERAAPUT KASARA UTTAR PRADESH Lic. No. 1312051000177,50 TANIA INDUSTRIES LTD, PLOT NO MM & SIPCET INDUSTRIAL GROWTH CENTREIN TAMIL NADU L No. 10015020287,6 REAL AGRO INDUSTRIES PVT LTD, SENO ZUA ЖЦА В ЖУЛА, НАЈАВOLLARAM, MEDCHAL MANGAL 17RITANNIA INDUSTRES LTD, PLATNO 2.21 BDA INDUSTRIAL AREA, BALAVEERANA HALLL, BIGAN HOBLI DIST RAMANAGΑΛΑ ANA INDUSTRIES LTD, PUT NO 21, SOC ESTATE, VILLA RANDED JA JAGADIA DIST BHARUCH-1811-10 PALASHBAR DIST KAMRUP GUMAKATI ASSAM Lic. No. 100180710F RANNIA"]

new_vec = loaded_vectorizer.transform(new_data)

df_vectorized = loaded_vectorizer.transform(df['cleaned']) 

similarities = cosine_similarity(new_vec, df_vectorized)

print(similarities)


[[0.         0.05614544 0.13521852 0.09715851 0.05502592 0.
  0.0644062  0.13554688 0.01243613 0.         0.         0.11607243
  0.15177494 0.15516248 0.01058123 0.88353842 0.02036846 0.07210353
  0.         0.         0.         0.         0.         0.07110388
  0.         0.         0.         0.01171679 0.         0.2253009
  0.01195556 0.03352265 0.         0.         0.17960821 0.
  0.         0.09587913 0.         0.         0.00789025 0.01081061
  0.         0.14163893 0.03824445 0.08388156 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
 

In [9]:
import numpy as np

similarity_scores = similarities[0]

top_5_indices = np.argsort(similarity_scores)[-5:][::-1]  # Sort and reverse for descending order

print("Top 5 similar products:")
for idx in top_5_indices:
    print(f"Product: {df['product name'].iloc[idx]} | Similarity Score: {similarity_scores[idx]}")


Top 5 similar products:
Product: Good day biscuit | Similarity Score: 0.883538420618093
Product: Maaza | Similarity Score: 0.22530090209136064
Product: Closeup | Similarity Score: 0.17960820763823476
Product: kurkure | Similarity Score: 0.15516248176365563
Product: Kurkure | Similarity Score: 0.15177494447240103


In [10]:
import numpy as np

similarity_scores = similarities[0]

sorted_indices = np.argsort(similarity_scores)[::-1] 

unique_products = set()

count = 0

print("Top 5 similar products:")
for idx in sorted_indices:
    product_name = df['product name'].iloc[idx]

    if product_name not in unique_products:
        print(f"Product: {product_name} | Similarity Score: {similarity_scores[idx]}")
        unique_products.add(product_name) 
        count += 1
        
    if count == 5:
        break


Top 5 similar products:
Product: Good day biscuit | Similarity Score: 0.883538420618093
Product: Maaza | Similarity Score: 0.22530090209136064
Product: Closeup | Similarity Score: 0.17960820763823476
Product: kurkure | Similarity Score: 0.15516248176365563
Product: Kurkure | Similarity Score: 0.15177494447240103
