In [1]:
import pandas as pd

In [2]:
# Load the product listings into a DataFrame.
# NOTE(review): '/content/' looks like a Colab working directory - confirm the path before running elsewhere.
df = pd.read_csv('/content/final_data.csv')

In [3]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,Discription,url,Vendor
0,KINGDOM OF WHITE Replay Full Sleeves White Shi...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon
1,United Colors of Benetton Men Shirt,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon
2,KINGDOM OF WHITE Cloudy Full Sleeves White Kur...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon
3,United Colors of Benetton Men Shirt,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon
4,Dennis Lingo Men's Solid Slim Fit Cotton Casua...,https://www.amazon.in/Dennis-Lingo-Solid-Casua...,Amazon
...,...,...,...
1251,Multi-color High Neck Heart Print Top,https://www.nykaafashion.com/mixt-by-nykaa-fas...,NykaaFashion
1252,Women Self Design Regular Fit Top,https://www.nykaafashion.com/only-women-self-d...,NykaaFashion
1253,Lilac Crop Top,https://www.nykaafashion.com/i-saw-it-first-li...,NykaaFashion
1254,Blue Printed A-Line Top,https://www.nykaafashion.com/qomn-blue-printed...,NykaaFashion


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [5]:
# Download the NLTK resources used by the preprocessing below:
#   punkt / punkt_tab -> tokenizer models loaded by word_tokenize
#                        (recent NLTK releases look up 'punkt_tab' instead of 'punkt')
#   wordnet           -> data for WordNetLemmatizer
#   stopwords         -> list behind stopwords.words('english')
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
# Lemmatizer instance reused by every preprocess_text call
lemmatizer = WordNetLemmatizer()

# English stopwords, held in a set so membership checks are cheap
stop_words = set(stopwords.words('english'))

In [7]:
# Function to preprocess text
def preprocess_text(text):
    """Normalize a piece of text for TF-IDF matching.

    Steps: replace every non-letter character with a space, lowercase,
    tokenize with ``word_tokenize``, drop English stopwords and lemmatize
    the remaining tokens.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the float NaN pandas
        uses for a missing cell, or None) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' when nothing usable
        remains.
    """
    # Missing cells arrive as NaN/None; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # Remove special characters and digits, then convert to lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()

    # Only whitespace left -> the result is empty, no need to tokenize
    if not text.strip():
        return ''

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords and perform lemmatization
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # Join tokens back into a single string
    return ' '.join(cleaned_tokens)

In [8]:
# Apply preprocessing to the 'Discription' column (spelled this way in the
# source data) and store the result in a new 'cleaned_description' column
df['cleaned_description'] = df['Discription'].apply(preprocess_text)


In [9]:

# Save preprocessed data to a new CSV file.
# index=False keeps the DataFrame index from being written as an extra column.
df.to_csv('/content/preprocessed_data.csv', index=False)

In [10]:
# Show the frame again, now including the 'cleaned_description' column.
df

Unnamed: 0,Discription,url,Vendor,cleaned_description
0,KINGDOM OF WHITE Replay Full Sleeves White Shi...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon,kingdom white replay full sleeve white shirt r...
1,United Colors of Benetton Men Shirt,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon,united color benetton men shirt
2,KINGDOM OF WHITE Cloudy Full Sleeves White Kur...,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon,kingdom white cloudy full sleeve white kurta w...
3,United Colors of Benetton Men Shirt,https://www.amazon.in/sspa/click?ie=UTF8&spc=M...,Amazon,united color benetton men shirt
4,Dennis Lingo Men's Solid Slim Fit Cotton Casua...,https://www.amazon.in/Dennis-Lingo-Solid-Casua...,Amazon,dennis lingo men solid slim fit cotton casual ...
...,...,...,...,...
1251,Multi-color High Neck Heart Print Top,https://www.nykaafashion.com/mixt-by-nykaa-fas...,NykaaFashion,multi color high neck heart print top
1252,Women Self Design Regular Fit Top,https://www.nykaafashion.com/only-women-self-d...,NykaaFashion,woman self design regular fit top
1253,Lilac Crop Top,https://www.nykaafashion.com/i-saw-it-first-li...,NykaaFashion,lilac crop top
1254,Blue Printed A-Line Top,https://www.nykaafashion.com/qomn-blue-printed...,NykaaFashion,blue printed line top


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Load preprocessed data from CSV file
df = pd.read_csv('/content/preprocessed_data.csv')

# Get the preprocessed descriptions as a list.
# A description that was cleaned down to '' is written as an empty CSV field
# and read back as NaN, which TfidfVectorizer rejects as a document, so
# missing values are mapped back to the empty string first.
preprocessed_descriptions = df['cleaned_description'].fillna('').tolist()


In [13]:
# Initialize TF-IDF vectorizer (no arguments, so library defaults apply)
vectorizer = TfidfVectorizer()

# Fit and transform the preprocessed descriptions.
# Both results are kept as notebook-level names: compute_similarity calls
# vectorizer.transform on the query and compares it against tfidf_matrix.
tfidf_matrix = vectorizer.fit_transform(preprocessed_descriptions)



In [14]:
# Function to compute similarity between input text and database texts
def compute_similarity(input_text, top_n=None):
    """Return the URLs of the items most similar to ``input_text``.

    The query is cleaned with ``preprocess_text``, transformed with the
    already-fitted notebook-level ``vectorizer`` and compared against
    ``tfidf_matrix`` using cosine similarity.

    Parameters
    ----------
    input_text : str
        Free-text query.
    top_n : int, optional
        Maximum number of URLs to return. When omitted, the notebook-level
        variable ``N`` is used, so existing ``compute_similarity(text)``
        calls keep working unchanged.

    Returns
    -------
    list of str
        URLs ordered from most to least similar. Rows with a similarity of
        zero are left out, so the list may be shorter than ``top_n`` and is
        empty when the query shares no vocabulary with the data.
    """
    if top_n is None:
        top_n = N  # backward-compatible fallback to the notebook global
    if top_n <= 0:
        # A slice such as [-0:] would select every row; return nothing instead.
        return []

    # Preprocess the input text
    cleaned_input_text = preprocess_text(input_text)

    # Transform the input text using the fitted TF-IDF vectorizer
    input_tfidf = vectorizer.transform([cleaned_input_text])

    # There is a single query row, so take row 0 of the similarity matrix
    scores = cosine_similarity(input_tfidf, tfidf_matrix)[0]

    # Highest scores first, truncated to top_n, without zero-similarity rows
    ranked = scores.argsort()[::-1][:top_n]
    top_indices = [i for i in ranked if scores[i] > 0]

    # argsort yields row positions, so select positionally rather than by label
    return df['url'].iloc[top_indices].tolist()


In [15]:
# Example usage
input_text = "slim summer t-shirts"
# compute_similarity looks up N as a global, so it must be assigned before the call below
N = 5  # Number of top similar items to retrieve

# Compute similarity and get top-N most similar item URLs
similar_item_urls = compute_similarity(input_text)

print(similar_item_urls)


['https://www.nykaafashion.com/cover-story-summer-peach-shirt/p/7599620', 'https://www.flipkart.com/dennis-lingo-men-striped-casual-black-shirt/p/itm84e2bcc38582a?pid=SHTFNANMQRMCPEH8&lid=LSTSHTFNANMQRMCPEH8D1XR0F&marketplace=FLIPKART&q=shirts&store=clo%2Fash%2Faxc&srno=s_2_74&otracker=search&otracker1=search&fm=organic&iid=e589643c-b331-4c05-b484-15f529f9fe07.SHTFNANMQRMCPEH8.SEARCH&ppt=None&ppn=None&ssid=tngovjzwk00000001684598847225&qH=514ca2b2d46ccf67', 'https://www.amazon.in/DHRUVI-TRENDZ-Abstract-Printed-Regular/dp/B0BZVWJ32L/ref=sxin_20_slsr_d_i_fsnewarrivals_fa_1_B0BZVWJ32L?content-id=amzn1.sym.75fb7dbe-1dae-415e-a068-997b8c3956f7%3Aamzn1.sym.75fb7dbe-1dae-415e-a068-997b8c3956f7&crid=3EKVZNS1FCSRP&cv_ct_cx=shirt&keywords=shirt&pd_rd_i=B0BZVWJ32L&pd_rd_r=ccd40d52-081a-4088-8598-28ca3f836aca&pd_rd_w=hf9Pj&pd_rd_wg=AMbk2&pf_rd_p=75fb7dbe-1dae-415e-a068-997b8c3956f7&pf_rd_r=ZHEDTJH0ACM8JS09E1J7&qid=1684559026&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=shir%2Caps%2C310&sr=1-2-5b

In [16]:
# Print each returned URL on its own line
for url in similar_item_urls:
    print(url)

https://www.nykaafashion.com/cover-story-summer-peach-shirt/p/7599620
https://www.flipkart.com/dennis-lingo-men-striped-casual-black-shirt/p/itm84e2bcc38582a?pid=SHTFNANMQRMCPEH8&lid=LSTSHTFNANMQRMCPEH8D1XR0F&marketplace=FLIPKART&q=shirts&store=clo%2Fash%2Faxc&srno=s_2_74&otracker=search&otracker1=search&fm=organic&iid=e589643c-b331-4c05-b484-15f529f9fe07.SHTFNANMQRMCPEH8.SEARCH&ppt=None&ppn=None&ssid=tngovjzwk00000001684598847225&qH=514ca2b2d46ccf67
https://www.amazon.in/DHRUVI-TRENDZ-Abstract-Printed-Regular/dp/B0BZVWJ32L/ref=sxin_20_slsr_d_i_fsnewarrivals_fa_1_B0BZVWJ32L?content-id=amzn1.sym.75fb7dbe-1dae-415e-a068-997b8c3956f7%3Aamzn1.sym.75fb7dbe-1dae-415e-a068-997b8c3956f7&crid=3EKVZNS1FCSRP&cv_ct_cx=shirt&keywords=shirt&pd_rd_i=B0BZVWJ32L&pd_rd_r=ccd40d52-081a-4088-8598-28ca3f836aca&pd_rd_w=hf9Pj&pd_rd_wg=AMbk2&pf_rd_p=75fb7dbe-1dae-415e-a068-997b8c3956f7&pf_rd_r=ZHEDTJH0ACM8JS09E1J7&qid=1684559026&sbo=RZvfv%2F%2FHxDF%2BO5021pAnSA%3D%3D&sprefix=shir%2Caps%2C310&sr=1-2-5b2ee5ec-0

In [None]:
# Other example queries to try with compute_similarity:
#   floral shirt
#   jeans
#   round neck tshirt
#   mini shorts