In [39]:
import re
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
# English stopwords from NLTK, held in a set because clean_text tests every
# token for membership against it.
# NOTE: needs the NLTK 'stopwords' corpus to be available locally
# (nltk.download('stopwords')); otherwise this line fails.
stop_words = set(stopwords.words('english'))

In [41]:
def clean_text(text):
    """Normalise free text before TF-IDF vectorisation.

    Every character that is not an ASCII letter or whitespace is removed, the
    result is lower-cased, tokenized with NLTK's TreebankWordTokenizer, and
    tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw catalogue value or user query. Anything that is not a
            ``str`` (for example the float NaN pandas uses for missing CSV
            cells) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when the
        input is not a string.
    """
    # Missing cells come through .apply() as float NaN rather than str.
    if not isinstance(text, str):
        return ''

    # Strip digits, punctuation and other special characters.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    # Instantiate the tokenizer here: the notebook never defines a global
    # `tokenizer`, so relying on one raises NameError on a fresh kernel.
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(letters_only.lower())

    # Drop stopwords.
    kept_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(kept_tokens)

In [42]:
# Load the product catalogue. Later cells read the ProductID, ProductName,
# Description, PrimaryColor and 'Price (INR)' columns from this frame.
data = pd.read_csv('data.csv')

In [43]:
# Add a normalised 'Cleaned_<column>' copy of each free-text column.
for source_column in ('ProductName', 'Description', 'PrimaryColor'):
    data[f'Cleaned_{source_column}'] = data[source_column].apply(clean_text)

In [44]:
# One searchable document per product: cleaned name followed by cleaned description.
# NOTE(review): Cleaned_PrimaryColor is computed in the previous cell but is not
# part of this text — confirm whether colour should be searchable too.
data['Combined_Text'] = data['Cleaned_ProductName'].str.cat(
    data['Cleaned_Description'], sep=" "
)


In [45]:
# TF-IDF vectorizer with scikit-learn's default settings. It is fitted on the
# catalogue text below and reused by recommend_products to vectorize queries.
vectorizer = TfidfVectorizer()


In [46]:
# Fit the vectorizer on the combined product text and keep the resulting
# document-term matrix (one row per row of `data`) for similarity lookups.
tfidf_matrix = vectorizer.fit_transform(data['Combined_Text'])


In [None]:
def recommend_products(query, top_n=5):
    """Print the catalogue products most similar to a free-text query.

    The query is normalised with ``clean_text``, vectorized with the fitted
    ``vectorizer`` and compared against ``tfidf_matrix`` by cosine similarity.
    Matching rows of ``data`` are printed best-first.

    Args:
        query: Free-text search string, e.g. ``"men hoodie"``.
        top_n: Maximum number of products to print. Values of zero or less
            print no products.

    Returns:
        None. Results are written to stdout.
    """
    # Clean the user query the same way the catalogue text was cleaned.
    cleaned_query = clean_text(query)

    # Vectorize the user query with the vocabulary learned from the catalogue.
    query_vector = vectorizer.transform([cleaned_query])

    # Similarity of the query to every product document.
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank best-first, then take the head. Slicing the head (instead of
    # `[-top_n:]`) avoids `[-0:]` selecting every product when top_n is 0.
    ranked_indices = similarities.argsort()[::-1][:max(top_n, 0)]

    # A similarity of 0 means the product shares no term with the query, so
    # it is not a recommendation at all.
    top_indices = [idx for idx in ranked_indices if similarities[idx] > 0]

    print(f"Recommendations for: '{query}'\n")
    if not top_indices:
        print("No matching products found.\n")
        return

    for idx in top_indices:
        product = data.iloc[idx]  # single positional lookup per result
        product_link = f"https://www.myntra.com/{product['ProductID']}"

        print(f"Product: {product['ProductName']}")
        print(f"Primary Color: {product['PrimaryColor']}")
        print(f"Description: {product['Description']}")
        print(f"Price: ₹{product['Price (INR)']}")
        print(f"Link: {product_link}\n")


In [58]:
# Example: print the default top-5 recommendations for a sample query.
user_query = "men hoodie"
recommend_products(user_query)

Recommendations for: 'men hoodie'

Product ID: 10259457
Product: hummel Men Blue Solid Hooded Sweatshirt
Primary Color: Blue
Description: Winter is coming, and the Blue hummel Core Cotton Hoodie is here to keep you warm! The soft Cotton material makes it highly comfortable and ideal to wear in the chilly days. The iconic chevron tape sleeves and smart detailing make this hoodie unique.
Price: ₹1199
Link: https://www.myntra.com/10259457

Product ID: 10145047
Product: GAP Boys Logo Hoodie Sweatshirt
Primary Color: nan
Description: Soft textured fabricLong sleevesHoodedFront pockets
Price: ₹1999
Link: https://www.myntra.com/10145047

Product ID: 10244457
Product: GAP Girl's Logo Hoodie Sweatshirt
Primary Color: nan
Description: Soft fleeceLong raglan sleeves, banded cuffsHoodedLogo at chestBanded hem
Price: ₹1999
Link: https://www.myntra.com/10244457

Product ID: 10244453
Product: GAP Girl's Logo Hoodie Sweatshirt
Primary Color: nan
Description: Soft fleeceLong raglan sleeves, banded cuff

In [60]:
# Sanity check: how many English stopwords were loaded.
len(stop_words)

179