In [11]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
import re
import string
import random
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Import the imdb movies dataset
df = pd.read_csv('/content/drive/MyDrive/DATASETS/data.csv')

In [4]:
df.head()

Unnamed: 0,product_name,brand_name,rating,rating_count,marked_price,discounted_price,sizes,product_link,img_link,product_tag,brand_tag,discount_amount,discount_percent
0,Croc Textured Two Fold Wallet,Lino Perros,0.0,0,1295,828,Onesize,wallets/lino-perros/lino-perros-women-peach-co...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",wallets,lino-perros,467,36
1,Men Striped Sliders,Mast & Harbour,4.0,76,1299,584,"UK6,UK7,UK8,UK9,UK10,UK11",flip-flops/mast--harbour/mast--harbour-men-nav...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",flip-flops,mast--harbour,715,55
2,Printed A-line Kurta,Biba,4.3,66,1999,1599,"S,M,L,XL,XXL,3XL",kurtas/biba/biba-women-off-white--black-printe...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",kurtas,biba,400,20
3,Girls Floral Printed T-shirt,Anthrilo,0.0,0,599,539,"7-8Y,8-9Y,9-10Y",tshirts/anthrilo/anthrilo-girls-white-floral-p...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",tshirts,anthrilo,60,10
4,Women Printed Kurta with Skirt,FASHION DWAR,0.0,0,2899,2899,"S,M,L,XL",kurta-sets/fashion-dwar/fashion-dwar-women-mul...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",kurta-sets,fashion-dwar,0,0


In [5]:
# Combine movie name and tags into a single string
df['content'] = df['product_name'].astype(str) + ' ' + df['rating'].astype(str) + ' ' + df['product_tag'] + ' ' + df['brand_tag']
df['content'] = df['content'].fillna('')

In [12]:
#Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df['cleaned_content'] = df['content'].apply(_removeNonAscii)

df['cleaned_content'] = df.cleaned_content.apply(func = make_lower_case)
df['cleaned_content'] = df.cleaned_content.apply(func = remove_stop_words)
df['cleaned_content'] = df.cleaned_content.apply(func=remove_punctuation)
df['cleaned_content'] = df.cleaned_content.apply(func=remove_html)

In [13]:
# Load the pre-trained Word2Vec model (Google News)
model = api.load('word2vec-google-news-300')



In [14]:
# Function to average word vectors for a text
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary:
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)

    return feature_vector

In [15]:
# Function to compute average word vectors for all products
def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in corpus]
    return np.array(features)

In [16]:
# Compute average word vectors for all products
w2v_feature_array = averaged_word_vectorizer(corpus=df['cleaned_content'], model=model, num_features=300)

In [28]:
def get_top_recommendations(product_name, df, w2v_feature_array, model):
    # Find the index of the user item
    item_index = df[df['product_name'] == product_name].index[0]

    # Compute the cosine similarities between the user item and all other items
    user_item_vector = w2v_feature_array[item_index].reshape(1, -1)
    similarity_scores = cosine_similarity(user_item_vector, w2v_feature_array)

    # Get the top 10 most similar products
    similar_items = list(enumerate(similarity_scores[0]))
    sorted_similar_items = sorted(similar_items, key=lambda x: x[1], reverse=True)[1:11]

    # Return the top 10 similar products
    top_recommendations = [(i, df.loc[i, 'product_name'], df.loc[i, 'brand_name'], df.loc[i, 'rating'])
                           for i, _ in sorted_similar_items]
    return top_recommendations

In [31]:
# Define the product name
user_item = "Women Printed Kurta with Skirt"  # Replace with the desired product name

# Get top 10 recommendations
top_recommendations = get_top_recommendations(user_item, df, w2v_feature_array, model)

# Print the top 10 recommended products with brand name and rating
for i, product_name, brand_name, rating in top_recommendations:
    print("{}: {}, Brand: {}, Rating: {}".format(i, product_name, brand_name, rating))

67528: Women Printed Kurta with Skirt, Brand: Indo Era, Rating: 0.0
67645: Women Printed Kurta with Skirt, Brand: Indo Era, Rating: 0.0
17195: Women Yoke Design Kurta with Sharara With Dupatta, Brand: Ishin, Rating: 0.0
57327: Women Yoke Design Kurta with Sharara With Dupatta, Brand: Ishin, Rating: 0.0
9646: Women Printed Kurti with Sharara With Dupatta, Brand: KALINI, Rating: 0.0
9741: Women Printed Kurti with Sharara With Dupatta, Brand: KALINI, Rating: 0.0
20241: Women Printed Kurta with Sharara, Brand: Kiana, Rating: 3.0
19051: Women Printed Satin Kurta, Brand: Soch, Rating: 0.0
75609: Women Printed Kurti with Trousers With Dupatta, Brand: Fabriko, Rating: 0.0
75659: Women Printed Kurti with Trousers With Dupatta, Brand: Fabriko, Rating: 0.0
