In [4]:
import pandas as pd
import os
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
import nltk
from nltk.corpus import stopwords
import re
import string
import random
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 50)
from matplotlib import style
%matplotlib inline


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Import the imdb movies dataset
df = pd.read_csv("/content/drive/MyDrive/DATASETS/MYNTRA_data.csv")

In [7]:
df.dtypes

product_name         object
brand_name           object
rating              float64
rating_count          int64
marked_price          int64
discounted_price      int64
sizes                object
product_link         object
img_link             object
product_tag          object
brand_tag            object
discount_amount       int64
discount_percent      int64
dtype: object

In [8]:
# Combine movie name and tags into a single string
df['content'] = df['product_name'].astype(str) + ' ' + df['rating'].astype(str) + ' ' + df['product_tag'] + ' ' + df['brand_tag']+ ' ' + df['discount_percent'].astype(str)
df['content'] = df['content'].fillna('')

In [9]:
#Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df['cleaned_content'] = df['content'].apply(_removeNonAscii)

df['cleaned_content'] = df.cleaned_content.apply(func = make_lower_case)
df['cleaned_content'] = df.cleaned_content.apply(func = remove_stop_words)
df['cleaned_content'] = df.cleaned_content.apply(func=remove_punctuation)
df['cleaned_content'] = df.cleaned_content.apply(func=remove_html)

In [10]:
df.head()

Unnamed: 0,product_name,brand_name,rating,rating_count,marked_price,discounted_price,sizes,product_link,img_link,product_tag,brand_tag,discount_amount,discount_percent,content,cleaned_content
0,Croc Textured Two Fold Wallet,Lino Perros,0.0,0,1295,828,Onesize,wallets/lino-perros/lino-perros-women-peach-co...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",wallets,lino-perros,467,36,Croc Textured Two Fold Wallet 0.0 wallets lino...,croc textured two fold wallet 0 0 wallets lino...
1,Men Striped Sliders,Mast & Harbour,4.0,76,1299,584,"UK6,UK7,UK8,UK9,UK10,UK11",flip-flops/mast--harbour/mast--harbour-men-nav...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",flip-flops,mast--harbour,715,55,Men Striped Sliders 4.0 flip-flops mast--harbo...,men striped sliders 4 0 flip flops mast harbou...
2,Printed A-line Kurta,Biba,4.3,66,1999,1599,"S,M,L,XL,XXL,3XL",kurtas/biba/biba-women-off-white--black-printe...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",kurtas,biba,400,20,Printed A-line Kurta 4.3 kurtas biba 20,printed a line kurta 4 3 kurtas biba 20
3,Girls Floral Printed T-shirt,Anthrilo,0.0,0,599,539,"7-8Y,8-9Y,9-10Y",tshirts/anthrilo/anthrilo-girls-white-floral-p...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",tshirts,anthrilo,60,10,Girls Floral Printed T-shirt 0.0 tshirts anthr...,girls floral printed t shirt 0 0 tshirts anthr...
4,Women Printed Kurta with Skirt,FASHION DWAR,0.0,0,2899,2899,"S,M,L,XL",kurta-sets/fashion-dwar/fashion-dwar-women-mul...,"https://assets.myntassets.com/dpr_2,q_60,w_210...",kurta-sets,fashion-dwar,0,0,Women Printed Kurta with Skirt 0.0 kurta-sets ...,women printed kurta skirt 0 0 kurta sets fashi...


In [11]:
os.chdir("/content/drive/MyDrive/glove FILES")

In [12]:
glove_file="/content/drive/MyDrive/glove FILES/glove.6B.100d.txt"

In [13]:
# Load the GloVe model
def load_glove_model(glove_file):
    print("Loading GloVe Model")
    glove_model = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding
    print("Done.", len(glove_model), "words loaded!")
    return glove_model

glove_model = load_glove_model('glove.6B.100d.txt')


Loading GloVe Model
Done. 400000 words loaded!


In [14]:
# Function to convert description to Glove vector
def description_to_glove_vector(description, glove_model):
    words = description.split()
    word_vectors = [glove_model.get(word, np.zeros((100,))) for word in words]

    # Filter out zero vectors (which represent missing words in the GloVe model)
    valid_vectors = [vec for vec in word_vectors if np.any(vec)]

    if len(valid_vectors) == 0:
        return np.zeros((100,))
    else:
        # Normalize the vector
        mean_vector = np.mean(valid_vectors, axis=0)
        return mean_vector / np.linalg.norm(mean_vector)




In [15]:
# Applying function to create Glove vectors
glove_data = df['cleaned_content'].apply(lambda x: description_to_glove_vector(x, glove_model))

# Slice before converting to DataFrame, taking first 50 values
glove_data1 = glove_data.apply(lambda x: x[:50])

# Converting the arrays into a DataFrame
glove_df = pd.DataFrame(glove_data1.tolist(), columns=[f'glove_{i}' for i in range(50)])

In [16]:
glove_df.head()

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,glove_10,glove_11,glove_12,glove_13,glove_14,glove_15,glove_16,glove_17,glove_18,glove_19,glove_20,glove_21,glove_22,glove_23,glove_24,glove_25,glove_26,glove_27,glove_28,glove_29,glove_30,glove_31,glove_32,glove_33,glove_34,glove_35,glove_36,glove_37,glove_38,glove_39,glove_40,glove_41,glove_42,glove_43,glove_44,glove_45,glove_46,glove_47,glove_48,glove_49
0,-0.024927,0.122933,0.063235,-0.129247,0.018655,0.09634,0.100229,0.091411,-0.064236,0.015906,0.120669,-0.019985,-0.078888,0.092039,0.189208,0.151269,-0.031964,0.089766,0.023174,0.133837,0.171496,0.140531,0.046684,0.131931,0.100724,0.076578,0.073899,-0.064015,0.070426,-0.159391,0.099476,0.040235,-0.058006,0.001576,0.051579,0.025041,0.049633,0.084987,-0.074223,0.043819,0.13201,-0.31585,0.067661,-0.111826,-0.060367,-0.044804,-0.030665,-0.021186,0.028195,0.025467
1,-0.060036,0.089251,0.019238,-0.121234,-0.080179,0.06719,0.074739,0.094264,-0.164227,0.032992,0.164014,0.012498,0.041703,0.117063,0.172407,-0.008138,-0.014557,0.065287,0.028456,0.026272,0.218303,0.111036,0.133575,0.01805,0.156931,0.068419,0.015644,-0.067449,0.091309,-0.111299,0.024052,0.085049,-0.011597,0.043725,0.038232,0.038761,-0.058797,0.120149,0.020548,0.088759,0.058321,-0.19828,-0.005196,-0.047799,-0.026831,-0.03556,0.029722,-0.017738,-0.04062,0.035124
2,-0.036418,0.013533,-0.071129,-0.029626,0.041332,0.068435,0.100858,0.056862,-0.087138,0.055032,0.123515,-0.036402,-0.014862,0.089,0.167324,-0.008386,0.06146,0.003345,-0.077276,0.000582,0.11909,-0.01616,0.07029,0.089321,0.115796,-0.006725,0.027957,-0.03336,-0.005836,0.016006,0.00162,0.157288,0.045299,-0.121229,-0.017303,0.084397,0.033769,0.028362,-0.008378,0.052027,0.074447,-0.169841,0.079384,-0.057091,0.00755,-0.109012,0.002405,-0.094542,0.031874,-0.028856
3,-0.027192,0.166186,-0.053562,-0.053834,-0.026201,0.167421,0.110311,0.092757,-0.126827,0.00649,0.116258,-0.086336,0.00474,0.113341,0.156082,0.084683,0.092269,0.016225,0.065597,0.023254,0.099242,0.017445,0.123594,0.090355,0.080043,0.090528,0.069798,-0.053146,0.015804,-0.120163,0.040028,0.110324,0.031504,-0.031432,0.013047,0.041523,-0.023129,0.076664,-0.079622,0.038635,0.136764,-0.163523,0.026497,-0.16138,-0.057204,-0.096437,-0.019671,0.08136,0.041803,-0.071558
4,-0.083352,0.153699,-0.027584,-0.007801,-0.04783,0.166035,0.072846,0.086538,-0.128424,0.027295,0.113238,-0.157941,-0.036927,0.128308,0.128716,0.081869,0.051071,0.062541,0.040653,-0.001782,0.151487,0.06827,0.149471,0.136695,0.063509,0.105356,0.114887,-0.103254,-0.011722,-0.150983,-0.007995,0.067665,0.046286,-0.099054,0.117297,0.031851,0.042911,0.080167,-0.086988,0.103919,0.077865,-0.165304,0.005023,-0.063081,-0.064248,-0.202835,0.056097,0.071726,0.076948,-0.024863


In [17]:
# Convert the glove_data Series to a NumPy array
glove_feature_array = np.vstack(glove_data)


# Hybrid Model


In [18]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.0 (from python-Levenshtein)
  Downloading Levenshtein-0.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.25.0->python-Levenshtein)
  Downloading rapidfuzz-3.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.0 python-Levenshtein-0.25.0 rapidfuzz-3.7.0


In [19]:
import pandas as pd
from scipy.spatial.distance import cdist
from Levenshtein import distance as levenshtein_distance

In [20]:
# Model 1: Define a function to recommend similar items based on a user input product name
def recommend_similar_items(user_item_index, glove_feature_array, top_n=10):
    # Compute cosine similarities between the user item and all other items
    user_item_vector = glove_feature_array[user_item_index].reshape(1, -1)
    similarity_scores = cdist(user_item_vector, glove_feature_array, metric='cosine')

    # Get top N most similar items (excluding the user item itself)
    similar_items = sorted(enumerate(similarity_scores[0]), key=lambda x: x[1])[:top_n]

    return similar_items



In [21]:
# Model 2: Define a function to find the closest match for the user input among product names using Levenshtein Distance
def find_closest_match(user_input, product_names):
    return min(product_names, key=lambda x: levenshtein_distance(user_input, x))

In [22]:
def hybrid_recommendation(user_input, glove_feature_array, df):
    recommendations_list = []  # List to store recommendations

    # Model 1: Get recommendations based on the user input (similar product name)
    closest_product_name = find_closest_match(user_input, df['product_name'])
    if closest_product_name:
        user_item_index = df.index[df['product_name'] == closest_product_name][0]
        recommendations = recommend_similar_items(user_item_index, glove_feature_array)
        recommendations_list.extend((df.loc[i], score) for i, score in recommendations)

    # Model 2: Get recommendations based on the user input (closest matching product name)
    closest_match_recommendations = find_closest_match(user_input, df['product_name'])
    if closest_match_recommendations and closest_match_recommendations in df['product_name'].values:
        recommendations_list.append((df.loc[df['product_name'] == closest_match_recommendations], 0))  # Score set to 0

    # Sort recommendations by similarity score in descending order
    recommendations_list.sort(key=lambda x: x[1], reverse=True)

    # Combine recommendations from both models
    combined_recommendations = [{'Product Name': rec[0]['product_name'],
                                 'Similarity Score': rec[1],
                                 'Rating': rec[0]['rating'],
                                 'Brand Name': rec[0]['brand_name']}
                                for rec in recommendations_list]

    # Convert the list to a DataFrame
    recommendations_df = pd.DataFrame(combined_recommendations)

    return recommendations_df



In [23]:
# Example usage
user_input = "Casual Shirt for mens"  # Replace with the user input
hybrid_results = hybrid_recommendation(user_input, glove_feature_array, df)

# Display the hybrid recommendations DataFrame (sorted by similarity score)
print("Hybrid recommendations (sorted by similarity score):")
print(hybrid_results)

Hybrid recommendations (sorted by similarity score):
                                         Product Name  Similarity Score  \
0                            Regular Fit Casual Shirt          0.015980   
1                                  Solid Casual Shirt          0.014874   
2                                    Men Casual Shirt          0.014216   
3                                  Solid Casual Shirt          0.013921   
4                                    Men Casual Shirt          0.012189   
5                                    Men Casual Shirt          0.011011   
6                             Menchecked Casual Shirt          0.009608   
7                                Casual Checked Shirt          0.008929   
8                                        Casual Shirt          0.002801   
9                                        Casual Shirt          0.000000   
10  223       Casual Shirt
293       Casual Shirt
...          0.000000   

                                              