In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
def combined_feature(row):
    return row['title'] + " "+ row['short_description'] + " " + row['slug']+ " " + row['detail_description']

In [6]:
df = pd.read_csv('grocery.csv')

In [9]:
features = ['title', 'short_description', 'detail_description','slug']
for feature in features:
    df[feature] = df[feature].fillna('')

In [10]:
 df["combined_features"] = df.apply(combined_feature, axis=1)

In [11]:
 df["combined_features"] 

0       Abhi Extra Virgin Olive Oil Cooking oil & Ghe...
1      Bacha Cashew Nut Varieties of dry fuits produc...
2      Halo Extra Virgin Olive Oil Cooking oil & Ghee...
3      Safal Gold Sunflower Oil Cooking oil & Ghee pr...
4      Saffi Gold Sunflower Oil Cooking oil & Ghee pr...
                             ...                        
296    Wine - Prosecco Valdobiaddene Liquor products ...
297    Cranberry Wine   Liquor products are available...
298    Gooseberry Grape Wine Liquor products are avai...
299    Red Wine Liquor products are available red-win...
300    Fresh Yogurt Dairy & Bakery products are avail...
Name: combined_features, Length: 301, dtype: object

In [13]:
cv= CountVectorizer()

In [14]:
 count_matrix = cv.fit_transform(df["combined_features"])

In [16]:
 count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 2, 0]], dtype=int64)

In [40]:
def get_index_from_title(title):
    return df.index[df.title==title][0]

In [41]:
cosine_sim = cosine_similarity(count_matrix)

In [42]:
cosine_sim

array([[1.        , 0.26311741, 0.92307692, ..., 0.31448545, 0.33968311,
        0.31448545],
       [0.26311741, 1.        , 0.26311741, ..., 0.35856858, 0.38729833,
        0.35856858],
       [0.92307692, 0.26311741, 1.        , ..., 0.31448545, 0.33968311,
        0.31448545],
       ...,
       [0.31448545, 0.35856858, 0.31448545, ..., 1.        , 0.77151675,
        0.42857143],
       [0.33968311, 0.38729833, 0.33968311, ..., 0.77151675, 1.        ,
        0.46291005],
       [0.31448545, 0.35856858, 0.31448545, ..., 0.42857143, 0.46291005,
        1.        ]])

In [43]:
grocery = "Fresh Yogurt"

In [44]:
grocery_index = get_index_from_title(grocery)

In [45]:
similar_products = list(enumerate(cosine_sim[grocery_index]))

In [72]:
similar_products

[(0, 0.3144854510165755),
 (1, 0.3585685828003181),
 (2, 0.3144854510165755),
 (3, 0.3273268353539886),
 (4, 0.3273268353539886),
 (5, 0.3418817293789138),
 (6, 0.3418817293789138),
 (7, 0.27602622373694163),
 (8, 0.3418817293789138),
 (9, 0.23904572186687872),
 (10, 0.3418817293789138),
 (11, 0.0),
 (12, 0.3418817293789138),
 (13, 0.2279211529192759),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0),
 (18, 0.0),
 (19, 0.2519763153394848),
 (20, 0.2519763153394848),
 (21, 0.6681531047810608),
 (22, 0.2519763153394848),
 (23, 0.26726124191242434),
 (24, 0.3418817293789138),
 (25, 0.0),
 (26, 0.3585685828003181),
 (27, 0.1951800145897066),
 (28, 0.0),
 (29, 0.0),
 (30, 0.20203050891044214),
 (31, 0.3585685828003181),
 (32, 0.4211174438061578),
 (33, 0.3144854510165755),
 (34, 0.2182178902359924),
 (35, 0.2279211529192759),
 (36, 0.26726124191242434),
 (37, 0.3144854510165755),
 (38, 0.20965696734438366),
 (39, 0.3418817293789138),
 (40, 0.13363062095621217),
 (41, 0.0),
 (42, 0.0),
 (43, 0

In [46]:
 sorted_similar_products = sorted(
        similar_products, key=lambda x: x[1], reverse=True)

In [71]:
sorted_similar_products

[(300, 0.9999999999999997),
 (21, 0.6681531047810608),
 (74, 0.6546536707079772),
 (165, 0.6546536707079772),
 (72, 0.6299407883487119),
 (76, 0.6299407883487119),
 (189, 0.6299407883487119),
 (71, 0.6060915267313264),
 (138, 0.5976143046671968),
 (187, 0.5714285714285713),
 (254, 0.561489925074877),
 (108, 0.47809144373375745),
 (47, 0.4629100498862757),
 (297, 0.4629100498862757),
 (299, 0.4629100498862757),
 (121, 0.4285714285714285),
 (296, 0.4285714285714285),
 (298, 0.4285714285714285),
 (32, 0.4211174438061578),
 (284, 0.4211174438061578),
 (48, 0.40089186286863654),
 (115, 0.39477101697586137),
 (46, 0.3779644730092272),
 (84, 0.3779644730092272),
 (103, 0.3779644730092272),
 (233, 0.3779644730092272),
 (270, 0.3779644730092272),
 (1, 0.3585685828003181),
 (26, 0.3585685828003181),
 (31, 0.3585685828003181),
 (68, 0.3585685828003181),
 (97, 0.3585685828003181),
 (171, 0.3585685828003181),
 (180, 0.3585685828003181),
 (181, 0.3585685828003181),
 (197, 0.3585685828003181),
 (219,

In [73]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

In [74]:
i=1

In [75]:
for product in sorted_similar_products:
    print(get_title_from_index(grocery[1]))
    i=i+1

    if i > 15:
        break

IndexError: index 0 is out of bounds for axis 0 with size 0