In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import re

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.metrics import pairwise_distances

In [36]:
from sklearn.preprocessing import Normalizer

## Load the data

In [46]:
df=pd.read_csv("flipkart_com-ecommerce_sample.csv")

## Pre Processing

In [47]:
df.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

In [48]:
df=df.drop_duplicates(subset='product_name')

In [50]:
original_df=df.copy()

In [51]:
dropCols=['uniq_id','crawl_timestamp','product_url','image','is_FK_Advantage_product','pid']

In [52]:
df=df.drop(dropCols,axis=1)

In [53]:
df.isna().sum()

product_name                 0
product_category_tree        0
retail_price                56
discounted_price            56
description                  1
product_rating               0
overall_rating               0
brand                     3762
product_specifications       8
dtype: int64

In [12]:
df.product_rating=df.product_rating.replace("No rating available",0)
df.overall_rating=df.overall_rating.replace("No rating available",0)

In [13]:
df.product_rating=df.product_rating.astype("float")
df.overall_rating=df.overall_rating.astype("float")

In [14]:
def removeUnwantedElements(strValue):
#     print(strValue)
    strValue=re.sub("[^a-zA-Z]"," ",strValue)
    strValue=strValue.split()
    newStr="";
    for eachStr in strValue:
        if(len(eachStr)>2):
            newStr=newStr+" "+eachStr
    return newStr

In [15]:
for col in df.columns:
    if(df[col].dtypes=='object'):
        df[col]=df[col].fillna("")
        df[col]=df[col].apply(lambda rowVal: removeUnwantedElements(rowVal))
    else:
        df[col]=df[col].fillna(0).astype("float")


## Converting brand to dummies

In [16]:
df=pd.concat([df,pd.get_dummies(df.brand)],axis=1)

In [17]:
df.drop(['brand'],axis=1,inplace=True)

In [18]:
original_df.columns

Index(['product_name', 'product_category_tree', 'retail_price',
       'discounted_price', 'description', 'product_rating', 'overall_rating',
       'brand', 'product_specifications'],
      dtype='object')

## Doing TF IDF Vectorizer on each Column

In [19]:
naCount=df.isna().sum()
naCount[naCount>0]

Series([], dtype: int64)

In [20]:
df.columns

Index(['product_name', 'product_category_tree', 'retail_price',
       'discounted_price', 'description', 'product_rating', 'overall_rating',
       'product_specifications', '', ' AAKAR',
       ...
       ' vinay', ' vinaya', ' walletsnbags', ' wallskart', ' womaniya',
       ' xpert', ' youniqueshop', ' zDelhi com', ' zaidis', ' zasmina'],
      dtype='object', length=3396)

In [21]:
vectorColumns=['product_name','product_category_tree','description','product_specifications']

In [22]:
def vectorizeAndAppendDataFrame(columnName,dataFrame,min_df):
    vectorizer=TfidfVectorizer(analyzer="word",min_df=min_df,stop_words="english")
    x=vectorizer.fit_transform(df[columnName])
    x=x.toarray()
    x=pd.DataFrame(x)
    vocab=vectorizer.get_feature_names()
    x.columns=[columnName+"_"+str(col) for col in x.columns]
    dataFrame=dataFrame.drop([columnName],axis=1)
    dataFrame=pd.concat([dataFrame,x],axis=1)
    return dataFrame,vocab

In [31]:
df=df.reset_index()
df=df.drop(['index'],axis=1)

In [32]:
for col in vectorColumns:
    df,vocab=vectorizeAndAppendDataFrame(col,df,0.01)
    print(col,"has vocabulary of ",len(vocab))

product_name has vocabulary of  73
product_category_tree has vocabulary of  114
description has vocabulary of  383
product_specifications has vocabulary of  370


In [33]:
naCount=df.isna().sum()
naCount[naCount>0]

Series([], dtype: int64)

In [34]:
len(df.columns)

4332

## Normalizing Price

In [37]:
df[['retail_price','discounted_price']]=Normalizer().fit_transform(df[['retail_price','discounted_price']])

## Doing pairwise Similarity

In [40]:
simResult=pairwise_distances(df,metric="cosine")

In [41]:
similarityDf=pd.DataFrame(simResult)

In [42]:
similarityDf.shape

(12676, 12676)

In [54]:
similarityDf.columns=original_df.pid

In [55]:
similarityDf.index=original_df.pid

In [57]:
similarityDf.head()

pid,SRTEH2FF9KEDEFGF,SBEEH3QGU7MFYJFY,SHOEH4GRSUBJGZXE,PSOEH3ZYDMSYARJ5,PWTEB7H2E4KCYUE3,SHOEH3DZBFR88SCK,SHOEH4KM2W3Z6EH5,SWIEHF3EF5PZAZUY,PSOEH3ZYYFETGCCD,BOTEGYTZ2T6WUJMM,...,STIE5UVGW2JWVWCT,STIEC889ZGFD3RCE,STIE2KFZTQFVYZQQ,SHODYZ6SYRMHDYPB,STIE9F5UGVJFQYZH,SNDEY8UH5TZ2AJCK,STIE2ZEPACRQJKH7,SNDDX969ZZJJKSHB,SNDEF3R3VB55PJBX,STIE4NXGSXG5GFR2
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRTEH2FF9KEDEFGF,0.0,0.74823,0.677103,0.704454,0.786227,0.702858,0.904756,0.584718,0.869076,0.721611,...,0.724763,0.721104,0.725121,0.896133,0.730894,0.761778,0.89472,0.905138,0.63067,0.708748
SBEEH3QGU7MFYJFY,0.74823,0.0,0.699197,0.707298,0.732622,0.714319,0.909893,0.74332,0.884869,0.74993,...,0.719539,0.719794,0.714548,0.906187,0.717695,0.842542,0.895019,0.922335,0.769514,0.699828
SHOEH4GRSUBJGZXE,0.677103,0.699197,0.0,0.67753,0.752733,0.410047,0.807017,0.701619,0.854728,0.692572,...,0.69605,0.692944,0.694068,0.735716,0.696845,0.56132,0.885075,0.793583,0.312889,0.649572
PSOEH3ZYDMSYARJ5,0.704454,0.707298,0.67753,0.0,0.730982,0.727762,0.905822,0.738307,0.140018,0.589886,...,0.662802,0.652499,0.609941,0.896621,0.639902,0.817176,0.862169,0.911898,0.746795,0.633235
PWTEB7H2E4KCYUE3,0.786227,0.732622,0.752733,0.730982,0.0,0.710432,0.922688,0.790915,0.929189,0.755294,...,0.764543,0.756334,0.73277,0.914296,0.729758,0.849557,0.90587,0.926107,0.795336,0.744786


## Saving the distance measure to a file

In [104]:
similarityDf.to_csv("flipkartProductCosineDistance.csv")

In [98]:
similarityDf['SRTEH2FF9KEDEFGF'].nsmallest(6).index

Index(['SRTEH2FF9KEDEFGF', 'SRTEHVURZFRUZUCZ', 'BXREJYMSM2HRZYVR',
       'TKPEGM48GGD2QKZN', 'BBOEGGV3NYMKAFMA'],
      dtype='object', name='pid')

## Simulating Simple Reco System

In [65]:
from ipywidgets import widgets

In [102]:
def getnLargestSimilarItems(product_id):
    return list(similarityDf[str(product_id)].nsmallest(6).index),list(similarityDf[product_id].nsmallest(6).values)

def getProductInfo(product_id):
    itemRow=original_df[original_df.pid==str(product_id)]
    pName=itemRow.product_name
    pDesc=itemRow.description
    pProductSpec=itemRow.product_specifications
    pCat=itemRow.product_category_tree
    return pName,pDesc,pProductSpec,pCat

def printProductInfo(product_id):
    name,desc,spec,cat=getProductInfo(product_id)
    print("Name:",name)
    print("Desc:",desc)
    print("Spec:",spec)
    print("cat:",cat)
    
def getProductIdAndRecommend(sender):
    product_id=sender.value;
    itemRow=original_df[original_df.pid==str(product_id)]
    if(str(product_id) in similarityDf.columns):
        similarProductList,similarityScore=getnLargestSimilarItems(product_id)
        print("Current Products Below::")
        printProductInfo(product_id)
        print("\nPrint Similar Products\n");
        for i in range(len(similarProductList)):
            if(similarProductList[i]!=product_id):
                printProductInfo(similarProductList[i])
                print("Similarity Score of this product is ",(1-similarityScore[i]))
                print("\n\n")

In [103]:
text=widgets.Text()
display(text)
def handle_submit(asd):
    print(asd.value)
text.on_submit(getProductIdAndRecommend)

Text(value='')

Current Products Below::
Name: 2    AW Bellies
Name: product_name, dtype: object
Desc: 2    Key Features of AW Bellies Sandals Wedges Heel...
Name: description, dtype: object
Spec: 2    {"product_specification"=>[{"key"=>"Ideal For"...
Name: product_specifications, dtype: object
cat: 2    ["Footwear >> Women's Footwear >> Ballerinas >...
Name: product_category_tree, dtype: object

Print Similar Products

Name: 160    Selfie Boots
Name: product_name, dtype: object
Desc: 160    Selfie Boots\n                         Price: ...
Name: description, dtype: object
Spec: 160    {"product_specification"=>[{"key"=>"Ideal For"...
Name: product_specifications, dtype: object
cat: 160    ["Footwear >> Women's Footwear >> Casual Shoes...
Name: product_category_tree, dtype: object
Similarity Score of this product is  0.7594917387601814



Name: 242    Salt N Pepper 12-298 Taupe Boots
Name: product_name, dtype: object
Desc: 242    Salt N Pepper 12-298 Taupe Boots\n            ...
Name: description, dty