# FORMING THE FUNCTION FOR RANKING THE URLS DEPENDING ON MAX SIMILARITY WITH THE INPUT TEXT AND GIVING THE TOP 10 URLS

In [31]:
#importing the libraries to be used
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import math

import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
#from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel



In [32]:
#getting the dataset and preprocessiing it
columns=['userId','FABRIC NAME', 'COLOR_OF_DRESS','INT price','PRICE','URL']

cloth = pd.read_csv("clothingFINAL_details.csv")

In [33]:
cloth.head()

Unnamed: 0,S.No,FABRIC_NAME,COLOR_OF_DRESS,price,PRICE,URL
0,0,Georgette Stitched Flared/A-line Gown,Brown,799,₹799,https://www.flipkart.com/vintage-colour-flared...
1,1,Applique Silk Blend Stitched Flared/A-line Gown,Red,399,₹399,https://www.flipkart.com/vintage-colour-flared...
2,2,Printed Crepe Stitched Anarkali Gown,Beige,298,₹298,https://www.flipkart.com/vintage-colour-flared...
3,3,Embroidered Net Semi Stitched Anarkali Gown,Pink,499,₹499,https://www.flipkart.com/vintage-colour-flared...
4,4,Printed Crepe Stitched Anarkali Gown,Beige,298,₹298,https://www.flipkart.com/vintage-colour-flared...


In [34]:
cloth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   S.No            240 non-null    int64 
 1   FABRIC_NAME     240 non-null    object
 2   COLOR_OF_DRESS  240 non-null    object
 3   price           240 non-null    int64 
 4   PRICE           240 non-null    object
 5   URL             240 non-null    object
dtypes: int64(2), object(4)
memory usage: 11.4+ KB


In [35]:
cloth.shape

(240, 6)

In [36]:
cloth.dtypes

S.No               int64
FABRIC_NAME       object
COLOR_OF_DRESS    object
price              int64
PRICE             object
URL               object
dtype: object

In [37]:
cloth['PRICE'].describe().transpose()


count      240
unique      85
top       ₹499
freq        43
Name: PRICE, dtype: object

In [38]:
#converting str to int
cloth['price'].astype(str).astype(int)
print('Minimum price is: %d' %(cloth.price.min()))
print('Maximum price is: %d' %(cloth.price.max()))



0      799
1      399
2      298
3      499
4      298
      ... 
235    379
236    368
237    495
238    449
239    449
Name: price, Length: 240, dtype: int32

Minimum price is: 199
Maximum price is: 999


In [39]:
#finding missing values
print('Number of missing values across columns: \n',cloth.isnull().sum())


Number of missing values across columns: 
 S.No              0
FABRIC_NAME       0
COLOR_OF_DRESS    0
price             0
PRICE             0
URL               0
dtype: int64


with sns.axes_style('white'):
    g = sns.factorplot("price", data=cloth, aspect=2.0,kind='count')
    g.set_ylabels("Total number of ratings") 


In [40]:

print('Number of unique colors in Raw data = ', cloth['COLOR_OF_DRESS'].nunique())
print('Number of unique price in Raw data = ', cloth['PRICE'].nunique())


Number of unique colors in Raw data =  31
Number of unique price in Raw data =  85


In [41]:
#sorting based on count
most_priced=cloth.groupby('price').size().sort_values(ascending=False)[:10]
print('Top 10 frequency of prices cloths available: \n',most_priced)


Top 10 frequency of prices cloths available: 
 price
499    43
399    22
649    14
284    12
699    11
799     8
298     6
749     5
779     5
769     5
dtype: int64


In [42]:
counts=cloth.COLOR_OF_DRESS.value_counts()
cloth_final=cloth[cloth.COLOR_OF_DRESS.isin(counts[counts>=11].index)]
#print('Number of users who have rated 25 or more items =', len(cloth_final))



In [43]:
cloth.head(1)['FABRIC_NAME']

0    Georgette Stitched Flared/A-line Gown
Name: FABRIC_NAME, dtype: object

In [44]:
#using TfidVectorizer for calculating similarity
from sklearn.feature_extraction.text import TfidfVectorizer
#ignoring the common english words such as( a, an , the etc ) and also commas and symbols while comparing
tfv= TfidfVectorizer(min_df=3, max_features=None , strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                    ngram_range=(1,3),stop_words='english')
cloth['FABRIC_NAME']=cloth['FABRIC_NAME'].fillna('')

In [45]:
tfv_matrix= tfv.fit_transform(cloth['FABRIC_NAME'])

In [46]:
tfv_matrix

<240x121 sparse matrix of type '<class 'numpy.float64'>'
	with 3399 stored elements in Compressed Sparse Row format>

In [47]:
tfv_matrix.shape


(240, 121)

In [48]:
from sklearn.metrics.pairwise import sigmoid_kernel

In [49]:
sig=sigmoid_kernel(tfv_matrix,tfv_matrix)

In [50]:
sig[0]

array([0.76504323, 0.76291186, 0.76171909, 0.76171367, 0.76171909,
       0.76171909, 0.76171909, 0.76291186, 0.76171909, 0.76171909,
       0.76409083, 0.76171909, 0.76273809, 0.76171367, 0.76168731,
       0.76171909, 0.76171909, 0.76171367, 0.76409083, 0.76171367,
       0.76171909, 0.76171909, 0.76504323, 0.76409083, 0.76227177,
       0.76171367, 0.76171367, 0.76171909, 0.76171909, 0.76174138,
       0.76167399, 0.76239193, 0.76227177, 0.76174138, 0.76171909,
       0.76167402, 0.76168004, 0.76213897, 0.76227177, 0.76171909,
       0.76159416, 0.76184156, 0.76409083, 0.76171367, 0.76304222,
       0.76173829, 0.76171367, 0.76504323, 0.76393561, 0.76301252,
       0.76169484, 0.76291186, 0.76171909, 0.76227177, 0.76174138,
       0.76171909, 0.76267734, 0.76267734, 0.76171909, 0.76187351,
       0.76291186, 0.76171367, 0.76239193, 0.76409083, 0.76304222,
       0.76168004, 0.76171367, 0.7617065 , 0.76174138, 0.76171367,
       0.76168004, 0.76171909, 0.76171909, 0.76282054, 0.76171

In [51]:
#giving indexes to each product
indices=pd.Series(cloth.index,index=cloth['FABRIC_NAME']).drop_duplicates()

In [52]:
indices

FABRIC_NAME
Georgette Stitched Flared/A-line Gown                     0
Applique Silk Blend Stitched Flared/A-line Gown           1
Printed Crepe Stitched Anarkali Gown                      2
Embroidered Net Semi Stitched Anarkali Gown               3
Printed Crepe Stitched Anarkali Gown                      4
                                                       ... 
Printed Rayon Blend Semi Stitched Flared/A-line Gown    235
Printed Rayon Blend Stitched Anarkali Gown              236
Rayon Blend Stitched Anarkali Gown                      237
Printed Rayon Blend Stitched Flared/A-line Gown         238
Printed Cotton Blend Stitched Flared/A-line Gown        239
Length: 240, dtype: int64

In [53]:
indices['Rayon Blend Stitched Anarkali Gown']

FABRIC_NAME
Rayon Blend Stitched Anarkali Gown     45
Rayon Blend Stitched Anarkali Gown    184
Rayon Blend Stitched Anarkali Gown    219
Rayon Blend Stitched Anarkali Gown    221
Rayon Blend Stitched Anarkali Gown    237
dtype: int64

In [54]:
sig[45]

array([0.76173829, 0.76213478, 0.76225288, 0.76222435, 0.76225288,
       0.76225288, 0.76225288, 0.76213478, 0.76225288, 0.76225288,
       0.76169832, 0.76225288, 0.76255243, 0.76222435, 0.76382796,
       0.76225288, 0.76225288, 0.76222435, 0.76169832, 0.76222435,
       0.76225288, 0.76225288, 0.76173829, 0.76169832, 0.76216171,
       0.76222435, 0.76222435, 0.76225288, 0.76225288, 0.7623703 ,
       0.76350958, 0.76226238, 0.76216171, 0.7623703 , 0.76225288,
       0.7622008 , 0.76365443, 0.76205047, 0.76216171, 0.76225288,
       0.76159416, 0.76289742, 0.76169832, 0.76222435, 0.76169496,
       0.76504323, 0.76222435, 0.76173829, 0.76169181, 0.76169288,
       0.76400777, 0.76213478, 0.76225288, 0.76216171, 0.7623703 ,
       0.76225288, 0.76166951, 0.76166951, 0.76225288, 0.76162647,
       0.76213478, 0.76222435, 0.76226238, 0.76169832, 0.76169496,
       0.76365443, 0.76222435, 0.764286  , 0.7623703 , 0.76222435,
       0.76365443, 0.76225288, 0.76225288, 0.76164163, 0.76222

In [55]:
list(enumerate(sig[indices['Rayon Blend Stitched Anarkali Gown']]))

[(0,
  array([0.76173829, 0.76213478, 0.76225288, 0.76222435, 0.76225288,
         0.76225288, 0.76225288, 0.76213478, 0.76225288, 0.76225288,
         0.76169832, 0.76225288, 0.76255243, 0.76222435, 0.76382796,
         0.76225288, 0.76225288, 0.76222435, 0.76169832, 0.76222435,
         0.76225288, 0.76225288, 0.76173829, 0.76169832, 0.76216171,
         0.76222435, 0.76222435, 0.76225288, 0.76225288, 0.7623703 ,
         0.76350958, 0.76226238, 0.76216171, 0.7623703 , 0.76225288,
         0.7622008 , 0.76365443, 0.76205047, 0.76216171, 0.76225288,
         0.76159416, 0.76289742, 0.76169832, 0.76222435, 0.76169496,
         0.76504323, 0.76222435, 0.76173829, 0.76169181, 0.76169288,
         0.76400777, 0.76213478, 0.76225288, 0.76216171, 0.7623703 ,
         0.76225288, 0.76166951, 0.76166951, 0.76225288, 0.76162647,
         0.76213478, 0.76222435, 0.76226238, 0.76169832, 0.76169496,
         0.76365443, 0.76222435, 0.764286  , 0.7623703 , 0.76222435,
         0.76365443, 0.762252

In [56]:
sorted(list(enumerate(sig[indices['Rayon Blend Stitched Anarkali Gown']])))

[(0,
  array([0.76173829, 0.76213478, 0.76225288, 0.76222435, 0.76225288,
         0.76225288, 0.76225288, 0.76213478, 0.76225288, 0.76225288,
         0.76169832, 0.76225288, 0.76255243, 0.76222435, 0.76382796,
         0.76225288, 0.76225288, 0.76222435, 0.76169832, 0.76222435,
         0.76225288, 0.76225288, 0.76173829, 0.76169832, 0.76216171,
         0.76222435, 0.76222435, 0.76225288, 0.76225288, 0.7623703 ,
         0.76350958, 0.76226238, 0.76216171, 0.7623703 , 0.76225288,
         0.7622008 , 0.76365443, 0.76205047, 0.76216171, 0.76225288,
         0.76159416, 0.76289742, 0.76169832, 0.76222435, 0.76169496,
         0.76504323, 0.76222435, 0.76173829, 0.76169181, 0.76169288,
         0.76400777, 0.76213478, 0.76225288, 0.76216171, 0.7623703 ,
         0.76225288, 0.76166951, 0.76166951, 0.76225288, 0.76162647,
         0.76213478, 0.76222435, 0.76226238, 0.76169832, 0.76169496,
         0.76365443, 0.76222435, 0.764286  , 0.7623703 , 0.76222435,
         0.76365443, 0.762252

In [57]:
#sorted(list(enumerate(sig[indices['Rayon Blend Stitched Anarkali Gown']])),key=lambda x: x[1],reverse=True)

In [58]:
#making function for ranking top 10 most similar products based on the given input features 
#here i have giving input in fabric name 
#but we can also calculate it based on any other parameter such as color, price or combination of any two
def give_rec(title, sig=sig):
    idx=indices[title]
    sig_scores=list(enumerate(sig[idx]))
    #sig_scores=sorted(sig_scores, key=lambda x: x[1], reverse =True)
    sig_scores=sig_scores[1:11]
    cloth_indices= [i[0] for i in sig_scores]
    #print prices
    return cloth['PRICE'].iloc[cloth_indices]
def give_rec1(title, sig=sig):
    idx=indices[title]
    sig_scores=list(enumerate(sig[idx]))
    #sig_scores=sorted(sig_scores, key=lambda x: x[1], reverse =True)
    sig_scores=sig_scores[1:11]
    cloth_indices= [i[0] for i in sig_scores]
    #print URLS
    return cloth['URL'].iloc[cloth_indices]


In [59]:
give_rec('Printed Crepe Stitched Anarkali Gown')

1     ₹399
2     ₹298
3     ₹499
4     ₹298
5     ₹298
6     ₹298
7     ₹399
8     ₹300
9     ₹298
10    ₹499
Name: PRICE, dtype: object

In [60]:
give_rec1('Printed Crepe Stitched Anarkali Gown')

1     https://www.flipkart.com/vintage-colour-flared...
2     https://www.flipkart.com/vintage-colour-flared...
3     https://www.flipkart.com/vintage-colour-flared...
4     https://www.flipkart.com/vintage-colour-flared...
5     https://www.flipkart.com/vintage-colour-flared...
6     https://www.flipkart.com/vintage-colour-flared...
7     https://www.flipkart.com/vintage-colour-flared...
8     https://www.flipkart.com/vintage-colour-flared...
9     https://www.flipkart.com/vintage-colour-flared...
10    https://www.flipkart.com/vintage-colour-flared...
Name: URL, dtype: object