### Importing the dependencies

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
#downloading the Amazon Apparel dataset for the recommendation model
!gdown --id 1xWkZ70Fc0y0WvzI-4Iw8fbPI9hP_biZT

Downloading...
From: https://drive.google.com/uc?id=1xWkZ70Fc0y0WvzI-4Iw8fbPI9hP_biZT
To: /content/tops_fashion.json
100% 263M/263M [00:06<00:00, 39.9MB/s]


In [None]:
data = pd.read_json('/content/tops_fashion.json')

## Exploratory Data Analysis

In [None]:
df = pd.DataFrame(data)
df = df.drop(['sku', 'author', 'publisher','availability', 'reviews', 'large_image_url','availability_type','small_image_url','editorial_review','model','manufacturer'], axis=1) #dropping the unneccessary columns
df.head()
del data
#asin => amazon standard identification number - simply a unique 10-digit identifier used by amazon to identify products in it's database

In [None]:
#displaying the titles of 10 of the records in the clothing dataset
for i in range(10):
  print(df['title'][i]);
  plt.figure

Minions Como Superheroes Ironman Long Sleeve Round Neck T-Shirt For Women
FIG Clothing Womens Izo Tunic
FIG Clothing Womens Won Top
Focal18 Sailor Collar Bubble Sleeve Blouse Shirt Women Mori Girl Casual Top Harajuku
Featherlite Ladies' Long Sleeve Stain Resistant Tapered Twill Shirt, 2XL, Onyx Black/ Stone
[Fits Cloth] Grape Solid Modern Long Sleeve Plain T Shirt
Women's Unique 100% Cotton T - Special Olympics World Games 2015 White Size L
Floerns Women's Bell Sleeve Beading Casual Blouse Top
Standing on His Promises Rhinestones T-Shirt Ripped Cut Out Red Long
Fila Women's Tulip Durable Tennis Style Comfort Tank


In [None]:
print('Attributes present in the data') #listing down the attributes in the clothing dataset
for col in df.columns:
  print('->',col)

print('\nCount of the Unique values and Null values pertaining to each of the attributes in the dataset')
for col in df.columns:
  print('-> Column: {}, Total values {}, unique values {}, null values {}'.format(col, len(df[col]), len(set(list(df[col]))),sum(df[col].isnull().values)))

Attributes present in the data
-> asin
-> product_type_name
-> formatted_price
-> color
-> brand
-> title
-> medium_image_url
-> editorial_reivew

Count of the Unique values and Null values pertaining to each of the attributes in the dataset
-> Column: asin, Total values 183138, unique values 183138, null values 0
-> Column: product_type_name, Total values 183138, unique values 72, null values 0
-> Column: formatted_price, Total values 183138, unique values 3136, null values 154743
-> Column: color, Total values 183138, unique values 7381, null values 118182
-> Column: brand, Total values 183138, unique values 10578, null values 151
-> Column: title, Total values 183138, unique values 175985, null values 0
-> Column: medium_image_url, Total values 183138, unique values 170782, null values 0
-> Column: editorial_reivew, Total values 183138, unique values 128058, null values 2758


# Basis for Recommendation: Titles of clothes

## Preprocessing Steps
1. Removing products with very few words in it
2. Sorting all the data in increasing alphabetical order of the titles of clothes.
3. Removing titles that are very similar
3. TF-IDF vectorizer has been used to convert the words present in the titles into numerical features. TF-IDF helps us generate numerical features that are descriptive of the actual images of the clothing.

In [None]:
print('Total elements in the clothing list: ', len(df))
print('Attributes of an apparel: ', list(df.columns))
print('Total null elements in formatter_price', sum(df['formatted_price'].isnull().values))

Total elements in the clothing list:  183138
Attributes of an apparel:  ['asin', 'product_type_name', 'formatted_price', 'color', 'brand', 'title', 'medium_image_url', 'editorial_reivew']
Total null elements in formatter_price 154743


In [None]:
df = df.sort_values('title') #sorting the data in increasing alphabetical order of titles
df.reset_index(drop = True, inplace = True)

In [None]:
df.head(10) #shwoing the first 10 entries in the database

Unnamed: 0,asin,product_type_name,formatted_price,color,brand,title,medium_image_url,editorial_reivew
0,B008D30AGK,SHIRT,$7.51,Multicolored,Out+of+Print+Clothing,"""1984"" Retro Book Cover Women's SLim Fit T-Shi...",https://images-na.ssl-images-amazon.com/images...,
1,B071WD44WX,SHIRT,,,Sweet Claire,"""ASAP"" As Southern As Possible Tank Top, Heath...",https://images-na.ssl-images-amazon.com/images...,<p>THIS IS FOR THIS TANK IN HEATHER GREY WITH ...
2,B072M88RQ6,SHIRT,,,Sweet Claire,"""ASAP"" As Southern As Possible Tank Top, Off W...",https://images-na.ssl-images-amazon.com/images...,<p>THIS IS FOR THIS TANK IN OFF WHITE WITH BLA...
3,B0739MNWM2,ORCA_SHIRT,,,Natural Black Girl T-Shirt,"""Afro Bae"" Beautiful Black Women's Shirt",https://images-na.ssl-images-amazon.com/images...,
4,B01E0XLYHA,SHIRT,,Blue,GreaterGood,"""Ask Me About My Granddog"" T-Shirt",https://images-na.ssl-images-amazon.com/images...,
5,B01IADM09M,SHIRT,,,Acting Pro,"""Baby Bear"" Round Neck Racerback Tank Top",https://images-na.ssl-images-amazon.com/images...,Baby Bear graphic print relaxed fit tank with ...
6,B072J96Z94,SHIRT,,,Hoop Culture,"""Basketball Mom"" Women's T-Shirt - Hoop Culture",https://images-na.ssl-images-amazon.com/images...,Mom's represent your child in style with the o...
7,B01I5YLQAA,SHIRT,,,Acting Pro,"""Black Is My Happy Color"" Round Neck Tank Top",https://images-na.ssl-images-amazon.com/images...,Black Is My Happy Color graphic print relaxed ...
8,B01IC1F9TK,SHIRT,,,Acting Pro,"""Boheme Life"" Scoop Neck Racerback Tank Top",https://images-na.ssl-images-amazon.com/images...,"Graphic print, hip length tank top in a relaxe..."
9,B01I5XOLVM,SHIRT,,,Acting Pro,"""But First, Wine"" Round Neck Tank Top",https://images-na.ssl-images-amazon.com/images...,"But First, Wine graphic printed knit muscle ta..."


### **Cleaning the texts**

### Removing duplicates in data

In [None]:
df = df.drop_duplicates(subset = 'medium_image_url').reset_index().drop(['index'],axis=1) #removing entires with duplicates in medium_image_url attribute
df = df.drop_duplicates(subset = 'title').reset_index().drop(['index'],axis=1) #removing entries with duplicates in title attribute
print(len(df))

165698


### Removing the null entries

In [None]:
print('with null entries: ')
len(df)
df =df.loc[~df['color'].isnull()]
df =df.loc[~df['title'].isnull()]
df =df.loc[~df['formatted_price'].isnull()]
print('without null entries: ')
len(df)

with null entries: 
without null entries: 


18785

### Removing stop words

In [None]:
stopwrds = list(stopwords.words('english')) #list of stopwords in English language
l = list(df['title'])
for i in range(len(df['title'])):
  for j in range(len(stopwrds)):
    l[i] = l[i].replace(' '+stopwrds[j]+' ',' ') #removing stopwords from the titles of each of the entries in the clothing dataset
df['title']=l

In [None]:
#vectorization of text corpus
tokenizer = Tokenizer(oov_token="<UNK>") # oov_token = "<UNK>" replaces out-of-vocabulary words during text-to-sequence calls
tokenizer.fit_on_texts(list(df['title']))     #list of titles provided to train the tokenizer with respect to internal vocabulary
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(list(df['title'])) #text transformed to a sequence of integers - acquiring numerical features
sequences[0]
tok_sent = tokenizer.sequences_to_texts(sequences)  #each numerical sequence transformed back to text
df['title']=tok_sent

In [None]:
for col in df.columns:
  print('Column: {}, Total values {}, unique values {}, null values {}'.format(col, len(df[col]), len(set(list(df[col]))),sum(df[col].isnull().values)))

Column: asin, Total values 18785, unique values 18785, null values 0
Column: product_type_name, Total values 18785, unique values 54, null values 0
Column: formatted_price, Total values 18785, unique values 2928, null values 0
Column: color, Total values 18785, unique values 4243, null values 0
Column: brand, Total values 18785, unique values 3534, null values 55
Column: title, Total values 18785, unique values 18781, null values 0
Column: medium_image_url, Total values 18785, unique values 18785, null values 0
Column: editorial_reivew, Total values 18785, unique values 12691, null values 282


In [None]:
df.reset_index(drop=True,inplace=True)
indices = list(df.index)
df.head(5)

Unnamed: 0,asin,product_type_name,formatted_price,color,brand,title,medium_image_url,editorial_reivew
0,B008D30AGK,SHIRT,$7.51,Multicolored,Out+of+Print+Clothing,1984 retro book cover women's slim fit t shirt...,https://images-na.ssl-images-amazon.com/images...,
1,B003IDE8XQ,HOME,$18.99,Grey,Maggie's Organics,camisoles grey medium fair labor 1 pc,https://images-na.ssl-images-amazon.com/images...,
2,B00KI3VDXM,SHIRT,$39.99,Purple,Crazy4Bling,i wanna be adored long sleeve top shredded sho...,https://images-na.ssl-images-amazon.com/images...,Edgy but casual long sleeve graphic design top...
3,B073SKNQHD,SHIRT,$24.99,Premium Heather Gray,The Workout Princess,i workout to burn the crazy tri blend tank top...,https://images-na.ssl-images-amazon.com/images...,"Meet your new favorite basic. This daring, top..."
4,B06WRW8RQ1,SHIRT,$9.38,Black,AJ,life journey self help diy t shirt women silve...,https://images-na.ssl-images-amazon.com/images...,Product Details<br><br>5.4 oz 100% cotton (sol...


### Removing titles that are very similar to eachother

In [None]:
import itertools
deduped_idx = []
i = 0
j = 0
while i < len(df) and j < len(df):
    previous_i = i
    a = df['title'].loc[i].split()
    j = i+1
    while j < len(df):
        b = df['title'].loc[indices[j]].split()
        length = max(len(a), len(b))
        count  = 0
        for k in itertools.zip_longest(a,b):
            if (k[0] == k[1]):
                count += 1
        if (length - count) > 2:
            deduped_idx.append(i)
            i = j
            break
        else:
            j += 1
    if previous_i == i:
        break

In [None]:
df = df.iloc[deduped_idx]
df.reset_index(drop=True,inplace=True)
df.head(2)
list_titles = list(df['title'])
print(len(df))

16231


In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = CountVectorizer()
vectorizer
X = vectorizer.fit_transform(list_titles) #corpus is a list of sentences
analyze = vectorizer.build_analyzer()

In [None]:
#TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(list_titles)
print(X)

  (0, 10887)	0.24162674775713033
  (0, 2821)	0.28256912543572676
  (0, 7965)	0.18454254204583964
  (0, 7409)	0.2830132088074371
  (0, 8895)	0.11072608185516118
  (0, 4181)	0.23036773989767353
  (0, 9114)	0.2575014230132252
  (0, 10790)	0.17676862303918606
  (0, 3056)	0.3033675085286899
  (0, 2072)	0.4361048142979756
  (0, 8293)	0.3168455416233329
  (0, 239)	0.44958284739261867
  (1, 7581)	0.4406075473906304
  (1, 5856)	0.5251067623295452
  (1, 4008)	0.4887149312797496
  (1, 6597)	0.1833031335137284
  (1, 4737)	0.24216802520840175
  (1, 2410)	0.446139218286329
  (2, 8929)	0.3918231291967685
  (2, 8933)	0.47733677876623676
  (2, 10047)	0.09528794465563686
  (2, 9097)	0.1250912512361386
  (2, 6191)	0.157537172958671
  (2, 1072)	0.47733677876623676
  (2, 1729)	0.3407351604504286
  :	:
  (16228, 8218)	0.17761916965795227
  (16228, 8919)	0.16062180940505558
  (16228, 9763)	0.12928177732961466
  (16228, 10047)	0.08756986660979656
  (16228, 9097)	0.11495918213357324
  (16229, 10864)	0.46259314

In [None]:
#computing closest matches using the cosine similarity metric
def closestkrecom(idx,k):
  dist = sklearn.metrics.pairwise_distances( X, X[idx], metric='cosine') #cosine similarity - to measure similarity between two vectors
  dist = np.squeeze(dist)
  #print(dist.shape)
  sort_idxs = np.random.choice(np.argsort(dist,kind = 'mergesort', axis=0)[1:],k) #argsort returns indices that would sort an array
  return sort_idxs

In [None]:
import random
import tensorflow as tf

In [None]:
#Apparel Recommendation
for idx in random.sample(list(np.arange(len(df))), 20):
  print('Query Title-->', df['title'][idx]) #idx is the index of the query title
  for j in closestkrecom(idx,5): #here, j provides the indices of 5 closest recommendations of title stored in df['title'][idx]
    print('   ',df['title'][j])
  print()

Query Title--> i don't sweat i sparkle custom t shirt yellow top for juniors size small
    bobeau womens medium surplice stretch knit tank top
    floral grid printed tunic size large
    mamisari nap queen crop top tee lazy sleep tired shirt tshirt
    american rag womens classic black plus size embroidered sirt 2x
    kersh ladies' french terry boatneck top black xxl

Query Title--> violeta by mango women's plus size ruffled velvet top pink s
    green floral art wear fringe caftan tunic top plus size oversized
    merona womens black checkered 3 4 button down top small
    eva varro peek a boo shoulders striped tunic black white gold 2x
    felina
    brooke leigh women's double front pocket plaid shirt black white red xl

Query Title--> max mara women's alfio silk tunic sz 8 black
    women's summer pastel candy colored tops tees lavender
    catherine malandrino freya polyester ivory blouse xs
    womens sexy layered round neck falbala sleeve short chiffon blouse shirt black
    