In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy, re, nltk
#!python -m spacy download en

# # For cleaning and processing
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
# # nltk.download('stopwords')
# # nltk.download('punkt')

from sklearn.metrics.pairwise import cosine_similarity

In [2]:
comprehensive_stopwords = ["a","about","above","after","again","against","ain","all","am","an","and","any","are","aren","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can","couldn","couldn't","d","did","didn","didn't","do","does","doesn","doesn't","doing","don","don't","down","during","each","few","for","from","further","had","hadn","hadn't","has","hasn","hasn't","have","haven","haven't","having","he","her","here","hers","herself","him","himself","his","how","i","if","in","into","is","isn","isn't","it","it's","its","itself","just","ll","m","ma","me","mightn","mightn't","more","most","mustn","mustn't","my","myself","needn","needn't","no","nor","not","now","o","of","off","on","once","only","or","other","our","ours","ourselves","out","over","own","re","s","same","shan","shan't","she","she's","should","should've","shouldn","shouldn't","so","some","such","t","than","that","that'll","the","their","theirs","them","themselves","then","there","these","they","this","those","through","to","too","under","until","up","ve","very","was","wasn","wasn't","we","were","weren","weren't","what","when","where","which","while","who","whom","why","will","with","won","won't","wouldn","wouldn't","y","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","could","he'd","he'll","he's","here's","how's","i'd","i'll","i'm","i've","let's","ought","she'd","she'll","that's","there's","they'd","they'll","they're","they've","we'd","we'll","we're","we've","what's","when's","where's","who's","why's","would","able","abst","accordance","according","accordingly","across","act","actually","added","adj","affected","affecting","affects","afterwards","ah","almost","alone","along","already","also","although","always","among","amongst","announce","another","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","apparently","approximately","arent","arise","around","aside","ask","asking","auth","available","away","awfully","b","back","became","become","becomes","becoming","beforehand","begin","beginning","beginnings","begins","behind","believe","beside","besides","beyond","biol","brief","briefly","c","ca","came","cannot","can't","cause","causes","certain","certainly","co","com","come","comes","contain","containing","contains","couldnt","date","different","done","downwards","due","e","ed","edu","effect","eg","eight","eighty","either","else","elsewhere","end","ending","enough","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","except","f","far","ff","fifth","first","five","fix","followed","following","follows","former","formerly","forth","found","four","furthermore","g","gave","get","gets","getting","give","given","gives","giving","go","goes","gone","got","gotten","h","happens","hardly","hed","hence","hereafter","hereby","herein","heres","hereupon","hes","hi","hid","hither","home","howbeit","however","hundred","id","ie","im","immediate","immediately","importance","important","inc","indeed","index","information","instead","invention","inward","itd","it'll","j","k","keep","keeps","kept","kg","km","know","known","knows","l","largely","last","lately","later","latter","latterly","least","less","lest","let","lets","like","liked","likely","line","little","'ll","look","looking","looks","ltd","made","mainly","make","makes","many","may","maybe","mean","means","meantime","meanwhile","merely","mg","might","million","miss","ml","moreover","mostly","mr","mrs","much","mug","must","n","na","name","namely","nay","nd","near","nearly","necessarily","necessary","need","needs","neither","never","nevertheless","new","next","nine","ninety","nobody","non","none","nonetheless","noone","normally","nos","noted","nothing","nowhere","obtain","obtained","obviously","often","oh","ok","okay","old","omitted","one","ones","onto","ord","others","otherwise","outside","overall","owing","p","page","pages","part","particular","particularly","past","per","perhaps","placed","please","plus","poorly","possible","possibly","potentially","pp","predominantly","present","previously","primarily","probably","promptly","proud","provides","put","q","que","quickly","quite","qv","r","ran","rather","rd","readily","really","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","respectively","resulted","resulting","results","right","run","said","saw","say","saying","says","sec","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sent","seven","several","shall","shed","shes","show","showed","shown","showns","shows","significant","significantly","similar","similarly","since","six","slightly","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specifically","specified","specify","specifying","still","stop","strongly","sub","substantially","successfully","sufficiently","suggest","sup","sure","take","taken","taking","tell","tends","th","thank","thanks","thanx","thats","that've","thence","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","thereto","thereupon","there've","theyd","theyre","think","thou","though","thoughh","thousand","throug","throughout","thru","thus","til","tip","together","took","toward","towards","tried","tries","truly","try","trying","ts","twice","two","u","un","unfortunately","unless","unlike","unlikely","unto","upon","ups","us","use","used","useful","usefully","usefulness","uses","using","usually","v","value","various","'ve","via","viz","vol","vols","vs","w","want","wants","wasnt","way","wed","welcome","went","werent","whatever","what'll","whats","whence","whenever","whereafter","whereas","whereby","wherein","wheres","whereupon","wherever","whether","whim","whither","whod","whoever","whole","who'll","whomever","whos","whose","widely","willing","wish","within","without","wont","words","world","wouldnt","www","x","yes","yet","youd","youre","z","zero","a's","ain't","allow","allows","apart","appear","appreciate","appropriate","associated","best","better","c'mon","c's","cant","changes","clearly","concerning","consequently","consider","considering","corresponding","course","currently","definitely","described","despite","entirely","exactly","example","going","greetings","hello","help","hopefully","ignored","inasmuch","indicate","indicated","indicates","inner","insofar","it'd","keep","keeps","novel","presumably","reasonably","second","secondly","sensible","serious","seriously","sure","t's","third","thorough","thoroughly","three","well","wonder"]

In [3]:
df_original = pd.read_csv('amazon-product.csv', delimiter = ';')
df_original.head(5)

Unnamed: 0,id,title,description,url,category_id,created_at
0,1,Polarized Sunglasses for Men and Women Semi-R...,Plastic frame anti-reflective lens Polarized ...,https://www.amazon.ca/Polarized-Sunglasses-Fin...,44,2021-08-20 20:59:28
1,2,SORVINO Vintage Sunglasses Retro Cateye Sungl...,HIGH QUALITY MATERIAL:This gradient cat eye s...,https://www.amazon.ca/SORVINO-Vintage-Sunglass...,44,2021-08-20 20:59:31
2,3,Under Armour Womens Launch Run Visor,Under Armour's mission is to make all athlete...,https://www.amazon.ca/Under-Armour-Womens-Laun...,44,2021-08-20 20:59:33
3,4,GRFISIA Square Oversized Sunglasses for Women...,Plastic frame anti-reflective lens Non-Polari...,https://www.amazon.ca/GRFISIA-Square-Oversized...,44,2021-08-20 20:59:36
4,5,FEISEDY Retro Oversized Square Polarized Sung...,FEISEDY RETRO Design--Square Oversized Frame ...,https://www.amazon.ca/FEISEDY-Oversized-Polari...,44,2021-08-20 20:59:38


In [4]:
## Seems like we can drop id, url and created_at column from the original data
df = df_original[['title', 'description', 'category_id']]
df.head(5)

Unnamed: 0,title,description,category_id
0,Polarized Sunglasses for Men and Women Semi-R...,Plastic frame anti-reflective lens Polarized ...,44
1,SORVINO Vintage Sunglasses Retro Cateye Sungl...,HIGH QUALITY MATERIAL:This gradient cat eye s...,44
2,Under Armour Womens Launch Run Visor,Under Armour's mission is to make all athlete...,44
3,GRFISIA Square Oversized Sunglasses for Women...,Plastic frame anti-reflective lens Non-Polari...,44
4,FEISEDY Retro Oversized Square Polarized Sung...,FEISEDY RETRO Design--Square Oversized Frame ...,44


### Text summarization to keep just the top 2 sentences in the description

In [5]:
def text_summarization(ind):
    if len(df.loc[ind,'title'].strip()) > 0: #Title should be present to perform text summarization
        sentences = sent_tokenize(df.loc[ind,'description'])
        if len(sentences) > 0: #Text summarization needs description
            # append title to description
            sentences.append(df.loc[ind,'title'])

            # vectorize sentences based on tf-idf
            vectorizer = TfidfVectorizer()
            try:
                X = vectorizer.fit_transform(sentences)
            except ValueError as ve:
                print('ValueError: {0} at index {1}'.format(ve, ind))
                return(" ")

            # measure similarity between title and each sentence
            if X.shape[0] == 2:
                similarity = cosine_similarity(X[1], X[0]) 
            else:
                similarity = cosine_similarity(X[-1], X[:-1])

            # find top 2 sentences with greatest similarity
            sorted_index = np.argsort(similarity)[0][::-1]
            top_sentences = []
            for i in range(0,min(2, len(sorted_index))):
                top_sentences.append(sentences[sorted_index[i]])
            return(" ".join(top_sentences))
   
        else: #Description not present
#             print('Description not present at index: {}'.format(ind))
            return(" ")
    else: #Title not present
#         print('Title not present at index: {}'.format(ind))
        return(df.loc[ind,'description'])

In [6]:
important_sentences = [text_summarization(i) for i in range(len(df))]

ValueError: empty vocabulary; perhaps the documents only contain stop words at index 199897


### Data Cleaning

In [7]:
# Defining a function to clean the data

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def clean_data(new):

    table = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
    stripped = new.translate(table)
    tokens = stripped.lower().split()
    
    words = [word for word in tokens if word.isalpha()]
    words = list(filter(lambda x: x not in comprehensive_stopwords, words))

#     nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = nlp(" ".join(words))
    lemmatized = " ".join([token.lemma_ for token in doc])
    
    return(lemmatized)

In [8]:
#Cleaning the description
clean_description = [clean_data(important_sentences[i]) for i in range(len(df))]
# clean_data(important_sentences[2])

In [9]:
df['clean_description'] = clean_description
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_description'] = clean_description


Unnamed: 0,title,description,category_id,clean_description
0,Polarized Sunglasses for Men and Women Semi-R...,Plastic frame anti-reflective lens Polarized ...,44,sunglass polarize lense resistant impact scrat...
1,SORVINO Vintage Sunglasses Retro Cateye Sungl...,HIGH QUALITY MATERIAL:This gradient cat eye s...,44,high quality material gradient cat eye sunglas...
2,Under Armour Womens Launch Run Visor,Under Armour's mission is to make all athlete...,44,mission athlete passion design relentless purs...
3,GRFISIA Square Oversized Sunglasses for Women...,Plastic frame anti-reflective lens Non-Polari...,44,sunglasse soft microfiber pouch soft glass clo...
4,FEISEDY Retro Oversized Square Polarized Sung...,FEISEDY RETRO Design--Square Oversized Frame ...,44,feisedy retro design square oversized frame de...


In [10]:
#Cleaning the title
clean_title = [clean_data(df.loc[i,'title']) for i in range(len(df))]
# clean_data(df.loc[1,'title'])

In [11]:
df['clean_title'] = clean_title
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = clean_title


Unnamed: 0,title,description,category_id,clean_description,clean_title
0,Polarized Sunglasses for Men and Women Semi-R...,Plastic frame anti-reflective lens Polarized ...,44,sunglass polarize lense resistant impact scrat...,polarize sunglass man woman semi rimless frame...
1,SORVINO Vintage Sunglasses Retro Cateye Sungl...,HIGH QUALITY MATERIAL:This gradient cat eye s...,44,high quality material gradient cat eye sunglas...,sorvino vintage sunglass retro cateye sunglass...
2,Under Armour Womens Launch Run Visor,Under Armour's mission is to make all athlete...,44,mission athlete passion design relentless purs...,armour women launch visor
3,GRFISIA Square Oversized Sunglasses for Women...,Plastic frame anti-reflective lens Non-Polari...,44,sunglasse soft microfiber pouch soft glass clo...,grfisia square oversized sunglass woman man fl...
4,FEISEDY Retro Oversized Square Polarized Sung...,FEISEDY RETRO Design--Square Oversized Frame ...,44,feisedy retro design square oversized frame de...,feisedy retro oversized square polarize sungla...


In [12]:
df.iloc[199897]

title                         0 
description           0 0 0 0 0 
category_id                  251
clean_description               
clean_title                     
Name: 199897, dtype: object

We can drop this record as it does not convey any meaningfull information

In [13]:
df_new = df.drop(199897, axis = 0)
df_new

Unnamed: 0,title,description,category_id,clean_description,clean_title
0,Polarized Sunglasses for Men and Women Semi-R...,Plastic frame anti-reflective lens Polarized ...,44,sunglass polarize lense resistant impact scrat...,polarize sunglass man woman semi rimless frame...
1,SORVINO Vintage Sunglasses Retro Cateye Sungl...,HIGH QUALITY MATERIAL:This gradient cat eye s...,44,high quality material gradient cat eye sunglas...,sorvino vintage sunglass retro cateye sunglass...
2,Under Armour Womens Launch Run Visor,Under Armour's mission is to make all athlete...,44,mission athlete passion design relentless purs...,armour women launch visor
3,GRFISIA Square Oversized Sunglasses for Women...,Plastic frame anti-reflective lens Non-Polari...,44,sunglasse soft microfiber pouch soft glass clo...,grfisia square oversized sunglass woman man fl...
4,FEISEDY Retro Oversized Square Polarized Sung...,FEISEDY RETRO Design--Square Oversized Frame ...,44,feisedy retro design square oversized frame de...,feisedy retro oversized square polarize sungla...
...,...,...,...,...,...
253923,AW BRIDAL Cotton Waffle Robe Couples Spa Robe...,Machine Wash 【MATERIAL】 High quality waffle b...,310,aw cotton couple robe set amp amp king amp que...,aw bridal cotton waffle robe couple spa robe k...
253924,Avidlove Women Lace Kimono Robe Babydoll Sexy...,Please going a size up if you wanted a looser...,310,size want looser long fit kimono small woman s...,avidlove woman lace kimono robe babydoll sexy ...
253925,"PAVILIA Womens Fleece Housecoat Zipper Robe, ...",Fleece Polyester Zipper closure ZIP UP FLEECE...,310,product description nbsp pavilia fleece house ...,pavilia women fleece housecoat zipper robe plu...
253926,Silky Satin Robes For Women Pure Lightweight ...,100% Silk Tie closure Machine Wash 100% Polye...,310,silk tie closure machine wash polyester tie cl...,silky satin robe woman pure lightweight short ...


In [14]:
#There is a chance that after data cleaning we don't have any title or description left (string of len = 0) so let's drop those records
df_new = df_new[['clean_title', 'clean_description', 'category_id']]
df_new = df_new[df_new['clean_title'].apply(lambda x: len(x) > 0)]
df_new = df_new[df_new['clean_description'].apply(lambda x: len(x) > 0)]
df_new

Unnamed: 0,clean_title,clean_description,category_id
0,polarize sunglass man woman semi rimless frame...,sunglass polarize lense resistant impact scrat...,44
1,sorvino vintage sunglass retro cateye sunglass...,high quality material gradient cat eye sunglas...,44
2,armour women launch visor,mission athlete passion design relentless purs...,44
3,grfisia square oversized sunglass woman man fl...,sunglasse soft microfiber pouch soft glass clo...,44
4,feisedy retro oversized square polarize sungla...,feisedy retro design square oversized frame de...,44
...,...,...,...
253923,aw bridal cotton waffle robe couple spa robe k...,aw cotton couple robe set amp amp king amp que...,310
253924,avidlove woman lace kimono robe babydoll sexy ...,size want looser long fit kimono small woman s...,310
253925,pavilia women fleece housecoat zipper robe plu...,product description nbsp pavilia fleece house ...,310
253926,silky satin robe woman pure lightweight short ...,silk tie closure machine wash polyester tie cl...,310


Doing this dropped 990 records

In [15]:
#Saving the data and using this for feature engineering and modeling in the next python file
df_new.to_csv('Clean_product_data.csv')

### Feature Engineering

Nouns should provide better information than keeping the entire sentences, performing keyword extraction and keeping just the nouns

In [16]:
nlp = spacy.load('en_core_web_sm')
df_new1 = pd.DataFrame()
df_new1['title_keywords'] = df_new['clean_title'].apply(lambda x: " ".join([ent.text for ent in nlp(x) if ent.pos_ == 'NOUN']))
df_new1['description_keywords'] = df_new['clean_description'].apply(lambda x: " ".join([ent.text for ent in nlp(x) if ent.pos_ == 'NOUN']))
df_new1['category_id'] = df_new['category_id']
df_new1

Unnamed: 0,title_keywords,description_keywords,category_id
0,sunglass man woman frame drive sun glass uv block,impact scratch day time walk park drive fishin...,44
1,vintage sunglass cateye sunglass woman man fra...,quality material gradient cat eye sunglass pc ...,44
2,armour women visor,mission athlete passion design pursuit innovat...,44
3,sunglass woman man fashion shade,microfiber pouch glass cloth grfisia nbsp grfi...,44
4,feisedy sunglass woman style,feisedy retro design frame design minimalist s...,44
...,...,...,...
253923,cotton waffle robe couple spa robe bathrobe dr...,cotton couple robe amp king amp hubby amp embr...,310
253924,woman lace kimono robe lingerie mesh chemise n...,size fit woman kimono lingerie swimsuit polyes...,310
253925,pavilia women zipper robe zip front lounger,product description nbsp pavilia fleece house ...,310
253926,satin robe woman robe wedding gift,silk tie closure machine wash polyester tie cl...,310


In [17]:
#Saving the data and using this for feature engineering and modeling in the next python file
df_new1.to_csv('Keywords_product_data.csv')