In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
df = pd.read_csv('../data/cleaned_review_data.csv', 
                 parse_dates=['SubmissionTime', 'FirstSubmissionTime', 'LastSubmissionTime'], 
                 low_memory=False)
df.sample(5)

Unnamed: 0,pd_id,brand,Name,Description,AverageOverallRating,love_count,reviews_count,Price,category_1,category_2,category_3,FirstSubmissionTime,LastSubmissionTime,AuthorId,Rating,Title,ReviewText,Helpfulness,SubmissionTime,IsRecommended,eyeColor,hairColor,skinTone,skinType
532103,P403497,juicy-couture,Viva La Juicy Rosé,"Celebrate with Viva la Juicy Rosé, a light-hea...",4.6984,6726.0,63.0,79.0,Fragrance,,,2016-01-06 07:25:45,2020-06-24 23:01:10,12123183612,5,Amazing Smell,This smells SO good! It’s not too strong and n...,0.0,2018-02-14 18:29:09,True,blue,blonde,fair,dry
275806,P429425,cover-fx,Power Play Foundation G+40,What it is: A cult-favorite liquid foundation ...,4.0964,59518.0,1546.0,44.0,Makeup,Face,Foundation,2018-03-04 18:29:34,2020-06-23 00:46:08,5402073862,3,No Title,"Good transfer resistant, light to medium cover...",0.941176,2018-06-01 20:15:37,True,hazel,brunette,fair,combination
791761,P420951,marc-jacobs-beauty,Eye-Conic Multi-Finish Eyeshadow Palette,What it is: A long-wearing eyeshadow palette t...,4.6736,79337.0,2368.0,49.0,Makeup,Eye,Eye Palettes,2017-06-30 17:34:12,2020-06-29 23:17:19,5591292605,5,#TheEyeDresser,#TheEyeDresser \nLoved these colors they are t...,0.0,2017-08-27 03:34:45,True,,,,
1038573,P409800,sephora-collection,Cleansing & Exfoliating Wipes,What it is: A collection of cleansing and exfo...,4.3998,248103.0,3508.0,8.0,Skincare,Cleansers,Face Wipes,2016-07-09 00:10:16,2020-07-07 15:25:28,5982255408,5,No Title,This is the BEST Sephora cleansing wipe. It’s...,0.0,2018-04-09 02:46:17,True,hazel,brunette,medium,combination
882738,P302923,nars,Pure Radiant Tinted Moisturizer Broad Spectrum...,"What it is: A lightweight, natural-looking tin...",4.384,123749.0,2940.0,45.0,Makeup,Face,Tinted Moisturizer,2012-02-29 18:18:07,2020-07-07 20:41:28,1133197798,5,Magical,It better be magic for this price and it is. G...,0.0,2014-12-08 03:58:02,False,brown,,olive,combination


In [3]:
df.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1364686 entries, 0 to 1364685
Data columns (total 24 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   pd_id                 1364686 non-null  object        
 1   brand                 1364686 non-null  object        
 2   Name                  1364686 non-null  object        
 3   Description           1364686 non-null  object        
 4   AverageOverallRating  1364686 non-null  float64       
 5   love_count            1364686 non-null  float64       
 6   reviews_count         1364686 non-null  float64       
 7   Price                 1364686 non-null  float64       
 8   category_1            1364686 non-null  object        
 9   category_2            1006831 non-null  object        
 10  category_3            957356 non-null   object        
 11  FirstSubmissionTime   1364686 non-null  datetime64[ns]
 12  LastSubmissionTime    1364686 non-null  da

In [4]:
n_pd = df['pd_id'].nunique()
n_user = df['AuthorId'].nunique()
print(f'Number of Distinct Products {n_pd}')
print(f'Number of Distinct Users {n_user}')
print(f'If all users have 1 review for each product, Total reviews: {n_pd*n_user}')
print(f'Actually, the number of reviews: {len(df)}')

Number of Distinct Products 2301
Number of Distinct Users 660434
If all users have 1 review for each product, Total reviews: 1519658634
Actually, the number of reviews: 1364686


# TFIDF for Product Description

In [5]:
import nltk
from nltk.stem.porter import *
from sklearn.feature_extraction import stop_words
from collections import Counter
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import re



In [6]:
def tokenize(text):
    """
    Tokenize text and return a non-unique list of tokenized words
    found in the text. Normalize to lowercase, strip punctuation,
    remove stop words, drop words of length < 3, strip digits.
    """
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    nopunct = regex.sub(" ", text)  # delete stuff but leave at least a space to avoid clumping together
    words = nltk.tokenize.word_tokenize(nopunct)
    words = [w.lower() for w in words] # Normalize to lowercase
    words = [w for w in words if len(w) > 2]  # drop words of length < 3
    words = [w for w in words if w not in stop_words.ENGLISH_STOP_WORDS] # remove stop words

    return words

In [7]:
def tokenizer(text):
    """
    Given a list of tokens/words, return a new list with each word
    stemmed using a PorterStemmer.
    """
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in tokenize(text)]
    return stemmed

In [8]:
def compute_tfidf(corpus):
    """
    Create and return a TfidfVectorizer object after training it on
    the list of articles pulled from the corpus dictionary. Meaning,
    call fit() on the list of document strings, which figures out
    all the inverse document frequencies (IDF) for use later by
    the transform() function. The corpus argument is a dictionary
    mapping file name to xml text.
    """

    tfidf = TfidfVectorizer(input = 'content',
                            analyzer = 'word',
                            tokenizer = tokenizer,
                            stop_words ='english',
                            decode_error = 'ignore')
    tfidf = tfidf.fit(corpus)
    return tfidf

In [9]:
def summarize(tfidf, text, n):
    """
    Given a trained TfidfVectorizer object and some XML text, return
    up to n (word,score) pairs in a list. Discard any terms with
    scores < 0.09.
    """
    X = tfidf.transform([text])
    names = tfidf.get_feature_names()
    word_index = X.nonzero()[1]
    scores = [(names[i],X[0,i]) for i in word_index if X[0,i] >= 0.09]
    sorted_bags = sorted(scores, reverse = True, key=lambda x: x[1])

    return sorted_bags[:n]

In [10]:
dscrp_df = df[['pd_id', 'Description']].drop_duplicates().reset_index(drop=True)
trainning_corpus = dscrp_df['Description']
trained = compute_tfidf(trainning_corpus)



In [11]:
# Extract top 5 keywords for each product descriptions
kw_list = []
for i in range(len(dscrp_df)):
    top_5_kw = summarize(tfidf=trained, text=dscrp_df['Description'][i], n=5)
    top_5_kw_dict = {f'kw_{k+1}':kw[0] for k, kw in enumerate(top_5_kw)}
    kw_list.append(top_5_kw_dict)
kw_df = pd.DataFrame(kw_list)
product_df = pd.concat([dscrp_df, kw_df], axis=1)
product_df

Unnamed: 0,pd_id,Description,kw_1,kw_2,kw_3,kw_4,kw_5
0,P307801,Fragrance Family: Floral Scent Type: Fruity Fl...,breathtak,fig,cedarwood,figfragr,amalfi
1,P307804,Fragrance Family: Earthy & Woody Scent Type: F...,junip,island,lemon,sicili,panarea
2,P460441,What it is:A shimmering oil that dresses the s...,tangerineabout,rosa,nobil,oil,centifolia
3,P444119,Fragrance Family: Floral Scent Type: Classic F...,peoni,note,raspberri,scent,peonia
4,P163604,Fragrance Family: Fresh Scent Type: Fresh Citr...,fresh,citru,fruit,verbena,bulgarian
...,...,...,...,...,...,...,...
2296,P428706,Fragrance Family: Fresh Scent Type: Fresh Flor...,perfum,neroli,blossom,orang,essenti
2297,P428702,Fragrance Family: FreshScent Type: Fresh Citru...,sweeti,israel,iran,grapefruit,region
2298,P428709,Fragrance Family: Fresh Scent Type: Fresh Flor...,patchouli,trade,fair,geranium,rose
2299,P452918,"Beauty Benefit: Immunity Support, Gut Support,...",cup,gummi,green,dawn,vitamin


In [12]:
# In general the most common keywords for Sephora's products
current_list = []
for i in range(1,6): 
    new_list = list(product_df[f'kw_{i}'].value_counts()[:20].index)
    current_list += new_list
most_keyword = list(set(current_list))
print(most_keyword)

['lip', 'lash', 'pink', 'spray', 'sandalwood', 'clean', 'spf', 'lancer', 'skin', 'eye', 'tan', 'agre', 'floral', 'makeup', 'moistur', 'report', 'cleans', 'palett', 'mask', 'acid', 'candl', 'brighten', 'color', 'scalp', 'protect', 'blush', 'balm', 'matt', 'serum', 'pore', 'vitamin', 'note', 'mascara', 'wrinkl', 'blemish', 'hair', 'set', 'spot', 'oil', 'volum', 'underton', 'cbd', 'shampoo', 'brush', 'shine', 'rose', 'bodi', 'repair', 'coverag', 'brow', 'coconut', 'powder', 'said', 'mist', 'dark', 'polish', 'cleanser', 'look']
