In [1]:
import pandas as pd
import re

In [2]:
raw_data = pd.read_csv('../scrape_notebook/Sephora_product_info.csv')
raw_data.sample(10)

Unnamed: 0,Product_Id,product_name,item_num,brand,category,price,size,love_count,rating,reviews_count,link
930,P441306,Double Wear Stay-in-Place Matte Powder Foundation,2179604,Estée Lauder,Makeup/Face/Foundation,43.0,0.42 oz/ 12 g,7951,4.0,29,https://www.sephora.com/product/double-wear-st...
1455,P399712,Lotus Caress to Moisturize Eyes,1758457,KENZOKI,Skincare/Eye Care/Eye Creams & Treatments,35.0,0.5 oz/ 15 mL,634,3.5,8,https://www.sephora.com/product/lotus-caress-t...
2881,P263504,All Nighter Long-Lasting Makeup Setting Spray,1900000,Urban Decay,Makeup/Face/Setting Spray & Powder,15.0,Standard Size - 4 oz/ 118 mL,493821,4.4159,9599,https://www.sephora.com/product/all-nighter-lo...
1518,P395383,Dry Shampoo with Oat Milk,1649094,Klorane,Hair/Shampoo & Conditioner/Dry Shampoo,10.0,3.2 oz,26830,4.4448,553,https://www.sephora.com/product/dry-shampoo-wi...
2077,P439425,Kinetin+ Hydrating Eye Cream,2155497,Obagi Clinical,Skincare/Eye Care/Eye Creams & Treatments,65.0,0.5 oz/ 14 g,3306,4.4261,115,https://www.sephora.com/product/kinetin-hydrat...
236,P427624,Anti-Fatigue Under Eye Primer,2037570,BECCA,Makeup/Eye/Eye Primer,32.0,0.13 oz/ 3.7 g,29456,4.004,749,https://www.sephora.com/product/anti-fatigue-u...
2344,P398118,Moringa Tree Conditioning Ghee,2288967,Qhemet Biologics,Hair/Shampoo & Conditioner/Leave-In Conditioner,22.0,8 oz/ 236 mL,3297,4.4,50,https://www.sephora.com/product/moringa-tree-c...
1152,P442731,Abeille Royale Double R Serum,2202166,Guerlain,Skincare/Treatments/Face Serums,215.0,1.6 oz/ 50 mL,2975,4.3333,3,https://www.sephora.com/product/abeille-royale...
1211,P415922,Hair Sweet Hair Growth - Vegan Gummies,1889336,HUM Nutrition,Skincare/Wellness/Beauty Supplements,25.0,60 Vegan berry gummy hearts,26603,3.9357,249,https://www.sephora.com/product/hair-sweet-hai...
1241,P411850,Tinted Lip Conditioner,2197697,ILIA,Makeup/Lip/Lipstick,28.0,0.14 oz/ 4 g,27013,4.0818,159,https://www.sephora.com/product/tinted-lip-con...


In [3]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3083 entries, 0 to 3082
Data columns (total 11 columns):
Product_Id       3083 non-null object
product_name     3083 non-null object
item_num         3083 non-null int64
brand            3083 non-null object
category         3083 non-null object
price            3083 non-null object
size             3083 non-null object
love_count       3083 non-null object
rating           3083 non-null object
reviews_count    3083 non-null object
link             3083 non-null object
dtypes: int64(1), object(10)
memory usage: 265.1+ KB


From the sample dataframe and dataframe info, we can recognize that `Product_Id`, `item_num` and `link` may be not useful for analysis. Therefore, we are going to drop these three columns. Besides, `category` should be splited into three columns. `size` could have na value or other invalid value. Other columns like `love_count`, `rating`, `reviews_count` should turn to numeric variables.

# Let's start cleaning data!

## drop columns

In [4]:
raw_data_c = raw_data.drop(columns=['Product_Id', 'item_num', 'link'])
raw_data_c.sample(5)

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count
7,Le Nobili Collection Set,Acqua Di Parma,Fragrance/Value & Gift Sets/Perfume Gift Sets,68.0,3 x 0.51 oz/ 15 mL,1366,4.1818,11
91,Brow Definer,Anastasia Beverly Hills,Makeup/Eye/Eyebrow,23.0,0.007 oz/ 0.2 g,176466,4.0086,3383
1010,Iris Goddess Eau De Parfum Travel Spray,Floral Street,Fragrance/Women/Rollerballs & Travel Size,26.0,0.34 oz/ 10 mL,220,5.0,1
377,Thickening Spray,Bumble and bumble,Hair/Hair Styling & Treatments/Hair Styling Pr...,11.0,8 oz/ 250 mL,25447,4.1169,847
1594,Shea Butter Intensive Hand Balm,L'Occitane,Bath & Body/Body Moisturizers/Hand Cream & Foo...,36.0,5.3 oz/ 150 mL,1057,5.0,3


## split category into three columns

In [5]:
category_lst_series = raw_data_c['category'].str.split('/')
raw_data_c['category_1'] = category_lst_series.str[0]
raw_data_c['category_2'] = category_lst_series.str[1]
raw_data_c['category_3'] = category_lst_series.str[-1]
raw_data_c.sample(5)

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3
1399,Lady Vengeance,Juliette Has a Gun,Fragrance/Women/Perfume,28.0,3.3 oz/ 100 mL,5008,4.4306,72,Fragrance,Women,Perfume
499,Chloé Eau de Parfum Gift Set,Chloé,Fragrance/Value & Gift Sets/Perfume Gift Sets,115.0,na,782,na,na,Fragrance,Value & Gift Sets,Perfume Gift Sets
1111,Turmeric Tonic Sachets for Skin Glow + Debloat,Golde,Skincare/Value & Gift Sets,22.0,12 Sticks,669,4.75,4,Skincare,Value & Gift Sets,Value & Gift Sets
955,Sweet Greens Limited Edition Holiday Set,Farmacy,Skincare/Value & Gift Sets,44.0,na,13355,4.4118,17,Skincare,Value & Gift Sets,Value & Gift Sets
2935,Flowerbomb Flowerflake Set,Viktor&Rolf,Fragrance/Value & Gift Sets/Perfume Gift Sets,120.0,3 x 1.7 oz/ 50 mL,515,5,1,Fragrance,Value & Gift Sets,Perfume Gift Sets


## turn price, love_count, reviews_count, rating into numeric variables.

In [6]:
# Check the situation when love_count is 'na'.
raw_data_c.loc[raw_data_c.love_count=='na']

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3
2493,Holiday Gift Card,SEPHORA COLLECTION,Gifts,10.0,$10,na,na,na,Gifts,,Gifts


As gift card is not useful for analysis, we can drop this row for data cleaning.

In [7]:
# Check the situation when reviews_count is 'na'.
raw_data_c.loc[raw_data_c.reviews_count=='na']

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3
10,Blu Mediterraneo Arancia di Capri Gift Set,Acqua Di Parma,Fragrance/Value & Gift Sets/Perfume Gift Sets,113.0,na,188,na,na,Fragrance,Value & Gift Sets,Perfume Gift Sets
36,Complete Eye Renewal Balm Duo,Algenist,Skincare/Value & Gift Sets,68.0,na,545,na,na,Skincare,Value & Gift Sets,Value & Gift Sets
201,Gut Primer™ Inner Beauty Support™,The Beauty Chef,Skincare,$69.00,7.05 oz/ 200g,445,na,na,Skincare,,Skincare
371,"Reign, Dear: Texture Set",Bumble and bumble,Hair/Value & Gift Sets,15.0,na,2362,na,na,Hair,Value & Gift Sets,Value & Gift Sets
426,Lake Travel Spray,By Rosie Jane,Fragrance/Women/Perfume,25.0,0.25 oz/ 7.5 mL,198,na,na,Fragrance,Women,Perfume
...,...,...,...,...,...,...,...,...,...,...,...
2946,Best Life 2 Palette,Violet Voss,Makeup/Eye/Eye Palettes,49.0,1.21 oz/ 34.3 g,737,na,na,Makeup,Eye,Eye Palettes
2994,Sleep Duo Essential Oil Set,Vitruvi,Skincare/Value & Gift Sets,50.0,na,230,na,na,Skincare,Value & Gift Sets,Value & Gift Sets
3012,Copper Clove Boxed Scalloped Candlepot,VOLUSPA,Fragrance/Candles & Home Scents,18.0,6.2oz/ 176g,560,na,na,Fragrance,Candles & Home Scents,Candles & Home Scents
3065,Mon Paris Holiday Set,Yves Saint Laurent,Fragrance/Value & Gift Sets/Perfume Gift Sets,97.0,na,1442,na,na,Fragrance,Value & Gift Sets,Perfume Gift Sets


After checking these product page on Sephora, we can know the reason why we cannot scrape the information is that the review counts is actually 0. So we are going to fill na with 0 for review_count

In [8]:
# Check the situation when reviews_count is 'na'.
raw_data_c.loc[raw_data_c.rating=='na']

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3
10,Blu Mediterraneo Arancia di Capri Gift Set,Acqua Di Parma,Fragrance/Value & Gift Sets/Perfume Gift Sets,113.0,na,188,na,na,Fragrance,Value & Gift Sets,Perfume Gift Sets
36,Complete Eye Renewal Balm Duo,Algenist,Skincare/Value & Gift Sets,68.0,na,545,na,na,Skincare,Value & Gift Sets,Value & Gift Sets
201,Gut Primer™ Inner Beauty Support™,The Beauty Chef,Skincare,$69.00,7.05 oz/ 200g,445,na,na,Skincare,,Skincare
371,"Reign, Dear: Texture Set",Bumble and bumble,Hair/Value & Gift Sets,15.0,na,2362,na,na,Hair,Value & Gift Sets,Value & Gift Sets
426,Lake Travel Spray,By Rosie Jane,Fragrance/Women/Perfume,25.0,0.25 oz/ 7.5 mL,198,na,na,Fragrance,Women,Perfume
...,...,...,...,...,...,...,...,...,...,...,...
2946,Best Life 2 Palette,Violet Voss,Makeup/Eye/Eye Palettes,49.0,1.21 oz/ 34.3 g,737,na,na,Makeup,Eye,Eye Palettes
2994,Sleep Duo Essential Oil Set,Vitruvi,Skincare/Value & Gift Sets,50.0,na,230,na,na,Skincare,Value & Gift Sets,Value & Gift Sets
3012,Copper Clove Boxed Scalloped Candlepot,VOLUSPA,Fragrance/Candles & Home Scents,18.0,6.2oz/ 176g,560,na,na,Fragrance,Candles & Home Scents,Candles & Home Scents
3065,Mon Paris Holiday Set,Yves Saint Laurent,Fragrance/Value & Gift Sets/Perfume Gift Sets,97.0,na,1442,na,na,Fragrance,Value & Gift Sets,Perfume Gift Sets


The situation when rating is na is just the same as that of reviews_count

From the dataframe above, we can also observe that price may contain dollar sign as well. So we need to strip that.

In [9]:
# turn price, love_count, reviews_count, rating into numeric variables.
df_process_1 = raw_data_c.loc[raw_data_c.love_count != 'na']
df_process_1.price = df_process_1.price.str.strip('$')
df_process_1.loc[raw_data_c.reviews_count == 'na', ['reviews_count','rating']] = 0
df_process_2 = df_process_1.copy()
df_process_2[['love_count', 'reviews_count']] = df_process_1.loc[df_process_1.love_count != 'na', [
    'love_count', 'reviews_count']].astype('int64')
df_process_2[['rating', 'price']] = df_process_1.loc[df_process_1.love_count != 'na', [
    'rating', 'price']].astype('float')
df_process_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3082 entries, 0 to 3082
Data columns (total 11 columns):
product_name     3082 non-null object
brand            3082 non-null object
category         3082 non-null object
price            3082 non-null float64
size             3082 non-null object
love_count       3082 non-null int64
rating           3082 non-null float64
reviews_count    3082 non-null int64
category_1       3082 non-null object
category_2       3079 non-null object
category_3       3082 non-null object
dtypes: float64(2), int64(2), object(7)
memory usage: 288.9+ KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


# Size

As there are different measure for different products. Here, we mainly focus on products that can be measured by 'ml', 'oz', and 'g'.

In [10]:
df_process_2

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3
0,Blu Mediterraneo MINIATURE Set,Acqua Di Parma,Fragrance/Value & Gift Sets/Perfume Gift Sets,63.0,5 x 0.16oz/5mL,2603,4.0000,4,Fragrance,Value & Gift Sets,Perfume Gift Sets
1,Colonia,Acqua Di Parma,Fragrance/Men/Cologne,63.0,0.7 oz/ 20 mL,2537,4.3067,75,Fragrance,Men,Cologne
2,Rosa Nobile,Acqua Di Parma,Fragrance/Women/Perfume,68.0,3.4 oz/ 101 mL,4582,4.5652,69,Fragrance,Women,Perfume
3,Fico di Amalfi,Acqua Di Parma,Fragrance/Women/Perfume,64.0,5 oz/ 148 mL,2472,4.5714,70,Fragrance,Women,Perfume
4,Mirto di Panarea,Acqua Di Parma,Fragrance/Women/Perfume,64.0,2.5 oz/ 74 mL,2624,4.6364,22,Fragrance,Women,Perfume
...,...,...,...,...,...,...,...,...,...,...,...
3078,Orange Blossom Eau de Parfum Rollerball,The 7 Virtues,Fragrance/Women/Rollerballs & Travel Size,29.0,0.33 oz/ 11 mL,2009,4.2083,24,Fragrance,Women,Rollerballs & Travel Size
3079,Vetiver Elemi Eau de Parfum Rollerball,The 7 Virtues,Fragrance/Women/Rollerballs & Travel Size,29.0,0.33 oz/ 11 mL,1230,4.6000,15,Fragrance,Women,Rollerballs & Travel Size
3080,Patchouli Citrus Eau de Parfum Rollerball,The 7 Virtues,Fragrance/Women/Rollerballs & Travel Size,29.0,0.33 oz/ 11 mL,1336,4.3077,26,Fragrance,Women,Rollerballs & Travel Size
3081,8Greens Gummies Dietary Supplement,8Greens,Skincare/Wellness/Beauty Supplements,45.0,60 Gummies,1668,4.3333,9,Skincare,Wellness,Beauty Supplements


Use regex to find measures with oz or mL or g and create responding columns

In [11]:
def find_measure(measure, text):
    regex = '(\d?\.?\d+)' + measure   
    result = re.findall(regex, text)
    if len(result)!=0:
        return result[0]
    else:
        return None

In [12]:
measures = [' oz', ' mL', ' g']
for measure in measures:
    measure_ = measure.split(' ')[-1]
    name = 'size_' + measure_
    df_process_2[name] = df_process_2.loc[(df_process_2['size'].str.contains(measure_)) &
                                          (~df_process_2['size'].str.contains('x')), 'size'].apply(lambda x: find_measure(measure, x))
df_process_2.sample(5)

Unnamed: 0,product_name,brand,category,price,size,love_count,rating,reviews_count,category_1,category_2,category_3,size_oz,size_mL,size_g
1032,Multitask. Leave-In Lotion,FORM,Hair/Shampoo & Conditioner/Leave-In Conditioner,14.0,12 oz/ 355 mL,7631,4.3393,168,Hair,Shampoo & Conditioner,Leave-In Conditioner,12.0,355.0,
2557,Pitera™ Essence Set,SK-II,Skincare/Value & Gift Sets,99.0,na,37152,4.1907,194,Skincare,Value & Gift Sets,Value & Gift Sets,,,
2580,Perfume Palette,SKYLAR,Fragrance/Value & Gift Sets/Perfume Gift Sets,20.0,na,2830,3.6667,45,Fragrance,Value & Gift Sets,Perfume Gift Sets,,,
1718,High CBD Formula Body Lotion,Lord Jones,Bath & Body/Body Moisturizers/Body Lotions & B...,60.0,1.69 oz/ 50 mL,13430,4.1083,157,Bath & Body,Body Moisturizers,Body Lotions & Body Oils,1.69,50.0,
1828,7 Days of Flavor Set,Marvis,Skincare/High Tech Tools/Teeth Whitening,36.0,na,7817,4.52,25,Skincare,High Tech Tools,Teeth Whitening,,,


In [13]:
df_process_2.to_csv('cleaned_data.csv', index=False)