In [81]:
#!pip install rake_nltk
#!pip install --upgrade pip

In [82]:
import pandas as pd
import re
import unicodedata
import numpy as np

from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer  # converts a collection of text documents into a matrix of token counts
from ast import literal_eval  # safely evaluates a string containing a Python literal (e.g. a list or dict)


In [83]:
# Read the cosmetics CSV (relative path) into raw_df and preview the first rows.
cosmetics_path = "./data/cosmetics.csv"
raw_df = pd.read_csv(cosmetics_path)
raw_df.head()

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1


In [84]:
# Summary statistics for the numeric columns (price, rank and the skin-type flags).
raw_df.describe()

Unnamed: 0,Price,Rank,Combination,Dry,Normal,Oily,Sensitive
count,1472.0,1472.0,1472.0,1472.0,1472.0,1472.0,1472.0
mean,55.584239,4.153261,0.65625,0.61413,0.652174,0.607337,0.513587
std,45.014429,0.633918,0.47512,0.486965,0.476442,0.488509,0.499985
min,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,4.0,0.0,0.0,0.0,0.0,0.0
50%,42.5,4.3,1.0,1.0,1.0,1.0,1.0
75%,68.0,4.5,1.0,1.0,1.0,1.0,1.0
max,370.0,5.0,1.0,1.0,1.0,1.0,1.0


In [85]:
# Count missing values per column.
raw_df.isnull().sum()

Label          0
Brand          0
Name           0
Price          0
Rank           0
Ingredients    0
Combination    0
Dry            0
Normal         0
Oily           0
Sensitive      0
dtype: int64

In [86]:
# List the column names of the raw frame.
raw_df.columns

Index(['Label', 'Brand', 'Name', 'Price', 'Rank', 'Ingredients', 'Combination',
       'Dry', 'Normal', 'Oily', 'Sensitive'],
      dtype='object')

In [87]:
# Work on a copy so the derived columns added to with_types are not written
# into raw_df (plain assignment would only alias the same DataFrame object).
with_types = raw_df.copy()
def add_skin_types_names(row):
    """Build a space-separated skin-type label from a row's 0/1 flag columns.

    Parameters
    ----------
    row : mapping
        Anything indexable by the keys "Combination", "Dry", "Normal",
        "Oily" and "Sensitive" (e.g. a DataFrame row).

    Returns
    -------
    str
        Lower-cased names of the truthy flags, in column order. A row with
        no flag set is labelled with all five types. Whenever all five types
        end up in the label, the extra token "alltypes" is appended.
    """
    flag_columns = ("Combination", "Dry", "Normal", "Oily", "Sensitive")

    labels = [column.lower() for column in flag_columns if row[column]]

    # A row without any flag is treated as suitable for every skin type.
    if not labels:
        labels = [column.lower() for column in flag_columns]

    # Either every flag was set, or none was and we filled them all in above.
    if len(labels) == len(flag_columns):
        labels.append("alltypes")

    return " ".join(labels)

# Build the skin-type label for every row (axis=1 passes one row at a time)
# and preview the result.
with_types["Skin Types"] = with_types.apply(add_skin_types_names, axis=1)
with_types.head()


Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Skin Types
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,combination dry normal oily sensitive alltypes
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,combination dry normal oily sensitive alltypes
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,combination dry normal oily
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,combination dry normal oily sensitive alltypes
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,combination dry normal oily sensitive alltypes


In [88]:
# Inspect the raw ingredient strings.
with_types["Ingredients"]

0       Algae (Seaweed) Extract, Mineral Oil, Petrolat...
1       Galactomyces Ferment Filtrate (Pitera), Butyle...
2       Water, Dicaprylyl Carbonate, Glycerin, Ceteary...
3       Algae (Seaweed) Extract, Cyclopentasiloxane, P...
4       Water, Snail Secretion Filtrate, Phenyl Trimet...
                              ...                        
1467    Water, Alcohol Denat., Potassium Cetyl Phospha...
1468    Water, Isododecane, Dimethicone, Butyloctyl Sa...
1469    Water, Dihydroxyacetone, Glycerin, Sclerocarya...
1470    Water, Dihydroxyacetone, Propylene Glycol, PPG...
1471                        Visit the DERMAFLASH boutique
Name: Ingredients, Length: 1472, dtype: object

In [89]:
def clean_ingredients(text):
    """Normalise a raw ingredient listing into a lower-case string.

    Parameters
    ----------
    text : str
        Raw ingredient text for one product.

    Returns
    -------
    str
        The cleaned listing, or "No info" when the value is not a string
        or the cleaned text starts with "visit" or "#name?".
    """
    # Missing values (e.g. NaN) have no .lower(); report them as "No info"
    # instead of raising AttributeError.
    if not isinstance(text, str):
        return "No info"

    text = text.lower()
    text = text.replace("\n", " ")

    # Keep only what follows an "ingredients:" header, when one is present.
    if "ingredients:" in text:
        text = text.split("ingredients:")[1].strip()

    # Remove a ": ... ." fragment when no comma appears after the colon.
    text = re.sub(r':((?!.*?,).*?)\.', '', text)

    # Turn " -" separators into commas.
    text = text.replace(" -", ", ")
    text = text.replace(" =;", ",")

    # Strip only the leading dash(es). The previous replace("-", "") removed
    # every hyphen in the string, corrupting names such as "peg-100".
    text = text.lstrip("-")

    for symbol in "()*":
        text = text.replace(symbol, "")

    if text.startswith(("visit", "#name?")):
        return "No info"
    return text

# Clean every ingredient string and preview the first 190 cleaned rows.
with_types["Clean Ingredients"] = with_types["Ingredients"].apply(clean_ingredients)
with_types["Clean Ingredients"].head(190)


0      algae seaweed extract, mineral oil, petrolatum...
1      galactomyces ferment filtrate pitera, butylene...
2      water, dicaprylyl carbonate, glycerin, ceteary...
3      algae seaweed extract, cyclopentasiloxane, pet...
4      water, snail secretion filtrate, phenyl trimet...
                             ...                        
185    water, isocetyl stearate, pentaerythrityl tetr...
186    water, cyclopentasiloxane, propanediol, dimeth...
187    water, isopropyl isostearate, butyrospermum pa...
188    octinoxate 7.5%, octisalate 2.0%, octocrylene ...
189    water, butylene glycol, caprylic/capric trigly...
Name: Clean Ingredients, Length: 190, dtype: object

In [90]:
def clean_names(text):
    """Return a lower-case, accent-free, alphanumeric version of a product name.

    The text is NFD-decomposed so accents become separate combining marks,
    those marks (Unicode category "Mn") are dropped, and every remaining
    character that is not an ASCII letter, a digit or whitespace is removed.
    """
    decomposed = unicodedata.normalize("NFD", text)

    base_characters = []
    for symbol in decomposed:
        if unicodedata.category(symbol) == "Mn":
            continue  # combining mark left over from an accented letter
        base_characters.append(symbol)
    unaccented = "".join(base_characters)

    alphanumeric = re.sub(r"[^a-zA-Z0-9\s]", "", unaccented)
    return alphanumeric.lower()

In [92]:
# Add a cleaned (lower-case, accent-free, alphanumeric) version of each product name,
# then display the frame.
with_types["Clean Name"] = with_types["Name"].apply(clean_names)
with_types

Unnamed: 0,Label,Brand,Name,Price,Rank,Ingredients,Combination,Dry,Normal,Oily,Sensitive,Skin Types,Clean Ingredients,Clean Name
0,Moisturizer,LA MER,Crème de la Mer,175,4.1,"Algae (Seaweed) Extract, Mineral Oil, Petrolat...",1,1,1,1,1,combination dry normal oily sensitive alltypes,"algae seaweed extract, mineral oil, petrolatum...",creme de la mer
1,Moisturizer,SK-II,Facial Treatment Essence,179,4.1,"Galactomyces Ferment Filtrate (Pitera), Butyle...",1,1,1,1,1,combination dry normal oily sensitive alltypes,"galactomyces ferment filtrate pitera, butylene...",facial treatment essence
2,Moisturizer,DRUNK ELEPHANT,Protini™ Polypeptide Cream,68,4.4,"Water, Dicaprylyl Carbonate, Glycerin, Ceteary...",1,1,1,1,0,combination dry normal oily,"water, dicaprylyl carbonate, glycerin, ceteary...",protini polypeptide cream
3,Moisturizer,LA MER,The Moisturizing Soft Cream,175,3.8,"Algae (Seaweed) Extract, Cyclopentasiloxane, P...",1,1,1,1,1,combination dry normal oily sensitive alltypes,"algae seaweed extract, cyclopentasiloxane, pet...",the moisturizing soft cream
4,Moisturizer,IT COSMETICS,Your Skin But Better™ CC+™ Cream with SPF 50+,38,4.1,"Water, Snail Secretion Filtrate, Phenyl Trimet...",1,1,1,1,1,combination dry normal oily sensitive alltypes,"water, snail secretion filtrate, phenyl trimet...",your skin but better cc cream with spf 50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1467,Sun protect,KORRES,Yoghurt Nourishing Fluid Veil Face Sunscreen B...,35,3.9,"Water, Alcohol Denat., Potassium Cetyl Phospha...",1,1,1,1,1,combination dry normal oily sensitive alltypes,"water, alcohol denat., potassium cetyl phospha...",yoghurt nourishing fluid veil face sunscreen b...
1468,Sun protect,KATE SOMERVILLE,Daily Deflector™ Waterlight Broad Spectrum SPF...,48,3.6,"Water, Isododecane, Dimethicone, Butyloctyl Sa...",0,0,0,0,0,combination dry normal oily sensitive alltypes,"water, isododecane, dimethicone, butyloctyl sa...",daily deflector waterlight broad spectrum spf ...
1469,Sun protect,VITA LIBERATA,Self Tan Dry Oil SPF 50,54,3.5,"Water, Dihydroxyacetone, Glycerin, Sclerocarya...",0,0,0,0,0,combination dry normal oily sensitive alltypes,"water, dihydroxyacetone, glycerin, sclerocarya...",self tan dry oil spf 50
1470,Sun protect,ST. TROPEZ TANNING ESSENTIALS,Pro Light Self Tan Bronzing Mist,20,1.0,"Water, Dihydroxyacetone, Propylene Glycol, PPG...",0,0,0,0,0,combination dry normal oily sensitive alltypes,"water, dihydroxyacetone, propylene glycol, ppg...",pro light self tan bronzing mist


In [94]:
# Check dtypes and non-null counts, including the derived columns.
with_types.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1472 entries, 0 to 1471
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Label              1472 non-null   object 
 1   Brand              1472 non-null   object 
 2   Name               1472 non-null   object 
 3   Price              1472 non-null   int64  
 4   Rank               1472 non-null   float64
 5   Ingredients        1472 non-null   object 
 6   Combination        1472 non-null   int64  
 7   Dry                1472 non-null   int64  
 8   Normal             1472 non-null   int64  
 9   Oily               1472 non-null   int64  
 10  Sensitive          1472 non-null   int64  
 11  Skin Types         1472 non-null   object 
 12  Clean Ingredients  1472 non-null   object 
 13  Clean Name         1472 non-null   object 
dtypes: float64(1), int64(6), object(7)
memory usage: 161.1+ KB


In [95]:
# Select the label, brand, cleaned name/ingredients, skin-type and rank columns into `dataframe`.
dataframe = with_types[["Label", "Brand",  "Clean Name", "Clean Ingredients", "Skin Types","Rank"]]
dataframe.head()

Unnamed: 0,Label,Brand,Clean Name,Clean Ingredients,Skin Types,Rank
0,Moisturizer,LA MER,creme de la mer,"algae seaweed extract, mineral oil, petrolatum...",combination dry normal oily sensitive alltypes,4.1
1,Moisturizer,SK-II,facial treatment essence,"galactomyces ferment filtrate pitera, butylene...",combination dry normal oily sensitive alltypes,4.1
2,Moisturizer,DRUNK ELEPHANT,protini polypeptide cream,"water, dicaprylyl carbonate, glycerin, ceteary...",combination dry normal oily,4.4
3,Moisturizer,LA MER,the moisturizing soft cream,"algae seaweed extract, cyclopentasiloxane, pet...",combination dry normal oily sensitive alltypes,3.8
4,Moisturizer,IT COSMETICS,your skin but better cc cream with spf 50,"water, snail secretion filtrate, phenyl trimet...",combination dry normal oily sensitive alltypes,4.1


In [102]:
# Rows whose skin-type label contains the query text.
# NOTE: str.contains interprets the pattern as a regular expression by default.
query = "oily"
dataframe[ dataframe["Skin Types"].str.contains(query) ].head()

Unnamed: 0,Label,Brand,Clean Name,Clean Ingredients,Skin Types,Rank
0,Moisturizer,LA MER,creme de la mer,"algae seaweed extract, mineral oil, petrolatum...",combination dry normal oily sensitive alltypes,4.1
1,Moisturizer,SK-II,facial treatment essence,"galactomyces ferment filtrate pitera, butylene...",combination dry normal oily sensitive alltypes,4.1
2,Moisturizer,DRUNK ELEPHANT,protini polypeptide cream,"water, dicaprylyl carbonate, glycerin, ceteary...",combination dry normal oily,4.4
3,Moisturizer,LA MER,the moisturizing soft cream,"algae seaweed extract, cyclopentasiloxane, pet...",combination dry normal oily sensitive alltypes,3.8
4,Moisturizer,IT COSMETICS,your skin but better cc cream with spf 50,"water, snail secretion filtrate, phenyl trimet...",combination dry normal oily sensitive alltypes,4.1


In [107]:
# Rows whose skin-type label, after trimming surrounding whitespace, equals the query exactly.
query = "combination oily"
dataframe[ dataframe["Skin Types"].str.strip() == query ].head()

Unnamed: 0,Label,Brand,Clean Name,Clean Ingredients,Skin Types,Rank
42,Moisturizer,CLINIQUE,dramatically different moisturizing gel,"water , dimethicone , isododecane , butylene g...",combination oily,4.5
150,Moisturizer,MURAD,oilcontrol mattifier spf 15 pa,"water, cyclopentasiloxane, bis-peg/ppg-14/14 d...",combination oily,4.2
151,Moisturizer,OLEHENRIKSEN,counter balance oil control hydrator,"water, butylene glycol, glycerin, caprylic/cap...",combination oily,4.3
163,Moisturizer,CLINIQUE,acne solutions bb cream broad spectrum spf 40,"octinoxate 7.5% , octisalate 2.0% , titanium d...",combination oily,4.1
194,Moisturizer,CLINIQUE,limited edition dramatically different moistur...,"water, dimethicone, isododecane, butylene glyc...",combination oily,5.0
