In [58]:
# Import dependencies.
# Only pandas is used by the cells visible in this part of the notebook;
# matplotlib, numpy and scipy.stats are presumably used by later cells.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
import warnings
# NOTE(review): with no category argument this suppresses every warning for the
# whole session (including pandas deprecation/dtype warnings) — consider
# narrowing it to the specific warning that was noisy.
warnings.filterwarnings("ignore")

In [59]:
# Load the cosmetics CSV into a DataFrame.
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so each column's dtype is inferred from all of its values.
input_data = "Resources/cosmetics_data.csv"
cosmetics_df = pd.read_csv(filepath_or_buffer=input_data, low_memory=False)
cosmetics_df.head(n=5)

Unnamed: 0,CDPHId,ProductName,CSFId,CSF,CompanyId,CompanyName,BrandName,PrimaryCategoryId,PrimaryCategory,SubCategoryId,...,CasNumber,ChemicalId,ChemicalName,InitialDateReported,MostRecentDateReported,DiscontinuedDate,ChemicalCreatedAt,ChemicalUpdatedAt,ChemicalDateRemoved,ChemicalCount
0,2,ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES,,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),53,...,13463-67-7,6,Titanium dioxide,06/17/2009,08/28/2013,02/01/2011,07/09/2009,07/09/2009,,1
1,3,Glover's Medicated Shampoo,,,338,J. Strickland & Co.,Glover's,18,Hair Care Products (non-coloring),25,...,65996-92-1,4,Distillates (coal tar),07/01/2009,07/01/2009,,07/01/2009,07/01/2009,,2
2,3,Glover's Medicated Shampoo,,,338,J. Strickland & Co.,Glover's,18,Hair Care Products (non-coloring),25,...,140-67-0,5,Estragole,07/01/2009,07/01/2009,,07/02/2009,07/02/2009,,2
3,4,PRECISION GLIMMER EYE LINER-ALL SHADES �,,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),46,...,13463-67-7,7,Titanium dioxide,07/09/2009,08/28/2013,,07/09/2009,07/09/2009,,1
4,5,AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �,,,4,New Avon LLC,AVON,44,Makeup Products (non-permanent),52,...,13463-67-7,8,Titanium dioxide,07/09/2009,08/28/2013,02/01/2011,07/09/2009,07/09/2009,,1


In [60]:
# Drop the ID columns, producing a new frame and leaving cosmetics_df untouched
columns_to_drop = [
    "CDPHId",
    "CSFId",
    "CompanyId",
    "PrimaryCategoryId",
    "SubCategoryId",
    "CasId",
    "ChemicalId",
]
cosmetics_cleaned_df = cosmetics_df.drop(columns_to_drop, axis="columns")
cosmetics_cleaned_df.head(n=5)

Unnamed: 0,ProductName,CSF,CompanyName,BrandName,PrimaryCategory,SubCategory,CasNumber,ChemicalName,InitialDateReported,MostRecentDateReported,DiscontinuedDate,ChemicalCreatedAt,ChemicalUpdatedAt,ChemicalDateRemoved,ChemicalCount
0,ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES,,New Avon LLC,AVON,Makeup Products (non-permanent),"Lip Color - Lipsticks, Liners, and Pencils",13463-67-7,Titanium dioxide,06/17/2009,08/28/2013,02/01/2011,07/09/2009,07/09/2009,,1
1,Glover's Medicated Shampoo,,J. Strickland & Co.,Glover's,Hair Care Products (non-coloring),Hair Shampoos (making a cosmetic claim),65996-92-1,Distillates (coal tar),07/01/2009,07/01/2009,,07/01/2009,07/01/2009,,2
2,Glover's Medicated Shampoo,,J. Strickland & Co.,Glover's,Hair Care Products (non-coloring),Hair Shampoos (making a cosmetic claim),140-67-0,Estragole,07/01/2009,07/01/2009,,07/02/2009,07/02/2009,,2
3,PRECISION GLIMMER EYE LINER-ALL SHADES �,,New Avon LLC,AVON,Makeup Products (non-permanent),Eyeliner/Eyebrow Pencils,13463-67-7,Titanium dioxide,07/09/2009,08/28/2013,,07/09/2009,07/09/2009,,1
4,AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �,,New Avon LLC,AVON,Makeup Products (non-permanent),Lip Gloss/Shine,13463-67-7,Titanium dioxide,07/09/2009,08/28/2013,02/01/2011,07/09/2009,07/09/2009,,1


In [61]:
# Convert every date column from text to datetime64; blank entries become NaT
date_columns = [
    "InitialDateReported",
    "MostRecentDateReported",
    "DiscontinuedDate",
    "ChemicalCreatedAt",
    "ChemicalUpdatedAt",
    "ChemicalDateRemoved",
]
for date_column in date_columns:
    cosmetics_cleaned_df[date_column] = pd.to_datetime(cosmetics_cleaned_df[date_column])
cosmetics_cleaned_df.head()

Unnamed: 0,ProductName,CSF,CompanyName,BrandName,PrimaryCategory,SubCategory,CasNumber,ChemicalName,InitialDateReported,MostRecentDateReported,DiscontinuedDate,ChemicalCreatedAt,ChemicalUpdatedAt,ChemicalDateRemoved,ChemicalCount
0,ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES,,New Avon LLC,AVON,Makeup Products (non-permanent),"Lip Color - Lipsticks, Liners, and Pencils",13463-67-7,Titanium dioxide,2009-06-17,2013-08-28,2011-02-01,2009-07-09,2009-07-09,NaT,1
1,Glover's Medicated Shampoo,,J. Strickland & Co.,Glover's,Hair Care Products (non-coloring),Hair Shampoos (making a cosmetic claim),65996-92-1,Distillates (coal tar),2009-07-01,2009-07-01,NaT,2009-07-01,2009-07-01,NaT,2
2,Glover's Medicated Shampoo,,J. Strickland & Co.,Glover's,Hair Care Products (non-coloring),Hair Shampoos (making a cosmetic claim),140-67-0,Estragole,2009-07-01,2009-07-01,NaT,2009-07-02,2009-07-02,NaT,2
3,PRECISION GLIMMER EYE LINER-ALL SHADES �,,New Avon LLC,AVON,Makeup Products (non-permanent),Eyeliner/Eyebrow Pencils,13463-67-7,Titanium dioxide,2009-07-09,2013-08-28,NaT,2009-07-09,2009-07-09,NaT,1
4,AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �,,New Avon LLC,AVON,Makeup Products (non-permanent),Lip Gloss/Shine,13463-67-7,Titanium dioxide,2009-07-09,2013-08-28,2011-02-01,2009-07-09,2009-07-09,NaT,1


In [62]:
# (rows, columns) of the cleaned frame before duplicate rows are removed
cosmetics_cleaned_df.shape

(114635, 15)

In [63]:
# Count rows that exactly repeat an earlier row across all columns
duplicate_row_count = cosmetics_cleaned_df.duplicated().sum()
print(duplicate_row_count)

7692


In [64]:
# Remove exact duplicate rows, keeping the first occurrence of each
cosmetics_cleaned_df = cosmetics_cleaned_df.drop_duplicates(keep="first")
cosmetics_cleaned_df.shape

(106943, 15)