# Phase 1 Project: Ebay Perfume Analysis
## Authored by Trinity Gahagen

In this analysis, we will be exploring and analyzing different perfumes sold on ebay.

In [255]:
# Import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [256]:
mens_perfume = pd.read_csv('ebay_mens_perfume.csv')
womens_perfume = pd.read_csv('ebay_womens_perfume.csv')

In [257]:
mens_perfume.head()

Unnamed: 0,brand,title,type,price,priceWithCurrency,available,availableText,sold,lastUpdated,itemLocation
0,Dior,Christian Dior Sauvage Men's EDP 3.4 oz Fragra...,Eau de Parfum,84.99,US $84.99/ea,10.0,More than 10 available / 116 sold,116.0,"May 24, 2024 10:03:04 PDT","Allen Park, Michigan, United States"
1,AS SHOW,A-v-entus Eau de Parfum 3.3 oz 100ML Millesime...,Eau de Parfum,109.99,US $109.99,8.0,8 available / 48 sold,48.0,"May 23, 2024 23:07:49 PDT","Atlanta, Georgia, Canada"
2,Unbranded,HOGO BOSS cologne For Men 3.4 oz,Eau de Toilette,100.0,US $100.00,10.0,More than 10 available / 27 sold,27.0,"May 22, 2024 21:55:43 PDT","Dearborn, Michigan, United States"
3,Giorgio Armani,Acqua Di Gio by Giorgio Armani 6.7 Fl oz Eau D...,Eau de Toilette,44.99,US $44.99/ea,2.0,2 available / 159 sold,159.0,"May 24, 2024 03:30:43 PDT","Reinholds, Pennsylvania, United States"
4,Lattafa,Lattafa Men's Hayaati Al Maleky EDP Spray 3.4 ...,Fragrances,16.91,US $16.91,,Limited quantity available / 156 sold,156.0,"May 24, 2024 07:56:25 PDT","Brooklyn, New York, United States"


In [258]:
womens_perfume.head()

Unnamed: 0,brand,title,type,price,priceWithCurrency,available,availableText,sold,lastUpdated,itemLocation
0,Carolina Herrera,Good Girl by Carolina Herrera 2.7 oz Eau De Pa...,Eau de Parfum,43.99,US $43.99/ea,2.0,2 available / 393 sold,393.0,"May 23, 2024 10:43:50 PDT","Thomasville, Alabama, United States"
1,As Shown,Parfums de Marly Delina La Rosee Eau de Parfum...,Eau de Parfum,79.99,US $79.99,5.0,5 available / 40 sold,40.0,"May 24, 2024 00:15:48 PDT","New Jersey, Hong Kong"
2,PRADA,PRADA Paradoxe by Prada EDP 3.0oz/90ml Spray P...,Eau de Parfum,59.99,US $59.99,10.0,More than 10 available / 35 sold,35.0,"May 14, 2024 20:54:25 PDT","Orange, New Jersey, United States"
3,As Show,J'adore Parfum D'eau by Christian 3.4 oz EDP F...,Eau de Parfum,59.99,US $59.99/ea,10.0,More than 10 available / 9 sold,9.0,"May 23, 2024 01:23:05 PDT","USA, New Jersey, Hong Kong"
4,Khadlaj,Shiyaaka for Men EDP Spray 100ML (3.4 FL.OZ) B...,Eau de Parfum,29.99,US $29.99/ea,10.0,More than 10 available,,,"Little Ferry, New Jersey, United States"


In [259]:
mens_perfume['forMen'] = 1
womens_perfume['forWomen'] = 1
womens_perfume[['unisex','forMen']] = 0
mens_perfume[['unisex', 'forWomen']] = 0

print(mens_perfume.columns)
print(womens_perfume.columns)

Index(['brand', 'title', 'type', 'price', 'priceWithCurrency', 'available',
       'availableText', 'sold', 'lastUpdated', 'itemLocation', 'forMen',
       'unisex', 'forWomen'],
      dtype='object')
Index(['brand', 'title', 'type', 'price', 'priceWithCurrency', 'available',
       'availableText', 'sold', 'lastUpdated', 'itemLocation', 'forWomen',
       'unisex', 'forMen'],
      dtype='object')


In [260]:
perfumes = pd.concat([mens_perfume, womens_perfume], ignore_index=True)

In [261]:
perfumes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   brand              1998 non-null   object 
 1   title              2000 non-null   object 
 2   type               1995 non-null   object 
 3   price              2000 non-null   float64
 4   priceWithCurrency  2000 non-null   object 
 5   available          1758 non-null   float64
 6   availableText      1989 non-null   object 
 7   sold               1978 non-null   float64
 8   lastUpdated        1874 non-null   object 
 9   itemLocation       2000 non-null   object 
 10  forMen             2000 non-null   int64  
 11  unisex             2000 non-null   int64  
 12  forWomen           2000 non-null   int64  
dtypes: float64(3), int64(3), object(7)
memory usage: 203.2+ KB


In [262]:
perfumes.dropna(subset=['brand', 'type'], inplace=True)

In [263]:
UNISEX_MASK = perfumes['title'].str.lower().str.contains('unisex')
WOMEN_MASK = perfumes['title'].str.lower().str.contains('for women')
MEN_MASK = perfumes['title'].str.lower().str.contains('for men')

perfumes['unisex'] = np.where(UNISEX_MASK, 1, perfumes['unisex'])
perfumes['forWomen'] = np.where(WOMEN_MASK, 1, perfumes['forWomen'])
perfumes['forMen'] = np.where(MEN_MASK, 1, perfumes['forMen'])

for col in ['forWomen', 'forMen']:
    perfumes[col] = np.where(perfumes['unisex'] == 1, 0, perfumes[col])

In [264]:
perfumes['type'].unique()

array(['Eau de Parfum', 'Eau de Toilette', 'Fragrances', 'Perfume', '/',
       'PARFUM', 'Parfum', 'Concentrated Uncut Pure Body Oil',
       'LE PARFUM', 'Eau De Parfum', 'Unscented', 'Eau de Cologne',
       '~ THE ONE EAU DE PARFUM SPRAY ~', 'EXTRAIT DE PARFUM',
       'Eau De Toilette', 'Eau De Parfum Intense', 'Pheromone',
       'Aftershave', 'Fragrance & Perfume', 'Eau de Perfume',
       'Jo Malone Cologne Intense Spray', 'Y', 'Gift Sets',
       'Fragrance Rolling Ball', 'Body Spray', 'Eau de toilette',
       'Eau de Toillette',
       'Eau De Toilette, Eau De Parfum, Eau De Parfum Intense', 'Cologne',
       'le parfum', 'Eau de Toilette Intense',
       'Eau de Cologne Spray, Cologne Spray', 'Extrait De Parfum',
       'Fine Cologne', 'Does not apply', 'EDT', 'Extrait de Parfum',
       'Editions Parfums', 'DIOR HOMME COLOGNE', 'Deodorant', 'De Nuit',
       'Eau de Toilette, Cologne Spray', 'Parfum Intense',
       'Eau de Parfum Intense', 'cologne', 'EAU DE COLOGNE SPRAY

In [265]:
eau_de_toilette = "(:?eau){1} de toi.*|edt"
eau_de_parfum = "(:?eau){1} de parfum|edp"
mist = "mist"
cologne = "cologne|edc"
parfum = "(:?eau){0}parfum{1}"
oil = "(?<=t)oil"
fragrance = "fragrance"
roll_on = "roll"
lotion = "lotion|moisturizer|cream"
deodorant = "deodorant"
sets = "set|pc"
perfume = "perfume|eau de perfume"

type_masks = [eau_de_toilette, eau_de_parfum, mist, cologne, parfum, oil, fragrance, roll_on, lotion, deodorant, sets, perfume]

type_categories = [
    "Eau De Toilette",
    "Eau De Parfum",
    "Mist",
    "Cologne",
    "Parfum",
    "Oil",
    "Fragrance",
    "Roll On",
    "Lotion",
    "Deodorant",
    "Sets",
    "Perfume"
]

type_conditions = [(perfumes['type'].str.lower().str.contains(condition, na=False)) for condition in type_masks]

  return func(self, *args, **kwargs)


In [266]:
perfumes['type_clean'] = np.select(type_conditions, type_categories, default="Other")

In [267]:
perfumes['type_clean'].unique()

array(['Eau De Parfum', 'Eau De Toilette', 'Fragrance', 'Perfume',
       'Other', 'Parfum', 'Cologne', 'Sets', 'Deodorant', 'Roll On',
       'Mist', 'Lotion', 'Oil'], dtype=object)

In [268]:
perfumes['itemLocation'].unique()

array(['Allen Park, Michigan, United States', 'Atlanta, Georgia, Canada',
       'Dearborn, Michigan, United States',
       'Reinholds, Pennsylvania, United States',
       'Brooklyn, New York, United States',
       'Houston, Texas, United States',
       'Englewood Cliffs, New Jersey, United States',
       'Ithaca, New York, United States', 'shanghai, China',
       'Dearborn Heights, Michigan, United States',
       'Ecorse, Michigan, United States',
       'Warren, Michigan, United States',
       'San Francisco, California, United States',
       'Dayton,New Jersey, Hong Kong',
       'San Jose, California, United States',
       'Miami, Florida, United States',
       'Hamtramck, Michigan, United States',
       'Flat Lick, Kentucky, United States',
       'Elmhurst, New York, United States',
       'Hackensack, New Jersey, United States',
       'Dallas, Texas, United States',
       'Pomona, California, United States', 'Katy, Texas, United States',
       'College Point, New 

In [269]:
usa = "usa|united states|us|estados unidos|unitedstates"
hong_kong = "hong kong|hk|hongkong"
china = "china"
india = "india"
pakistan = "pakistan"
canada = "canada"
taiwan = "taiwan"
brazil = "brazil"
japan = "japan"
bulgaria = "bulgaria"

location_masks = [usa, hong_kong, china, india, pakistan, canada, taiwan, brazil, japan, bulgaria]

location_categories = [
    "USA",
    "HK",
    "China",
    "India",
    "Pakistan",
    "Canada",
    "Taiwan",
    "Brazil",
    "Japan",
    "Bulgaria"
]

location_conditions = [(perfumes['itemLocation'].str.lower().str.contains(condition, na=False)) for condition in location_masks]

zipped_locations = zip(location_categories, location_conditions)

In [270]:
for place, condition in zipped_locations:
    col_label = f'in_{place}'
    perfumes[col_label] = condition.map(lambda x: 1 if x == True else 0)

In [271]:
perfumes_clean = perfumes.drop(columns=['type', 'priceWithCurrency', 'itemLocation', 'availableText'])

In [272]:
perfumes_clean.isna().sum()

brand            0
title            0
price            0
available      239
sold            21
lastUpdated    126
forMen           0
unisex           0
forWomen         0
type_clean       0
in_USA           0
in_HK            0
in_China         0
in_India         0
in_Pakistan      0
in_Canada        0
in_Taiwan        0
in_Brazil        0
in_Japan         0
in_Bulgaria      0
dtype: int64

In [None]:
perfumes_clean.fillna('Unknown', inplace=True)