# Statistical Test for Price with other Variables 

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from utils import *

In [3]:
skin_care_df = pd.read_csv('../data_cleaning/skin_care_cleaned.csv')
for col in skin_care_df.columns.values:
    col_new = col.replace(' ','_').replace('-','_').replace(':','').replace('/','_')
    skin_care_df.rename({col : col_new}, axis=1, inplace=True)

skin_care_df = skin_care_df.loc[skin_care_df['price']<1000]

### ANOVA test for product category

In [4]:
model = ols('price ~ product_category', data=skin_care_df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
product_category,1950311.0,19.0,47.899637,3.5915689999999996e-166
Residual,11368510.0,5305.0,,


### ANOVA test for brand

In [5]:
df = skin_care_df.drop(['product_category'],axis=1).drop_duplicates()

In [6]:
model = ols('price ~ brand', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
anova_table

Unnamed: 0,sum_sq,df,F,PR(>F)
brand,6915815.0,223.0,24.286528,0.0
Residual,5908437.0,4627.0,,


### Tests For ingredient

drop products without ingredient information.

In [7]:
df = df.loc[~df['ingredient'].isnull()]

#### Slope test for number of ingredients

In [16]:
n_ingredient_lr = lr_stats(skin_care_df, ['n_inactive_ingredient', 'n_active_ingredient'], 'price')
n_ingredient_lr

Unnamed: 0,slope,intercept,r-value,p-value,stderr
n_inactive_ingredient,1.282797,5.611741,0.405608,4.641363e-210,0.039623
n_active_ingredient,-3.514788,43.376997,-0.091362,2.400549e-11,0.525093


#### Slope test for ingredient rating

In [22]:
n_ingredient_lr = lr_stats(skin_care_df, 
      ['active_mean_rating', 'inactive_mean_rating', 'inactive_mean_rating_w1',
       'inactive_mean_rating_w2'], 'price')
n_ingredient_lr

Unnamed: 0,slope,intercept,r-value,p-value,stderr
inactive_mean_rating_w2,21.62835,-3.268316,0.100807,3.092106e-13,2.959006
inactive_mean_rating_w1,11.613253,17.76058,0.055304,6.540729e-05,2.906472
inactive_mean_rating,11.205006,19.353416,0.054856,7.494401e-05,2.827266
active_mean_rating,1.124973,25.092251,0.018786,0.5521378,1.891471


#### Slope test for ingredient category count

* F-test

In [24]:
inactive_cat_count = [f_ for f_ in df.columns.values if f_.find('inactive_cat_count')>=0]
fitting_formula = "price ~ " + " + ".join(inactive_cat_count)
model = ols(fitting_formula, data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.253
Model:,OLS,Adj. R-squared:,0.249
Method:,Least Squares,F-statistic:,55.11
Date:,"Wed, 17 Oct 2018",Prob (F-statistic):,2.1100000000000002e-272
Time:,19:30:43,Log-Likelihood:,-24731.0
No. Observations:,4742,AIC:,49520.0
Df Residuals:,4712,BIC:,49720.0
Df Model:,29,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,8.9125,1.564,5.698,0.000,5.846,11.979
inactive_cat_count_Absorbent,-1.5480,0.898,-1.723,0.085,-3.309,0.213
inactive_cat_count_Anti_Acne,-10.5180,3.035,-3.466,0.001,-16.468,-4.568
inactive_cat_count_Antioxidants,2.5754,0.332,7.761,0.000,1.925,3.226
inactive_cat_count_Cleansing_Agents,-1.2692,0.392,-3.238,0.001,-2.038,-0.501
inactive_cat_count_Coloring_Agents_Pigments,1.7200,0.583,2.952,0.003,0.578,2.862
inactive_cat_count_Emollients,0.6038,0.269,2.242,0.025,0.076,1.132
inactive_cat_count_Emulsifiers,-1.9032,1.022,-1.863,0.063,-3.906,0.100
inactive_cat_count_Exfoliant,3.0894,1.225,2.523,0.012,0.689,5.490

0,1,2,3
Omnibus:,4277.149,Durbin-Watson:,1.909
Prob(Omnibus):,0.0,Jarque-Bera (JB):,210403.239
Skew:,4.183,Prob(JB):,0.0
Kurtosis:,34.542,Cond. No.,83.3


* t-test for each ingredient category

In [23]:
ingredient_cat_lr = lr_stats(df, inactive_cat_count, 'price')
for col in ingredient_cat_lr.index.values:
    ingredient_cat_lr.rename({col:col[19:]},inplace=True)
ingredient_cat_lr

Unnamed: 0,slope,intercept,r-value,p-value,stderr
Skin_Restoring,10.684626,21.645609,0.39197,5.961647e-174,0.364246
Texture_Enhancer,4.85487,17.272401,0.364458,6.44091e-149,0.180174
Antioxidants,3.769359,21.188856,0.354678,1.3646440000000002e-140,0.144328
Emollients,3.367237,22.800924,0.296379,9.10088e-97,0.157606
Skin_Replenishing,5.861908,26.987551,0.282312,1.333018e-87,0.289324
Skin_Soothing,7.05217,30.158931,0.259103,1.320731e-73,0.381831
Plant_Extracts,2.295747,28.782692,0.250339,1.121349e-68,0.128959
Preservatives,5.964672,25.812329,0.246784,9.876094e-67,0.340201
Hydration,12.967283,34.599852,0.243207,8.323318e-65,0.75118
Fragrance_Synthetic_and_Fragrant_Plant_Extracts,4.758296,33.49161,0.233642,8.227357e-60,0.287622


#### chi-square test for individule ingredient

In [21]:
ingredient_count_df = get_matching_ingredient_count(df)

common_ingredients = []
for col in ingredient_count_df.columns.values:
    if ingredient_count_df[col].sum() > 100:
       common_ingredients.append(col) 
    
price_band, bins = pd.qcut(df['price'], q=[0, 0.2, 0.4, 0.6, 0.8, 1], labels=False, retbins=True)
ingredient_count_df['price_band'] = price_band.reset_index(drop=True)

results = chi2_contingency(ingredient_count_df, common_ingredients, 'price_band')
results.loc[results['p-value']<0.05]

  if np.any(observed < 0):


Unnamed: 0,chi2,p-value
sodium hyaluronate,607.992236,2.887072e-130
butylene glycol,455.716584,2.523294e-97
phenoxyethanol,320.416654,4.263783e-68
dimethicone,234.051406,1.771575e-49
pentylene glycol,218.838099,3.333915e-46
adenosine,216.327398,1.156593e-45
lecithin,209.555720,3.310917e-44
capric triglyceride,186.625503,2.814215e-39
citronellol,184.484667,8.114779e-39
algae extract,175.794968,5.963445e-37


Using sklearn's chi2 feature selection, I get different results...

In [49]:
from sklearn.feature_selection import chi2

chi2s, p_values = chi2(ingredient_count_df[common_ingredients], ingredient_count_df['price_band'])
results = pd.DataFrame({'chi2':chi2s, 'p_value':p_values}, index=common_ingredients)
results = results.sort_values('p_value')
results.loc[results['p_value']<0.05]

Unnamed: 0,chi2,p_value
sodium hyaluronate,455.801856,2.418422e-97
butylene glycol,260.821343,3.033861e-55
adenosine,207.796562,7.912581e-44
pentylene glycol,193.225458,1.074314e-40
lecithin,188.697369,1.009722e-39
citronellol,165.927268,7.824251e-35
algae extract,163.783671,2.256002e-34
yeast extract,151.226570,1.111403e-31
acetyl hexapeptide-8,150.862675,1.330019e-31
dimethicone,150.094965,1.942573e-31
