# Price Regression

In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
#from lightgbm import LGBMRegressor
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import itertools

import sys
sys.path.append('../utilities')
from data_utils import *
from ml_utils import *

In [2]:
# read cleaned data
skin_care_df = pd.read_csv('../data_cleaning/skin_care_cleaned.csv')
body_care_df = pd.read_csv('../data_cleaning/body_care_cleaned.csv')
makeup_df = pd.read_csv('../data_cleaning/makeup_cleaned.csv')

# merge skin-care, body-care and makeup table
df = pd.concat([skin_care_df,body_care_df,makeup_df], sort=False).reset_index(drop=True)

df = df.loc[df['price']<1000]

df['count_extract'] = df['inactive_ingredient_list'].apply(count_pattern, pattern='extract')
df['count_peptide'] = df['inactive_ingredient_list'].apply(count_pattern, pattern='peptide')
df['count_oil'] = df['inactive_ingredient_list'].apply(count_pattern, pattern='oil')

In [19]:
target = 'price'
meanenc_feats = ['product_category', 'brand', 'size_unit']
gen_features = ['product_category_mean_encode', 'brand_mean_encode', 'size_num', 'size_unit_mean_encode']
ingredient_features = ['n_inactive_ingredient','n_active_ingredient','active_mean_rating', 'inactive_mean_rating', 'inactive_mean_rating_w1','inactive_mean_rating_w2']
ingredient_features += [f_ for f_ in df.columns.values if f_.find('count')>=0]

In [20]:
train, test = train_test_split(df)
folds = KFold(5, random_state=777)

for col in meanenc_feats:
    train[col+'_mean_encode'] = 0.
    test[col+'_mean_encode'] = 0.
    SMOOTHING = test[~test[col].isin(train[col])].shape[0]/test.shape[0]
    _, test[col+'_mean_encode'] = target_encode(train[col], 
                                      test[col], 
                                      target=train[target], 
                                      min_samples_leaf=20,
                                      smoothing=SMOOTHING,
                                      noise_level=0.02)
    for f, (vis_index, blind_index) in enumerate(folds.split(train, train[target])):
        _, train[col+'_mean_encode'].iloc[blind_index] = target_encode(train[col].iloc[vis_index], 
                                                            train[col].iloc[blind_index], 
                                                            target=train[target].iloc[vis_index], 
                                                            min_samples_leaf=20,
                                                            smoothing=SMOOTHING,
                                                            noise_level=0.02)     
        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A

In [36]:
features = gen_features
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

model = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1)
y_train_pred = oof_preds(X_train.values, y_train.values, model, folds=folds, return_prob=False)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("rmse: (train cv) %.3f, (test) %.3f"%(rmse_train, rmse_test))

rmse: (train cv) 29.676, (test) 31.254


In [37]:
features = gen_features + ingredient_features
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

model = XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1)
y_train_pred = oof_preds(X_train.values, y_train.values, model, folds=folds, return_prob=False)

model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("rmse: (train cv) %.3f, (test) %.3f"%(rmse_train, rmse_test))

rmse: (train cv) 27.399, (test) 26.064
