In [12]:
##### import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandasql import sqldf
import pandas_profiling
import datetime

In [91]:
##### Basics
from pprint import pprint
import joblib
#### For Feature selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from treeinterpreter import treeinterpreter as ti

In [92]:
from sklearn.model_selection import train_test_split
from sklearn import metrics, datasets, ensemble
from sklearn import tree

In [93]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

In [94]:
data = pd.read_csv("...data_train_non_trans_merge_final_dup.csv")
data.head(1)

Unnamed: 0.1,Unnamed: 0,id,prices_availability,prices_condition,prices_isSale,prices_merchant_x,brand_x,manufacturerNumber,primaryCategories,weight_norm,...,category_3,category_4,category_5,category_6,shipping_flag,n_days_since_added,n_days_since_updated,prices_merchant_y,brand_y,price_USD
0,0,AVphzgbJLJeJML43fA0o,In Stock,New,False,Bestbuy.com,Sanus,VLF410B1,Electronics,32.8,...,Electronics,A/V Presentation,Accessories & Supplies,TV Ceiling & Wall Mounts,,2313,1188,1,2,104.99


In [95]:
data_train = data[['weight_norm',
                   'n_days_since_added',
                   'category_2',
                   'category_4',
                   'manufacturerNumber',
                   'category_1',
                   'category_0',
                   'price_USD']]

In [96]:
data_train.dtypes

weight_norm           float64
n_days_since_added      int64
category_2             object
category_4             object
manufacturerNumber     object
category_1             object
category_0             object
price_USD             float64
dtype: object

In [97]:
data_train['manufacturerNumber'] = data_train['manufacturerNumber'].astype(str)
data_train['category_4'] = data_train['category_4'].astype(str)
data_train['category_2'] = data_train['category_2'].astype(str)
data_train['category_1'] = data_train['category_1'].astype(str)
data_train['category_0'] = data_train['category_0'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [98]:
data_train_t_imputed = data_train.copy()

In [99]:
data_train_t_imputed.dtypes

weight_norm           float64
n_days_since_added      int64
category_2             object
category_4             object
manufacturerNumber     object
category_1             object
category_0             object
price_USD             float64
dtype: object

# Base Line model

In [100]:
# Let us start with base line model, which is the average price of product of different categories. 
# In other words, in our base line model, price prediction will be the average price of product categories

In [101]:
category_median = data_train.pivot_table(index=['category_2', 'category_4', 'category_1', 'category_0'], values=['price_USD'], aggfunc=np.median)

In [102]:
data_train_baseline_model = pd.merge(category_median, data_train, how='left',
                  left_on = ['category_2', 'category_4', 'category_1', 'category_0'], right_on = ['category_2', 'category_4', 'category_1', 'category_0'])

In [103]:
# The Mean Absolute Error of this simplistic approach is 103.75

In [104]:
(sum(abs(data_train_baseline_model['price_USD_x'] - data_train_baseline_model['price_USD_y'])))/14146

103.75712379471254

# RandomForestRegressor - top7 predictors

In [105]:
#Encoding of categorical features

In [106]:
categorical_features = list(data_train_t_imputed.select_dtypes(exclude=["number"]).columns)

In [107]:
le = LabelEncoder()

In [108]:
for i in range(len(categorical_features)):
    new = le.fit_transform(data_train_t_imputed[categorical_features[i]])
    data_train_t_imputed[categorical_features[i]] = new
data_train_t_imputed.head()

Unnamed: 0,weight_norm,n_days_since_added,category_2,category_4,manufacturerNumber,category_1,category_0,price_USD
0,32.8,2313,322,23,1120,299,15,104.99
1,30.0,1216,322,11,734,299,15,99.99
2,175.0,2113,322,11,1134,299,15,419.95
3,175.0,2113,322,11,1134,299,15,579.99
4,175.0,2113,322,11,1134,299,15,398.99


In [109]:
# Set some records apart to test our model further on

In [110]:
#Splitting the data into independent and dependent variables
X = data_train_t_imputed.iloc[:,0:7].values
y = data_train_t_imputed.iloc[:,7].values
print('The independent features set: ')
print(X[:5,:])
print('The dependent variable: ')
print(y[:5])

The independent features set: 
[[  32.8 2313.   322.    23.  1120.   299.    15. ]
 [  30.  1216.   322.    11.   734.   299.    15. ]
 [ 175.  2113.   322.    11.  1134.   299.    15. ]
 [ 175.  2113.   322.    11.  1134.   299.    15. ]
 [ 175.  2113.   322.    11.  1134.   299.    15. ]]
The dependent variable: 
[104.99  99.99 419.95 579.99 398.99]


In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [112]:
# Let us use Random Search to find the best parameters for Random Forest

In [113]:
from sklearn.model_selection import RandomizedSearchCV

In [114]:
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 30, num = 7)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 5, stop = 20, num = 7)]
min_samples_split = [5, 10, 20]
min_samples_leaf = [1, 3, 4, 5]
bootstrap = [True, False]

In [115]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [116]:
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [5, 7, 10, 12, 15, 17, 20],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 3, 4, 5],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [5, 9, 13, 17, 21, 25, 30]}


In [117]:
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 400, cv = 10, verbose=2, random_state=42, n_jobs = -1)

In [118]:
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   40.8s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 3285 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 10.3min finished


RandomizedSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42),
                   n_iter=400, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [5, 7, 10, 12, 15, 17, 20],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4, 5],
                                        'min_samples_split': [5, 10, 20],
                                        'n_estimators': [5, 9, 13, 17, 21, 25,
                                                         30]},
                   random_state=42, verbose=2)

In [119]:
rf_random.best_params_

{'n_estimators': 17,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 20,
 'bootstrap': False}

In [120]:
pd.DataFrame(rf_random.cv_results_).head(1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_bootstrap,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.119136,0.009312,0.003889,0.000536,5,10,3,sqrt,15,False,...,0.896612,0.930759,0.909144,0.837386,0.55412,0.889059,0.827046,0.863612,0.109585,41


In [121]:
rf_random_results = pd.DataFrame(rf_random.cv_results_)[['params','rank_test_score','mean_test_score']].sort_values(by=["rank_test_score"], ascending=True)

In [122]:
%%time
num_iter = 10
rf_importance = pd.DataFrame(columns = ['Feature', 'Importance_rf', 'Iteration_n'])
importance = []

for i in range(num_iter):

    rf = RandomForestRegressor(bootstrap = True, n_estimators = 21, max_features='sqrt',  min_samples_leaf = 1, min_samples_split = 5, max_depth = 20, criterion = 'mse')
    rf.fit(X_train, y_train)
    importance = pd.DataFrame(list(zip(data_train_t_imputed.columns[0:X_train.shape[1]], rf.feature_importances_)))
    importance.rename(columns = {importance.columns[0] : 'Feature', importance.columns[1] : 'Importance_rf'}, inplace = True)
    importance['Iteration_n'] = i
    rf_importance = rf_importance.append(importance)

Wall time: 4.52 s


In [123]:
rf_importance_agg = rf_importance.groupby(['Feature'], as_index = False).mean()
rf_importance_agg.sort_values('Importance_rf', inplace=True, ascending=False)
rf_importance_agg

Unnamed: 0,Feature,Importance_rf
6,weight_norm,0.370945
2,category_2,0.136776
5,n_days_since_added,0.118057
4,manufacturerNumber,0.111414
3,category_4,0.094944
0,category_0,0.091094
1,category_1,0.07677


In [124]:
y_pred = rf.predict(X_test)
y_pred

array([ 69.52893896, 318.57391017, 441.75922687, ..., 478.45570522,
       184.49894322,  80.86814653])

In [125]:
#AS we can see, the Mean Absolute Error in Random Forest approach is ~101 (slightly better than base line model)

In [126]:
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(y_test, y_pred, squared=False))
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred))
print('Max Error:', metrics.max_error(y_test, y_pred))
print('Mean Squared Log Error:', metrics.mean_squared_log_error(y_test, y_pred))
print('Median Absolute Error:', metrics.median_absolute_error(y_test, y_pred))
print('R^2:', metrics.r2_score(y_test, y_pred))
print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(y_test, y_pred))

Mean Absolute Error (MAE): 95.27513275168799
Mean Squared Error (MSE): 99176.71946003742
Root Mean Squared Error (RMSE): 314.92335489772336
Explained Variance Score: 0.859872891981674
Max Error: 8611.852044716152
Mean Squared Log Error: 0.19954300313482756
Median Absolute Error: 27.26378831796223
R^2: 0.8597355320981503
Mean Poisson Deviance: 69.2599189815745


# Griadient Boosting

In [127]:
# Let us use Random Search to find the best parameters for Gradient Boosting

In [128]:
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 30, num = 7)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 5, stop = 20, num = 7)]
min_samples_split = [5, 10, 20]
min_samples_leaf = [2, 3, 4, 5]
#bootstrap = [True, False]

In [129]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

In [130]:
pprint(random_grid)

{'max_depth': [5, 7, 10, 12, 15, 17, 20],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [2, 3, 4, 5],
 'min_samples_split': [5, 10, 20],
 'n_estimators': [5, 9, 13, 17, 21, 25, 30]}


In [131]:
gb = GradientBoostingRegressor(random_state=42)
gb_random = RandomizedSearchCV(estimator = gb, param_distributions = random_grid, n_iter = 400, cv = 10, verbose=2, random_state=42, n_jobs = -1)

In [132]:
# Fit the random search model
gb_random.fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 186 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done 389 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1037 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1482 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 2009 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 2616 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 3305 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 4000 out of 4000 | elapsed: 10.7min finished


RandomizedSearchCV(cv=10, estimator=GradientBoostingRegressor(random_state=42),
                   n_iter=400, n_jobs=-1,
                   param_distributions={'max_depth': [5, 7, 10, 12, 15, 17, 20],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [2, 3, 4, 5],
                                        'min_samples_split': [5, 10, 20],
                                        'n_estimators': [5, 9, 13, 17, 21, 25,
                                                         30]},
                   random_state=42, verbose=2)

In [133]:
gb_random.best_params_

{'n_estimators': 30,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 17}

In [134]:
pd.DataFrame(gb_random.cv_results_)[['params','rank_test_score','mean_test_score']].sort_values(by=["rank_test_score"], ascending=True).head(5)

Unnamed: 0,params,rank_test_score,mean_test_score
118,"{'n_estimators': 30, 'min_samples_split': 10, ...",1,0.875762
6,"{'n_estimators': 30, 'min_samples_split': 5, '...",2,0.875199
116,"{'n_estimators': 30, 'min_samples_split': 10, ...",3,0.875061
306,"{'n_estimators': 25, 'min_samples_split': 5, '...",4,0.874362
31,"{'n_estimators': 30, 'min_samples_split': 10, ...",5,0.873743


In [135]:
%%time
num_iter = 10
gb_importance = pd.DataFrame(columns = ['Feature', 'Importance_rf', 'Iteration_n'])
importance = []

for i in range(num_iter):

    gb = GradientBoostingRegressor(n_estimators=30, max_features='sqrt', min_samples_leaf = 2, min_samples_split = 10, max_depth=17, learning_rate=0.5, criterion='mse')
    gb.fit(X_train, y_train)
    importance = pd.DataFrame(list(zip(data_train_t_imputed.columns[0:X_train.shape[1]], gb.feature_importances_)))
    importance.rename(columns = {importance.columns[0] : 'Feature', importance.columns[1] : 'Importance_gb'}, inplace = True)
    importance['Iteration_n'] = i
    gb_importance = gb_importance.append(importance)

Wall time: 8.45 s


In [136]:
gb_importance_agg = gb_importance.groupby(['Feature'], as_index = False).mean()
gb_importance_agg.sort_values('Importance_gb', inplace=True, ascending=False)
gb_importance_agg.head(30)

Unnamed: 0,Feature,Importance_gb
6,weight_norm,0.360774
5,n_days_since_added,0.132823
2,category_2,0.130579
3,category_4,0.115747
4,manufacturerNumber,0.113476
0,category_0,0.085618
1,category_1,0.060983


In [137]:
y_pred = gb.predict(X_test)
y_pred

array([ 70.32463906, 323.00710211, 438.40211451, ..., 501.37650713,
       197.06359003,  83.25490099])

In [138]:
#As we can see, the Mean Absolute Error of Gradient Boosting approach is ~97. Also slightly better than baseline and random forest models

In [139]:
from sklearn import metrics
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_pred))
#print('Root Mean Squared Error (RMSE):', metrics.mean_squared_error(y_test, y_pred, squared=False))
print('Explained Variance Score:', metrics.explained_variance_score(y_test, y_pred))
print('Max Error:', metrics.max_error(y_test, y_pred))
#print('Mean Squared Log Error:', metrics.mean_squared_log_error(y_test, y_pred))
print('Median Absolute Error:', metrics.median_absolute_error(y_test, y_pred))
print('R^2:', metrics.r2_score(y_test, y_pred))
#print('Mean Poisson Deviance:', metrics.mean_poisson_deviance(y_test, y_pred))

Mean Absolute Error (MAE): 91.78964817939608
Mean Squared Error (MSE): 100231.05777436102
Explained Variance Score: 0.8584029212944294
Max Error: 8798.140304045164
Median Absolute Error: 23.3722172771448
R^2: 0.8582443938204145


In [140]:
# Let us compute the combined importance of all features

In [141]:
Combined_importance = pd.merge(rf_importance_agg,gb_importance_agg, how = 'left', on='Feature')
Combined_importance['Avg_Importance'] = (Combined_importance['Importance_rf'] + Combined_importance['Importance_gb'])/2
Combined_importance.sort_values('Avg_Importance', inplace=True, ascending=False)

In [142]:
Combined_importance.head(15)

Unnamed: 0,Feature,Importance_rf,Importance_gb,Avg_Importance
0,weight_norm,0.370945,0.360774,0.36586
1,category_2,0.136776,0.130579,0.133677
2,n_days_since_added,0.118057,0.132823,0.12544
3,manufacturerNumber,0.111414,0.113476,0.112445
4,category_4,0.094944,0.115747,0.105345
5,category_0,0.091094,0.085618,0.088356
6,category_1,0.07677,0.060983,0.068876


# Demo

In [155]:
testing_set_sample = testing_set.sample(1)
testing_set_sample

Unnamed: 0,weight_norm,n_days_since_added,category_2,category_4,manufacturerNumber,category_1,category_0,price_USD
7970,55.0,2096,157,307,838,284,154,499.95


In [156]:
data_train.iloc[[7970]]

Unnamed: 0,weight_norm,n_days_since_added,category_2,category_4,manufacturerNumber,category_1,category_0,price_USD
7970,55.0,2096,Frys,Speakers,RTIA7 CHERR,Speaker Systems,Stereos,499.95


In [157]:
weight_norm_value = testing_set_sample.iloc[0]['weight_norm']
n_days_since_added_value = testing_set_sample.iloc[0]['n_days_since_added']
category_2_value = testing_set_sample.iloc[0]['category_2']
category_4_value = testing_set_sample.iloc[0]['category_4']
manufacturerNumber_value = testing_set_sample.iloc[0]['manufacturerNumber']
category_1_value = testing_set_sample.iloc[0]['category_1']
category_0_value = testing_set_sample.iloc[0]['category_0']

In [158]:
instance = np.array([[weight_norm_value, n_days_since_added_value,category_2_value,category_4_value, manufacturerNumber_value, category_1_value, category_0_value]])

In [159]:
instance

array([[  55., 2096.,  157.,  307.,  838.,  284.,  154.]])

In [160]:
rf.predict(instance)

array([317.2480433])