In [42]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression


In [2]:
#IMPORTING DF
df_pricing = pd.read_csv("Resources/retail_price_cleaned.csv")
df_pricing

Unnamed: 0,product_id,product_category_name,month_year,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,...,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
0,bed1,bed_bath_table,01-05-2017,1,45.95,15.100000,45.950000,4.0,57,5,...,89.9,3.9,15.011897,215.000000,4.4,8.760000,45.95,4.0,15.100000,45.900000
1,bed1,bed_bath_table,01-06-2017,3,137.85,12.933333,45.950000,4.0,61,6,...,89.9,3.9,14.769216,209.000000,4.4,21.322000,45.95,4.0,12.933333,45.950000
2,bed1,bed_bath_table,01-07-2017,6,275.70,14.840000,45.950000,4.0,123,7,...,89.9,3.9,13.993833,205.000000,4.4,22.195932,45.95,4.0,14.840000,45.950000
3,bed1,bed_bath_table,01-08-2017,4,183.80,14.287500,45.950000,4.0,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.287500,45.950000
4,bed1,bed_bath_table,01-09-2017,2,91.90,15.100000,45.950000,4.0,54,9,...,89.9,3.9,18.776522,163.398710,4.4,24.324687,45.95,4.0,15.100000,45.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,bed5,bed_bath_table,01-05-2017,1,215.00,8.760000,215.000000,4.4,57,5,...,89.9,3.9,15.011897,215.000000,4.4,8.760000,45.95,4.0,15.100000,214.950000
672,bed5,bed_bath_table,01-06-2017,10,2090.00,21.322000,209.000000,4.4,61,6,...,89.9,3.9,14.769216,209.000000,4.4,21.322000,45.95,4.0,12.933333,215.000000
673,bed5,bed_bath_table,01-07-2017,59,12095.00,22.195932,205.000000,4.4,123,7,...,89.9,3.9,13.993833,205.000000,4.4,22.195932,45.95,4.0,14.840000,209.000000
674,bed5,bed_bath_table,01-08-2017,52,10375.00,19.412885,199.509804,4.4,90,8,...,89.9,3.9,14.656757,199.509804,4.4,19.412885,45.95,4.0,14.287500,205.000000


In [3]:
df_pricing.dtypes

product_id                object
product_category_name     object
month_year                object
qty_sold                   int64
total_price              float64
freight_price            float64
unit_price               float64
product_rating           float64
no_customers               int64
month                      int64
year                       int64
seasonality              float64
volume                     int64
comp1_price              float64
comp1_prod_rating        float64
comp1_freight_price      float64
comp2_price              float64
comp2_prod_rating        float64
comp2_freight_price      float64
comp3_price              float64
comp3_prod_rating        float64
comp3_freight_price      float64
lag_price                float64
dtype: object

In [4]:
# CONVERTING DATE TO DATE TIME
df_pricing['month_year'] = pd.to_datetime(df_pricing['month_year'], format='%d-%m-%Y')
df_pricing.dtypes

product_id                       object
product_category_name            object
month_year               datetime64[ns]
qty_sold                          int64
total_price                     float64
freight_price                   float64
unit_price                      float64
product_rating                  float64
no_customers                      int64
month                             int64
year                              int64
seasonality                     float64
volume                            int64
comp1_price                     float64
comp1_prod_rating               float64
comp1_freight_price             float64
comp2_price                     float64
comp2_prod_rating               float64
comp2_freight_price             float64
comp3_price                     float64
comp3_prod_rating               float64
comp3_freight_price             float64
lag_price                       float64
dtype: object

In [46]:
# ATTEMPTED GROUPING BY DATE AND PRODUCT ID, 
df_grouped_pricing_date_id = df_pricing.groupby(['month_year','product_id']).mean()
df_grouped_pricing_date_id.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,month,year,seasonality,volume,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
month_year,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,health5,8,2799.2,22.90125,349.9,4.3,9,1,2017,12.771739,11400,349.9,4.3,22.90125,349.9,4.3,22.90125,64.99,3.9,11.06,349.85
2017-01-01,health7,1,64.99,11.06,64.99,3.9,9,1,2017,5.642857,3960,64.99,3.9,11.06,64.99,3.9,11.06,64.99,3.9,11.06,64.94
2017-02-01,bed2,2,179.8,13.02,89.9,3.9,2,2,2017,8.872302,20000,89.9,3.9,13.02,89.9,3.9,13.02,89.9,3.9,13.02,89.85
2017-02-01,computers4,3,479.97,33.4,159.99,4.2,3,2,2017,7.889031,8000,159.99,4.2,33.4,159.99,4.2,33.4,159.99,4.2,33.4,159.94
2017-02-01,cool1,7,599.93,21.971429,85.704286,4.2,7,2,2017,8.708709,15750,85.704286,4.2,21.971429,85.704286,4.2,21.971429,85.704286,4.2,21.971429,85.654286
2017-02-01,garden1,1,99.99,28.89,99.99,4.3,3,2,2017,7.510204,8000,99.99,4.3,28.89,99.99,4.3,28.89,99.99,4.3,28.89,99.94
2017-02-01,garden3,1,99.99,33.0,99.99,4.1,3,2,2017,11.586851,8000,99.99,4.1,33.0,99.99,4.1,33.0,99.99,4.1,33.0,99.94
2017-02-01,garden8,1,179.99,33.54,179.99,4.2,3,2,2017,8.942308,8000,179.99,4.2,33.54,179.99,4.2,33.54,179.99,4.2,33.54,179.94
2017-02-01,health5,4,1399.6,26.9275,349.9,4.3,19,2,2017,11.277174,11400,19.99,4.3,11.750909,349.9,4.3,26.9275,64.99,3.9,15.348,349.9
2017-02-01,health7,5,324.95,15.348,64.99,3.9,19,2,2017,13.928571,3960,19.99,4.3,11.750909,64.99,3.9,15.348,64.99,3.9,15.348,64.99


In [5]:
# ATTEMPTED GROUPING BY DATE AND PRODUCT ID, 
# things didnt look good so attempting to ignore month and year columns
df_grouped_pricing = df_pricing.groupby(['product_id']).mean()
df_dropped = df_grouped_pricing.drop(columns=['month','year','seasonality'])
df_dropped.head(20)
# big problems here

Unnamed: 0_level_0,qty_sold,total_price,freight_price,unit_price,product_rating,no_customers,volume,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
bed1,8.9375,364.485625,15.025673,42.211989,4.0,57.1875,3800.0,86.188512,3.9,17.655089,89.846896,4.125,16.51034,42.211989,4.0,15.025673,42.628239
bed2,25.631579,2259.929474,17.063376,86.774536,3.9,50.947368,20000.0,86.774536,3.9,17.063376,115.322353,4.031579,18.053233,49.741675,3.984211,14.849131,87.608747
bed3,10.090909,936.814545,15.665095,92.101364,3.3,48.181818,20944.0,84.501471,3.9,18.6612,92.101364,3.3,15.665095,40.512893,4.0,15.286357,93.378636
bed4,13.1,601.177,15.15763,46.725444,4.2,51.5,8000.0,85.551618,3.9,18.801749,46.725444,4.2,15.15763,40.640182,4.0,15.181618,46.520444
bed5,30.8,5999.472,19.203101,198.381703,4.4,77.0,12600.0,89.9,3.9,15.441645,198.381703,4.4,19.203101,45.95,4.0,14.452167,208.691961
computers1,9.266667,820.454,15.037399,92.482149,4.0,59.333333,2808.0,137.619907,4.186667,40.380132,124.063636,4.14,20.782643,93.729691,3.7,16.003633,92.075482
computers2,18.3,1543.925,16.634154,87.234536,3.5,86.4,2210.0,133.602844,4.2,41.487362,141.945455,4.14,25.347439,87.234536,3.5,16.634154,90.429536
computers3,10.3,1511.182,28.077465,143.472964,4.2,52.9,7650.0,153.591866,4.2,39.201789,146.901419,4.22,27.723461,129.211442,3.92,22.010673,147.976679
computers4,18.944444,2606.472778,40.611177,141.576034,4.2,53.777778,8000.0,141.576034,4.2,40.611177,150.665262,4.244444,33.884445,115.815863,3.811111,26.803839,144.903812
computers5,15.875,1405.12,14.471821,100.243737,3.5,95.375,2926.0,137.71804,4.2,42.246796,148.344318,4.1,24.670268,96.46817,3.5,17.238999,105.587487


In [6]:
# FEATURE SET
copy = df_dropped.copy()
X = copy.drop(columns=["unit_price"])
X.head(10)

Unnamed: 0_level_0,qty_sold,total_price,freight_price,product_rating,no_customers,volume,comp1_price,comp1_prod_rating,comp1_freight_price,comp2_price,comp2_prod_rating,comp2_freight_price,comp3_price,comp3_prod_rating,comp3_freight_price,lag_price
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
bed1,8.9375,364.485625,15.025673,4.0,57.1875,3800.0,86.188512,3.9,17.655089,89.846896,4.125,16.51034,42.211989,4.0,15.025673,42.628239
bed2,25.631579,2259.929474,17.063376,3.9,50.947368,20000.0,86.774536,3.9,17.063376,115.322353,4.031579,18.053233,49.741675,3.984211,14.849131,87.608747
bed3,10.090909,936.814545,15.665095,3.3,48.181818,20944.0,84.501471,3.9,18.6612,92.101364,3.3,15.665095,40.512893,4.0,15.286357,93.378636
bed4,13.1,601.177,15.15763,4.2,51.5,8000.0,85.551618,3.9,18.801749,46.725444,4.2,15.15763,40.640182,4.0,15.181618,46.520444
bed5,30.8,5999.472,19.203101,4.4,77.0,12600.0,89.9,3.9,15.441645,198.381703,4.4,19.203101,45.95,4.0,14.452167,208.691961
computers1,9.266667,820.454,15.037399,4.0,59.333333,2808.0,137.619907,4.186667,40.380132,124.063636,4.14,20.782643,93.729691,3.7,16.003633,92.075482
computers2,18.3,1543.925,16.634154,3.5,86.4,2210.0,133.602844,4.2,41.487362,141.945455,4.14,25.347439,87.234536,3.5,16.634154,90.429536
computers3,10.3,1511.182,28.077465,4.2,52.9,7650.0,153.591866,4.2,39.201789,146.901419,4.22,27.723461,129.211442,3.92,22.010673,147.976679
computers4,18.944444,2606.472778,40.611177,4.2,53.777778,8000.0,141.576034,4.2,40.611177,150.665262,4.244444,33.884445,115.815863,3.811111,26.803839,144.903812
computers5,15.875,1405.12,14.471821,3.5,95.375,2926.0,137.71804,4.2,42.246796,148.344318,4.1,24.670268,96.46817,3.5,17.238999,105.587487


In [7]:
# Y TARGET SET AND MAKE INTO ARRAY
y = df_dropped["unit_price"].ravel()
y

array([ 42.21198864,  86.774536  ,  92.10136364,  46.72544444,
       198.38170272,  92.48214865,  87.23453604, 143.47296429,
       141.57603374, 100.2437367 , 149.95681819,  28.24166667,
        25.58428571,  98.36250794, 128.21596154,  42.        ,
       158.78872802,  98.6565    ,  36.74923077,  77.50989695,
        36.70557423,  96.15690476, 105.76447811,  55.03017532,
        54.83431373, 105.37457407,  54.28219656,  94.56820728,
        54.89295358,  54.9625    , 172.92486656,  55.26127876,
        84.99      ,  50.54148423, 326.99175824, 117.30730769,
        29.9       , 350.74431818,  64.99      ,  60.49380952,
        86.84773109,  22.62374486,  54.87461538, 123.82300977,
       180.62742725, 144.78531804,  77.98809524, 106.26428571,
       213.89715959, 132.53046094, 305.69029584, 184.50488011])

In [8]:
# TRAIN AND TEST DATASETS
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [10]:
# SCALER INITIALIZATION and FITTING
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

In [12]:
# SCALING X_TRAIN AND X_TEST
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## RF Model

In [1]:
# RF REGRESSOR MODEL CREATION
rf_model = RandomForestRegressor(n_estimators=50, random_state=78)

NameError: name 'RandomForestRegressor' is not defined

In [33]:
# FIT AND PREDICT RF MODEL
rf_model = rf_model.fit(X_train_scaled, y_train)
y_predictions = rf_model.predict(X_test_scaled)

In [34]:
# CONFUSION MATRIX FOR RF REGRESSOR
rf_score = rf_model.score(X_test, y_test)
rf_score

  f"X has feature names, but {self.__class__.__name__} was fitted without"


-1.051017890161277

## Linear Regression Model

In [37]:
# LINEAR REGRESSOR MODEL CREATION
lr_model = LinearRegression()

In [41]:
# FIT AND PREDICT LR MODEL
lr_model.fit(X_train_scaled, y_train)
lr_y_predictions = lr_model.predict(X_test_scaled)

In [44]:
lr_score = lr_model.score(X_test, y_test, sample_weight=None)
lr_score

  f"X has feature names, but {self.__class__.__name__} was fitted without"


-39315.83335564545

In [45]:
r2_lr = r2_score(y_test, lr_y_predictions)
r2_lr

0.9961197267200439