In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Importing the dataset 
data = pd.read_csv("C:\\Users\\Siddharth\\Desktop\\Projects\\Projects\\Price_prediction\\data.csv", encoding = "ISO-8859-1")
data = data.drop(['item_description'], axis = 1)
data = data.drop(['name'], axis = 1)
#data['shipping'] = data['shipping'].astype(object)
data.head(5)
#data.dtypes

Unnamed: 0,item_condition_id,category_name,brand_name,shipping,sentiment_score,price
0,3,Men/Tops/T-shirts,,1,0.0,10.0
1,3,Electronics/Computers & Tablets/Components & P...,Razer,0,0.202202,52.0
2,1,Women/Tops & Blouses/Blouse,Target,1,0.311313,10.0
3,1,Home/Home Décor/Home Décor Accents,,1,0.303195,35.0
4,1,Women/Jewelry/Necklaces,,0,0.31305,44.0


In [4]:
#Filling 'NaN' by 'None'
data = data.fillna('None')

#Mean encoding of columns with 'Object' dtype
data_enc = data
for col in data.columns:
    means = data_enc.groupby(col).price.mean()
    if (data_enc[col].dtype == 'object'):
        data_enc[col] = data_enc[col].map(means)
data_enc.head(5)

Unnamed: 0,item_condition_id,category_name,brand_name,shipping,sentiment_score,price
0,3,18.359914,21.100819,1,0.0,10.0
1,3,40.491876,43.032787,0,0.202202,52.0
2,1,15.744451,15.083717,1,0.311313,10.0
3,1,22.121014,21.100819,1,0.303195,35.0
4,1,25.473234,21.100819,0,0.31305,44.0


In [5]:
# Labels are the values we want to predict
labels = np.array(data_enc['price'])

# Remove the labels from the features
# axis 1 refers to the columns
features = data_enc.drop('price', axis = 1)

# Saving feature names for later use
feature_list = list(data_enc.columns)

# Convert to numpy array
features = np.array(features)

In [6]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 42)

#print('Training Features Shape:', train_features.shape)
#print('Training Labels Shape:', train_labels.shape)
#print('Testing Features Shape:', test_features.shape)
#print('Testing Labels Shape:', test_labels.shape)

In [7]:
# Build the model with 100 decision trees
RF = RandomForestRegressor(n_estimators = 5, random_state = 41)

# Train the model on training data
RF.fit(train_features, train_labels)

#from pprint import pprint
#pprint(RF.get_params())

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
           oob_score=False, random_state=41, verbose=0, warm_start=False)

In [8]:
# Use the forest's predict method on the test data
predictions = RF.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

from sklearn.model_selection import cross_val_score
mean_abs_errors = cross_val_score(RF, features, labels, cv=5, scoring = 'neg_mean_absolute_error')
mean_abs_errors

Mean Absolute Error: 15.04


array([-15.01104035, -14.9588541 , -15.01344247, -15.06762526,
       -14.96503371])

In [9]:
# Get numerical feature importances
importances = list(RF.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: sentiment_score      Importance: 0.46
Variable: brand_name           Importance: 0.27
Variable: category_name        Importance: 0.2
Variable: item_condition_id    Importance: 0.05
Variable: shipping             Importance: 0.03


[None, None, None, None, None]

In [10]:
#Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 1000, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 10, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [50, 287, 525, 762, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 6, 7, 8, 10, None], 'min_samples_split': [2, 10], 'min_samples_leaf': [2, 10], 'bootstrap': [True, False]}


In [11]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 28.0min remaining: 24.5min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 50.0min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=5, n_jobs=-1,
          param_distributions={'n_estimators': [50, 287, 525, 762, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 6, 7, 8, 10, None], 'min_samples_split': [2, 10], 'min_samples_leaf': [2, 10], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring=None, verbose=2)

In [12]:
rf_random.best_params_

{'bootstrap': False,
 'max_depth': 10,
 'max_features': 'auto',
 'min_samples_leaf': 10,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5,10,15],
    'max_features': ['sqrt'],
    'min_samples_leaf': [5,10,15],
    'min_samples_split': [2,3,4],
    'n_estimators': [400, 500, 600]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(train_features, train_labels)
grid_search.best_params_

best_grid = grid_search.best_estimator_

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 24.0min


In [None]:
best_grid