In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Importing the dataset
data = pd.read_csv("C:\\Users\\Siddharth\\Desktop\\Projects\\Price_prediction\\train.tsv", sep = '\t')
data = data.drop(['train_id'], axis = 1)
#data.head(10)

In [3]:
#Filling 'NaN' by 'None'
data = data.fillna('None')

#Mean encoding of columns with 'Object' dtype
data_enc = data
for col in data.columns:
    means = data_enc.groupby(col).price.mean()
    if (data_enc[col].dtype == 'object'):
        data_enc[col] = data_enc[col].map(means)
#data_enc.head(5)

In [4]:
# Labels are the values we want to predict
labels = np.array(data_enc['price'])

# Remove the labels from the features
# axis 1 refers to the columns
features = data_enc.drop('price', axis = 1)

# Saving feature names for later use
feature_list = list(data_enc.columns)

# Convert to numpy array
features = np.array(features)

In [5]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.2, random_state = 44)

#print('Training Features Shape:', train_features.shape)
#print('Training Labels Shape:', train_labels.shape)
#print('Testing Features Shape:', test_features.shape)
#print('Testing Labels Shape:', test_labels.shape)

In [6]:
# Build the model with 100 decision trees
RF = RandomForestRegressor(n_estimators = 100, random_state = 44)

# Train the model on training data
RF.fit(train_features, train_labels)

from pprint import pprint
pprint(RF.get_params())

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 44,
 'verbose': 0,
 'warm_start': False}


In [8]:
# Use the forest's predict method on the test data
predictions = RF.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

Mean Absolute Error: 0.45


In [9]:
# Get numerical feature importances
importances = list(RF.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]

Variable: shipping             Importance: 0.77
Variable: name                 Importance: 0.22
Variable: item_condition_id    Importance: 0.0
Variable: category_name        Importance: 0.0
Variable: brand_name           Importance: 0.0
Variable: price                Importance: 0.0


[None, None, None, None, None, None]

In [10]:
#Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
rf_random.best_params_