In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv('MobileDataSet.csv')

In [3]:
df.head()

Unnamed: 0,brand,model,base_color,processor,screen_size,ROM,RAM,display_size,num_rear_camera,num_front_camera,battery_capacity,ratings,num_of_ratings,sales_price,discount_percent,sales
0,Apple,iPhone SE,Black,Water,Very Small,64,2,4.7,1,1,1800,4.5,38645,32999,0.17,127.52
1,Apple,iPhone 12 Mini,Red,Ceramic,Small,64,4,5.4,2,1,2815,4.5,244,57149,0.04,1.39
2,Apple,iPhone SE,Red,Water,Very Small,64,2,4.7,1,1,1800,4.5,38645,32999,0.17,127.52
3,Apple,iPhone XR,Others,iOS,Medium,64,3,6.1,1,1,2942,4.6,5366,42999,0.1,23.07
4,Apple,iPhone 12,Red,Ceramic,Medium,128,4,6.1,2,1,2815,4.6,745,69149,0.02,5.15


In [4]:
df = df[['brand', 'base_color', 'processor', 'screen_size', 'ROM', 'RAM', 'display_size', 'num_rear_camera', 'num_front_camera', 'battery_capacity', 'sales_price', 'sales']]
encoder = LabelEncoder()
df['brand'] = encoder.fit_transform(df['brand'])
df['base_color'] = encoder.fit_transform(df['base_color'])
df['processor'] = encoder.fit_transform(df['processor'])
df['screen_size'] = encoder.fit_transform(df['screen_size'])

In [5]:
X = df.drop(['sales_price'], axis = 1)
y = df['sales_price']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2, 3, 4]}
grid_search_cv = GridSearchCV(DecisionTreeRegressor(random_state=42), params, verbose=1, cv=3)

grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


In [8]:
grid_search_cv.best_estimator_

In [9]:
y_pred = grid_search_cv.predict(X_test)
mean_squared_error(y_test, y_pred)

14976904.057325311

In [10]:
n_trees = 1000
n_instances = 100

mini_sets = []

rs = ShuffleSplit(n_splits=n_trees, test_size=len(X_train) - n_instances, random_state=42)

In [11]:
rs

ShuffleSplit(n_splits=1000, random_state=42, test_size=244, train_size=None)

In [12]:
X_train = X_train.to_numpy()

In [13]:
y_train = y_train.to_numpy()

In [14]:
for mini_train_index, mini_test_index in rs.split(X_train):
    X_mini_train = X_train[mini_train_index]
    y_mini_train = y_train[mini_train_index]
    mini_sets.append((X_mini_train, y_mini_train))

In [15]:
mini_sets

[(array([[4.0000e+00, 0.0000e+00, 4.0000e+00, ..., 1.0000e+00, 4.0000e+03,
          1.7375e+02],
         [0.0000e+00, 5.0000e+00, 0.0000e+00, ..., 1.0000e+00, 2.8150e+03,
          5.9000e+00],
         [3.0000e+00, 6.0000e+00, 1.0000e+00, ..., 1.0000e+00, 4.5000e+03,
          6.3800e+00],
         ...,
         [2.0000e+00, 1.0000e+00, 4.0000e+00, ..., 1.0000e+00, 4.5000e+03,
          1.8380e+01],
         [0.0000e+00, 0.0000e+00, 6.0000e+00, ..., 1.0000e+00, 1.8000e+03,
          1.0994e+02],
         [3.0000e+00, 8.0000e+00, 4.0000e+00, ..., 1.0000e+00, 4.0000e+03,
          9.2200e+00]]),
  array([ 13400,  79149,  39999,  11999,   9499,  11999,  12400,  47999,
          13695,   8083,  16999,  18499,  29999,  11490,  14999,  21999,
           8394,  16999,  37499,  10999,   8999,  36499,  12349,  12999,
          10999,  15999,  11499,  14999,  42999,  69149,   8999,  11499,
          54599,  10499,  12390,   8565,  16999,  27999,  27999,  13399,
          16999,  14450,   9569

In [16]:
from sklearn.base import clone

forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

mse_scores = []

for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)

    y_pred = tree.predict(X_test)
    mse_scores.append(mean_squared_error(y_test, y_pred))

np.mean(mse_scores)



150761128.9676115

In [17]:
Y_pred = np.empty([n_trees, len(X_test)], dtype=np.uint8)

for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)



In [18]:
y_pred_majority_votes = np.mean(Y_pred, axis=0)

In [19]:
y_pred_majority_votes

array([142.252,  68.927, 114.061,  94.67 , 115.739, 137.939, 143.713,
       108.345, 102.176, 112.881, 138.508,  90.823, 111.228, 130.83 ,
       187.237,  83.256, 135.83 , 130.189,  58.389, 146.351, 146.096,
       100.647, 176.419,  70.415, 120.203, 131.135, 135.854, 141.426,
       100.914, 125.423, 150.496, 120.823, 144.549, 111.789, 116.591,
       127.758, 105.963,  99.844, 104.037, 136.955,  92.285, 141.111,
       140.805, 117.688, 232.249, 109.911, 128.124, 118.287, 141.856,
       139.444, 152.587, 120.752, 112.728, 192.99 ,  74.646, 110.934,
       115.856,  89.475, 115.334, 141.557, 119.441, 120.822, 117.198,
       132.425,  89.456, 151.437, 131.613, 111.304, 131.44 , 188.771,
       141.846, 124.405, 146.654, 144.865, 154.428, 106.369,  73.023,
       150.976, 107.823, 128.83 , 110.948, 138.565, 136.811,  93.641,
       153.836, 124.193])

In [20]:
mean_squared_error(y_test, y_pred_majority_votes.reshape([-1]))

1046131186.8235108