In [1]:
#########imports

from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display

from sklearn import metrics

import sys

import feather

########

In [2]:
############################# path to data files

# Directory that holds the input CSV, relative to the notebook's working directory.
# The trailing slash is required: the read-data cell builds the file name by plain
# string concatenation (f'{PATH}Train.csv').
PATH = "../data/bulldozers/"
# Shell escape: list the directory as a quick check that the data file is present.
!ls {PATH}

#####

Train.csv


In [3]:
############################ read data

# Load the training CSV into df_raw, the frame every later cell starts from.
# low_memory=False: parse the file in one pass so column dtypes are inferred once
# for the whole column rather than per chunk.
# parse_dates: 'saledate' is converted to datetimes at load time.
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory = False, parse_dates = ["saledate"])

######

In [4]:
########################### FUNCTIONS ##############################


####### display_all function

def display_all(df):
    """Display `df` with the pandas row and column limits raised to 1000.

    The limits are set through a pandas option context, so they apply only
    while this call is rendering `df`.
    """
    wide_view = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with wide_view:
        display(df)

        
        
####### split_vals function

def split_vals(a, n):
    """Cut `a` at position `n` and return copies of the two pieces.

    Returns a tuple ``(head, tail)`` where ``head`` is ``a[:n].copy()`` and
    ``tail`` is ``a[n:].copy()``. `a` must support slicing, and its slices
    must provide a ``.copy()`` method.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail



##### rmse function

def rmse(x, y):
    """Root-mean-squared difference between `x` and `y`.

    `x - y` must be defined element-wise and the squared result must offer a
    ``.mean()`` method (NumPy arrays satisfy both). Returns a Python float.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())



##### print_score function

def print_score(m):
    """Print training and validation metrics for the fitted model `m`.

    Prints a single list: [train RMSE, validation RMSE, train score,
    validation score], followed by ``m.oob_score_`` when the model has that
    attribute. "score" is whatever ``m.score`` returns (R^2 for scikit-learn
    regressors).

    Reads the notebook-level X_train, y_train, X_valid and y_valid, so it
    always reports against whatever those names currently hold.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    

##### dectree max depth function

def dectree_max_depth(tree):
    """Return the number of nodes on the longest root-to-leaf path of `tree`.

    `tree` must expose `children_left` and `children_right`, both indexable by
    node id, with the root at id 0. A node whose left and right child ids are
    equal is treated as a leaf, so a tree consisting of a single leaf yields 1.
    """
    left = tree.children_left
    right = tree.children_right

    deepest = 0
    # Explicit stack of (node id, number of nodes on the path from the root to it).
    pending = [(0, 1)]
    while pending:
        node_id, level = pending.pop()
        if left[node_id] == right[node_id]:
            # Leaf: this path ends here, keep the longest one seen so far.
            if level > deepest:
                deepest = level
            continue
        pending.append((left[node_id], level + 1))
        pending.append((right[node_id], level + 1))
    return deepest
    
#####

In [2]:
# display_all(df_raw.tail())

In [3]:
# display_all(df_raw.describe(include='all'))

In [5]:
######### data pre-processing

# Replace the target with its natural log, in place on df_raw.
# NOTE: this overwrites the column, so running the cell a second time takes the
# log of the already-logged values.
df_raw.SalePrice = np.log(df_raw.SalePrice)

# NOTE(review): fastai.structured helper; presumably converts string columns of
# df_raw to pandas categoricals in place -- confirm against the installed fastai.
train_cats(df_raw)

# NOTE(review): fastai.structured helper; presumably expands 'saledate' into
# date-part columns in place -- confirm whether the original column is dropped,
# which would make this cell fail on a second run.
add_datepart(df_raw, 'saledate')

# Separate the 'SalePrice' target from the features. Three values come back:
# the feature frame (df), the target (y) and nas, which the subset cell passes
# back to proc_df as na_dict.
df, y, nas = proc_df(df_raw, 'SalePrice')

#########

In [6]:
####### making validation set

# Hold out the last n_valid rows; everything before them is the training set.
# The split is positional (first n_trn rows vs. the rest), not random.
n_valid = 12000
n_trn = len(df) - n_valid
# Same cut applied to the unprocessed frame so raw rows line up with X_*/y_*.
raw_train, raw_valid = split_vals(df_raw, n_trn)

# X_train, y_train, X_valid and y_valid are the globals read by print_score.
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)


# X_train.shape, y_train.shape, X_valid.shape

# Last expression of the cell: shown as the cell output to sanity-check the split sizes.
raw_train.shape, raw_valid.shape, X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

#######

((389125, 65), (12000, 65), (389125, 66), (12000, 66), (389125,), (12000,))

In [24]:
########### model 5 : sub-sampling

# NOTE(review): fastai.structured helper; presumably changes how many rows each
# tree is trained on (20000 here) for every forest built afterwards, until
# reset_rf_samples() is called -- confirm against the installed fastai.
set_rf_samples(20000)

# 40 trees, all CPU cores (n_jobs=-1); oob_score=True makes the fitted model
# expose oob_score_, which print_score appends to its output.
m = RandomForestRegressor(n_estimators=40, n_jobs = -1, oob_score=True)
# %time reports the wall-clock cost of the fit.
%time m.fit(X_train, y_train)
print_score(m)

###########

Wall time: 24.3 s
[0.2276955823348228, 0.26101183230413705, 0.8916462277733792, 0.8783342423873864, 0.8800936692650698]


In [26]:
# NOTE(review): presumably undoes the earlier set_rf_samples(20000) call so later
# forests train on the full data again -- confirm against the installed fastai.
reset_rf_samples()

In [19]:
########### comparing predictions with y values

# One prediction vector per tree of the current forest m, stacked along a new
# first axis: row i holds tree i's predictions for the validation rows.
preds = np.stack([t.predict(X_valid) for t in m.estimators_])
# For the first validation row: every tree's prediction, their mean, and the
# actual target value.
preds[:,0], np.mean(preds[:,0]), y_valid[0]

###########

(array([9.04782, 9.9988 , 9.21034, 9.30565, 9.15905, 9.39266, 9.21034, 9.21034, 9.02401, 9.10498]),
 9.266399049898974,
 9.104979856318357)

In [9]:
######## model 5 : tree-building parameters (min_samples_leaf, max_features)

# max_features=0.5: each split considers half of the features.
# min_samples_leaf=3: a leaf must keep at least three training samples.
# oob_score=True: the fitted model exposes oob_score_, which print_score appends.
m = RandomForestRegressor(max_features=0.5, n_estimators=40, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

########

[0.119074062780624, 0.2280148228300204, 0.970367440172192, 0.9071516617837563, 0.9119004338566976]


In [7]:
############ model 4 : out-of-bag score with bagging (more than one tree)

# 40 bagged trees; oob_score=True makes the fitted model expose oob_score_,
# which print_score appends as the last number of its output.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

############

[0.07846918770708818, 0.23518547981512622, 0.9871313455318379, 0.9012200085161102, 0.9083402293474887]


In [9]:
####### single tree

# A "forest" of exactly one tree, limited to depth 3. bootstrap=False turns off
# row resampling, so the tree is fitted on the training rows as they are.
m =RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

#######

[0.5219385070968967, 0.5813672044382565, 0.4034366443768935, 0.39640027646932957]


In [15]:
###### model 3

# Default forest (only n_jobs set) fitted on the training split and evaluated
# on both splits through print_score.
m=RandomForestRegressor(n_jobs = -1)
# %time reports the wall-clock cost of the fit.
%time m.fit(X_train, y_train)
print_score(m)

######



Wall time: 36.5 s
[0.09034990616930431, 0.2474196882233459, 0.9829395596027305, 0.8906757555458812]


In [8]:
####### model 2

# Default forest fitted on the complete processed frame (no hold-out).
m = RandomForestRegressor(n_jobs = -1)
m.fit(df, y)
# Scored on the very rows it was fitted on, so this measures fit to the
# training data only, not performance on unseen rows.
m.score(df, y)

#######



0.9831096295517464

In [10]:
####### model 1

# Baseline attempt: fit directly on df_raw with only the target column removed.
# NOTE(review): df_raw is not passed through proc_df here, so any non-numeric
# columns reach the estimator as-is -- confirm that is the intent of this cell.
m = RandomForestRegressor(n_jobs = -1)
m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)
# score takes (X, y[, sample_weight]): pass the same feature frame used for fit
# as X and the target as y, not df_raw itself as a leading extra argument.
m.score(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)

#######

In [8]:
####### speed things up (after model 3)

# Re-run proc_df on a 30000-row subset, passing the nas obtained in the
# pre-processing cell as na_dict; nas is rebound to the value returned here.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=30000, na_dict=nas)
# Keep the first 20000 subset rows as the new training set; the remainder is discarded.
# Only X_train / y_train are replaced: X_valid and y_valid still hold whatever
# the validation-split cell assigned.
X_train, _= split_vals(df_trn, 20000)
y_train, _= split_vals(y_trn, 20000)

#######

In [37]:
#### save temp data

# Create ./tmp if needed (no error when it already exists) and write df_raw,
# in whatever state it is in when this cell runs, to a feather file.
os.makedirs('tmp', exist_ok = True)
df_raw.to_feather('tmp/bulldozer-raw')

####

In [41]:
#### read saved temp data

# Rebind df_raw to the frame stored in the feather file written by the save
# cell; the file must already exist.
df_raw = feather.read_dataframe('tmp/bulldozer-raw')

####

In [11]:
# draw_tree(m.estimators_[0], df_trn, precision=3)

In [12]:
# IPython introspection: shows the source of proc_df. Interactive aid only;
# nothing later in the notebook depends on it.
??proc_df

In [39]:
# Record the pandas version this notebook was run with.
pd.__version__

'0.23.4'