##### Creation Date:
##### February 19 2022
##### Created By Alperen KOLAMUC

In [1]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics

In [2]:
import re
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [3]:
import pandas as pd
import numpy as np
import math

##### Prepare


In [4]:
# Load the working dataset from the feather file at the relative path "tmp/bulldozers_1".
# NOTE(review): presumably produced by an earlier preprocessing notebook — confirm.
df = pd.read_feather("tmp/bulldozers_1")

In [5]:
def numericalize(df, col, name):
    """Overwrite a non-numeric column of ``df`` with its integer category codes.

    Args:
        df: DataFrame that is modified in place.
        col: Data of the column to encode. When it is not numeric it must
            support the ``.cat`` accessor (i.e. be of categorical dtype).
        name: Label of the column in ``df`` that receives the codes.

    Returns:
        None. Numeric columns are left untouched.
    """
    if is_numeric_dtype(col):
        return

    # pandas encodes a missing category as code -1; shifting every code up by
    # one maps missing values to 0 and real categories to 1..n.
    shifted_codes = col.cat.codes + 1
    df[name] = shifted_codes

In [6]:
def fix_missing(df, col, name, nan_dict, is_train):
    """Fill missing values of a numeric column of ``df`` in place.

    Non-numeric columns are ignored.

    Training mode (``is_train`` truthy): if the column contains at least one
    missing value, a boolean indicator column ``name + "_NA"`` is added, the
    column median is stored in ``nan_dict[name]`` and the gaps are filled
    with that median. Columns without missing values are left untouched.

    Non-training mode: if ``name`` is a key of ``nan_dict``, the indicator
    column is added and the gaps are filled with the stored value. Otherwise
    the gaps are filled with the median of ``df[name]`` itself and no
    indicator column is created.

    Args:
        df: DataFrame that is modified in place.
        col: Data of the column being processed.
        name: Label of that column in ``df``.
        nan_dict: Mapping from column label to fill value; written in
            training mode, read in non-training mode.
        is_train: Selects training or non-training behaviour.

    Returns:
        None.
    """
    if not is_numeric_dtype(col):
        return

    indicator_name = name + "_NA"

    if is_train:
        na_mask = pd.isnull(col)
        if na_mask.sum():
            df[indicator_name] = na_mask
            nan_dict[name] = col.median()
            df[name] = col.fillna(nan_dict[name])
        return

    if name in nan_dict:
        # Re-use the fill value recorded during training.
        df[indicator_name] = pd.isnull(col)
        df[name] = col.fillna(nan_dict[name])
    else:
        # No recorded fill value: fall back to this frame's own median.
        df[name] = col.fillna(df[name].median())

In [7]:
def proc_df(df, y_fld, nan_dict=None, is_train=True):
    """Split ``df`` into a processed feature frame and a target array.

    Works on a copy of ``df``; the caller's frame is not modified. Every
    remaining column is passed through ``fix_missing`` and then
    ``numericalize``.

    Args:
        df: Input DataFrame containing the features and the target column.
        y_fld: Label of the target column; it is removed from the features.
        nan_dict: Optional mapping of column label to fill value. When
            omitted a new empty dict is used. In training mode the supplied
            dict is updated in place by ``fix_missing``.
        is_train: Forwarded to ``fix_missing``; also selects the return shape.

    Returns:
        ``(features, y, nan_dict)`` when ``is_train`` is truthy, otherwise
        ``(features, y)``, where ``y`` is the ``.values`` of the target
        column.
    """
    features = df.copy()
    y = features.pop(y_fld).values

    fill_values = {} if nan_dict is None else nan_dict

    for col_name, col_data in features.items():
        fix_missing(features, col_data, col_name, fill_values, is_train)
        numericalize(features, col_data, col_name)

    return (features, y, fill_values) if is_train else (features, y)

In [8]:
def split_train_val(df, n):
    """Cut ``df`` at position ``n`` and return independent copies of both parts.

    Args:
        df: Sliceable object whose slices provide ``.copy()``.
        n: Slice boundary; ``df[:n]`` is the first part, ``df[n:]`` the second.

    Returns:
        Tuple ``(first_part, second_part)``.
    """
    first_part = df[:n].copy()
    second_part = df[n:].copy()
    return first_part, second_part

In [9]:
# Hold out the last `n_valid` rows of `df` as the validation set; all earlier
# rows form the training set.
# NOTE(review): a positional tail split presumes the rows are meaningfully
# ordered (e.g. by sale date) — confirm.
n_valid = 12000
n_train = len(df)-n_valid
raw_train, raw_valid = split_train_val(df, n_train)

In [10]:
# NOTE(review): the following cell repeats this exact proc_df call under the
# names x_train / y_train, and x_train_raw / y_train_raw are not used anywhere
# in the visible notebook — confirm whether this cell is still needed.
x_train_raw, y_train_raw, nas = proc_df(raw_train, 'SalePrice')

In [11]:
# Build the training features and target ('SalePrice'); `nas` receives the
# medians that were used to fill missing numeric values.
x_train, y_train, nas = proc_df(raw_train, 'SalePrice')

In [12]:
# Process the validation split with the fill values recorded on the training
# set (`nas`); with is_train=False proc_df returns only (features, target).
x_valid, y_valid = proc_df(raw_valid, 'SalePrice', nan_dict=nas, is_train=False)

In [13]:
def rmse(x, y):
    """Return the root-mean-squared difference between ``x`` and ``y``.

    Args:
        x: Array-like supporting element-wise arithmetic (e.g. predictions).
        y: Array-like compatible with ``x`` (e.g. targets).

    Returns:
        The square root of the mean squared difference, as a float.
    """
    squared_errors = (x - y) ** 2
    mean_squared_error = squared_errors.mean()
    return math.sqrt(mean_squared_error)

In [14]:
def print_score(m):
    """Print error and R^2 of model ``m`` on the training and validation sets.

    Reads the module-level ``x_train``, ``y_train``, ``x_valid`` and
    ``y_valid`` and uses the notebook's ``rmse`` helper.

    Args:
        m: Fitted estimator exposing ``predict(X)`` and ``score(X, y)``.

    Returns:
        None; four lines are written to standard output.
    """
    # The labels say "RMSLE"; the value is rmse() of predictions vs. targets,
    # which is an RMSLE only if the targets are log-transformed — TODO confirm.
    train_error = rmse(m.predict(x_train), y_train)
    print(f"RMSLE of train set {train_error}")

    valid_error = rmse(m.predict(x_valid), y_valid)
    print(f"RMSLE of validation set {valid_error}")

    train_r2 = m.score(x_train, y_train)
    print(f"R^2 of train set {train_r2}")

    valid_r2 = m.score(x_valid, y_valid)
    print(f"R^2 of validation set {valid_r2}")

#### Using Bootstrapping and a Custom Number of Trees (`n_estimators=40`)

In [15]:
# Fit a 40-tree random forest (n_jobs=-1 uses all available cores) and report
# the train/validation scores.
# NOTE(review): no random_state is set, so the scores differ from run to run;
# consider fixing a seed if results must be reproducible.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m.fit(x_train, y_train)
print_score(m)

RMSLE of train set 0.07811480837279308
RMSLE of validation set 0.24785275598491305
R^2 of train set 0.9871221992633731
R^2 of validation set 0.8765181675099502


#### Using min_samples_leaf
`min_samples_leaf`: The minimum number of samples required to be at a leaf node.
We can grow our trees less deeply to reduce over-fitting. We do this by setting `min_samples_leaf`.
* There are fewer decision rules leading to each leaf node. The model cannot memorize the data as easily; it is simpler and less specialized to our training data, and models of that kind should generalize better.
* Each prediction is made by averaging more rows in the leaf node, which also helps the model generalize better.

In [16]:
# IPython introspection: `??` displays the signature/source of RandomForestRegressor.
# NOTE(review): development aid only; the analysis does not depend on this cell.
RandomForestRegressor??

[1;31mInit signature:[0m
[0mRandomForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'squared_error'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;36m1.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1

In [17]:
# Same 40-tree forest as in the previous experiment, but every leaf must hold
# at least 3 samples (min_samples_leaf=3).
# NOTE(review): no random_state is set, so the scores differ from run to run.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, n_jobs=-1)
m.fit(x_train, y_train)
print_score(m)

RMSLE of train set 0.1158124506903021
RMSLE of validation set 0.24907979752599999
R^2 of train set 0.9716935464597074
R^2 of validation set 0.8752925011264797



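* We expected the R^2 of the validation set to increase. In the run recorded above it is essentially unchanged (0.8765 → 0.8753), while the R^2 of the train set drops (0.9871 → 0.9717), so the gap between train and validation shrinks: the model memorizes the training data less. Because no random seed is fixed, the exact numbers vary from run to run.
* If you are using a big dataset, you can set `min_samples_leaf` to values in the range 10-10000.
* The only way to know which value is better is to try and experiment!
* Generally try these values first: 1, 3, 5, 10, 25, 100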