##### Creation Date:
##### February 19 2022
##### Created By Alperen KOLAMUC

In [1]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics

In [2]:
import re
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [3]:
import pandas as pd
import numpy as np
import math

##### Prepare


In [4]:
# Load the dataset from a Feather file; the path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_feather("tmp/bulldozers_1")

In [5]:
def numericalize(df, col, name):
    """Replace a non-numeric column of ``df`` with integer category codes.

    Numeric columns are left untouched. Any other column is written back to
    ``df[name]`` as its category codes shifted by one.

    Args:
        df: DataFrame that receives the encoded column (modified in place).
        col: The column (a pandas Series) to encode.
        name: Label under which the encoded column is stored in ``df``.

    Returns:
        None. ``df`` is modified in place.
    """
    if is_numeric_dtype(col):
        return

    # A column that is already categorical keeps its categories unchanged here.
    # A non-categorical column (e.g. plain strings) has no ``.cat`` accessor, so
    # it is converted first; its categories are then inferred from this column's
    # own values, which means codes from separately processed frames are only
    # comparable when the columns were categorical with shared categories.
    codes = col.astype("category").cat.codes

    # pandas encodes a missing value as code -1; adding 1 maps missing to 0 and
    # the real categories to 1..n.
    df[name] = codes + 1

In [6]:
def fix_missing(df, col, name, nan_dict, is_train):
    """Fill missing values of one numeric column of ``df`` in place.

    Non-numeric columns are ignored.

    Training mode (``is_train`` true): if the column has missing values, a
    boolean indicator column ``<name>_NA`` is added, the column median is
    stored in ``nan_dict[name]`` and the missing values are filled with it.

    Non-training mode: if ``name`` is in ``nan_dict``, the indicator column is
    always added (even when nothing is missing) and the stored value is used
    as the fill value. Otherwise missing values are filled with the median of
    ``df[name]`` itself and no indicator column is added.

    Args:
        df: DataFrame to modify in place.
        col: The column (a pandas Series) to inspect.
        name: Label of the column inside ``df``.
        nan_dict: Mapping of column label to fill value; written in training
            mode, read in non-training mode.
        is_train: Selects training or non-training behaviour.

    Returns:
        None. ``df`` (and, in training mode, ``nan_dict``) are modified in place.
    """
    if not is_numeric_dtype(col):
        return

    missing = pd.isnull(col)

    if is_train:
        if missing.any():
            df[name + "_NA"] = missing
            nan_dict[name] = col.median()
            df[name] = col.fillna(nan_dict[name])
        return

    if name in nan_dict:
        # Added unconditionally so the frame gets the same indicator columns
        # that the training pass created.
        df[name + "_NA"] = missing
        df[name] = col.fillna(nan_dict[name])
    elif missing.any():
        # No stored fill value for this column: fall back to this frame's own
        # median. Skipped when nothing is missing, so complete columns are
        # neither rewritten nor have a median computed for them.
        df[name] = col.fillna(df[name].median())

In [7]:
def proc_df(df, y_fld, nan_dict=None, is_train=True):
    """Split a frame into features and target and make the features numeric.

    Works on a copy of ``df``. The target column ``y_fld`` is removed from the
    features; every remaining column is passed through ``fix_missing`` and
    then ``numericalize``.

    Args:
        df: Input DataFrame; it is not modified.
        y_fld: Label of the target column.
        nan_dict: Optional mapping of column label to fill value, forwarded to
            ``fix_missing``. A copy is used, so the caller's dict is not
            modified.
        is_train: Forwarded to ``fix_missing``; also selects the return shape.

    Returns:
        ``(features, y, nan_dict)`` when ``is_train`` is true, otherwise
        ``(features, y)``. ``y`` is ``df[y_fld].values`` taken from the copy.

    Raises:
        KeyError: If ``y_fld`` is not a column of ``df``.
    """
    df = df.copy()
    # pop() returns the target column and removes it from the private copy.
    y = df.pop(y_fld).values

    # Copy so that fill values recorded during a training pass do not leak
    # into a dict object owned by the caller; the result is returned instead.
    nan_dict = {} if nan_dict is None else dict(nan_dict)

    # fix_missing() adds "<name>_NA" columns to df while we loop, so iterate
    # over a snapshot of the original labels rather than a live view of df.
    for name in list(df.columns):
        col = df[name]
        fix_missing(df, col, name, nan_dict, is_train)
        numericalize(df, col, name)

    if is_train:
        return df, y, nan_dict

    return df, y

In [8]:
def split_train_val(df, n):
    """Cut ``df`` at position ``n`` and return both parts as independent copies.

    Args:
        df: Sliceable object whose slices provide ``.copy()``.
        n: Slice boundary; the first part is ``df[:n]``, the second ``df[n:]``.

    Returns:
        Tuple ``(first_part, second_part)``.
    """
    first_part = df[:n].copy()
    second_part = df[n:].copy()
    return first_part, second_part

In [9]:
# Hold out the last n_valid rows of df as the validation set; every earlier
# row goes to the training set.
n_valid = 12000
n_train = len(df)-n_valid
raw_train, raw_valid = split_train_val(df, n_train)

In [10]:
# Process the training rows with 'SalePrice' as the target. `nas` receives the
# per-column fill values recorded by proc_df. The result is kept under the
# *_raw names, which no other cell in this notebook reassigns.
x_train_raw, y_train_raw, nas = proc_df(raw_train, 'SalePrice')

In [11]:
# Same proc_df call as in the previous cell, bound to the working names
# x_train / y_train that print_score reads.
# NOTE(review): this recomputes a result identical to x_train_raw / y_train_raw.
x_train, y_train, nas = proc_df(raw_train, 'SalePrice')

In [12]:
# Process the validation rows in non-training mode, reusing the fill values in
# `nas` that were recorded from the training rows.
x_valid, y_valid = proc_df(raw_valid, 'SalePrice', nan_dict=nas, is_train=False)

In [13]:
def rmse(x, y):
    """Return the square root of the mean squared difference between x and y.

    Args:
        x: Array-like supporting element-wise arithmetic (e.g. a numpy array).
        y: Array-like compatible with ``x``.

    Returns:
        The root-mean-square error as a Python float.
    """
    squared_diff = (x - y) ** 2
    mean_squared = squared_diff.mean()
    return math.sqrt(mean_squared)

In [14]:
def print_score(m):
    """Print error and R^2 of model ``m`` on the training and validation sets.

    Reads the notebook-level variables ``x_train``, ``y_train``, ``x_valid``
    and ``y_valid`` at call time, so the output reflects whatever they hold
    when the function is called.

    NOTE(review): the printed label says RMSLE, but ``rmse`` computes a plain
    root-mean-square error; the label is only accurate if the targets are
    already log-transformed -- confirm.

    Args:
        m: Fitted estimator providing ``predict(X)`` and ``score(X, y)``.
    """
    train_error = rmse(m.predict(x_train), y_train)
    print(f"RMSLE of train set {train_error}")

    valid_error = rmse(m.predict(x_valid), y_valid)
    print(f"RMSLE of validation set {valid_error}")

    train_r2 = m.score(x_train, y_train)
    print(f"R^2 of train set {train_r2}")

    valid_r2 = m.score(x_valid, y_valid)
    print(f"R^2 of validation set {valid_r2}")

#### Farklı Şeyleri Hızlı Şekilde Denemek
* Elimizdeki probleme uygun modeli bulmak için çok şey denememiz gerekebilir
* Bu iteratif süreci hızlı bir hale getirmek için model seçme kısmını subsample alarak yapabiliriz

#### Subset Yaratma

In [15]:
def get_sample(df, n, random_state=None):
    """Draw ``n`` rows of ``df`` at random, without replacement.

    Args:
        df: DataFrame to sample from.
        n: Number of rows to draw. If ``n`` exceeds ``len(df)``, all rows are
            returned in shuffled order.
        random_state: Optional integer seed. When given, a private generator
            seeded with it is used, so the same seed yields the same sample.
            When ``None`` (the default), numpy's global random state is used.

    Returns:
        Tuple ``(idxs, sample)``: the chosen positional indices as a numpy
        array, and a copy of the corresponding rows of ``df``.
    """
    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # Shuffle all positions, then keep the first n: a sample without replacement.
    idxs = rng.permutation(len(df))[:n]
    return idxs, df.iloc[idxs].copy()

* Bütün verini model seçmek için kullanman gerekmez
* En son eğitmek için gerekir
* Validation seti değiştirmek istemiyorum, sadece train içinde subset alacağım


|        | **Train** | **Validation** |
|--------|:---------:|:--------------:|
|        |  **300K** |     **1M**     |
| **M1** |    0.64   |      0.75      |
| **M2** |    0.67   |      0.79      |
| **M3** |    0.58   |      0.68      |
| **M4** |    0.44   |      0.57      |

* Örnekte görüldüğü gibi M2, 300K'lık veride (`Train`) en yüksek skoru almış; 1M'luk veride (`Validation`) de diğer modellere göre en yüksek skoru almış. Yani `Train`'de en yüksek skoru alan model, büyük olasılıkla `Validation`'da da yüksek sonuç verecektir. Sırf bunu görmek için diğer modelleri de 1M'luk veride deneyerek zaman kaybetmeye gerek yok. `Train`'de en yüksek skoru alan iki modeli (M2 ve M1) `Validation`'da test etmek daha doğru olacaktır.


In [16]:
# Draw a 3000-row subsample of the processed training data for fast experiments.
# Sampling from x_train_raw / y_train_raw (the untouched proc_df result created
# earlier) instead of from x_train itself keeps this cell idempotent: re-running
# it always samples from the full training set, never from a previous sample.
idxs, x_train = get_sample(x_train_raw, 3000)
y_train = y_train_raw[idxs]

In [17]:
# Fit a 10-tree random forest on whatever x_train / y_train currently hold
# (the 3000-row sample when the cells are run in order), time the fit, and
# print the train/validation scores.
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

CPU times: total: 594 ms
Wall time: 98.4 ms
RMSLE of train set 0.14145120231004832
RMSLE of validation set 0.35261117677243714
R^2 of train set 0.9573240635941189
R^2 of validation set 0.7500761460648667


In [18]:
# Same experiment as the previous cell with 30 trees instead of 10.
m = RandomForestRegressor(n_estimators=30, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

CPU times: total: 2.81 s
Wall time: 235 ms
RMSLE of train set 0.12153577784010855
RMSLE of validation set 0.34049898048255495
R^2 of train set 0.9684951031135548
R^2 of validation set 0.7669510272928001


#### Tüm Veri İle

In [19]:
# Rebuild the full processed training set, replacing the 3000-row sample that
# x_train / y_train held for the experiments above.
x_train, y_train, nas = proc_df(raw_train, 'SalePrice')

In [20]:
# Re-process the validation rows with the fill values in `nas`.
# NOTE(review): same arguments as the earlier validation proc_df call.
x_valid, y_valid = proc_df(raw_valid, 'SalePrice', nan_dict=nas, is_train=False)

In [21]:
# Fit a 10-tree random forest on the full training set, time the fit, and
# print the train/validation scores.
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

CPU times: total: 1min 39s
Wall time: 10.4 s
RMSLE of train set 0.09035467988577847
RMSLE of validation set 0.2555905513575876
R^2 of train set 0.9827703577282123
R^2 of validation set 0.8686877778630461


In [22]:
# Same full-data experiment as the previous cell with 30 trees instead of 10.
m = RandomForestRegressor(n_estimators=30, n_jobs=-1)
%time m.fit(x_train, y_train)
print_score(m)

CPU times: total: 6min 29s
Wall time: 29.3 s
RMSLE of train set 0.07929675199313088
RMSLE of validation set 0.2494024121846643
R^2 of train set 0.9867295467857662
R^2 of validation set 0.8749692430971017
