In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [6]:
from fastai import *

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display

from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [10]:
PATH = "data/house_prices/"

In [19]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [14]:
def display_all(df):
    """Render `df` with the pandas row/column display limits raised to 1000.

    The limits are only raised for the duration of this call; the global
    pandas display options are restored when the context manager exits.
    """
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_view):
        display(df)

In [15]:
display_all(df_raw.head().transpose())

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65,80,68,60,84
LotArea,8450,9600,11250,9550,14260
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,IR1,IR1,IR1
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


In [20]:
display_all(df_raw.describe(include='all').T)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Id,1460,,,,730.5,421.61,1.0,365.75,730.5,1095.25,1460.0
MSSubClass,1460,,,,56.8973,42.3006,20.0,20.0,50.0,70.0,190.0
MSZoning,1460,5.0,RL,1151.0,,,,,,,
LotFrontage,1201,,,,70.05,24.2848,21.0,59.0,69.0,80.0,313.0
LotArea,1460,,,,10516.8,9981.26,1300.0,7553.5,9478.5,11601.5,215245.0
Street,1460,2.0,Pave,1454.0,,,,,,,
Alley,91,2.0,Grvl,50.0,,,,,,,
LotShape,1460,4.0,Reg,925.0,,,,,,,
LandContour,1460,4.0,Lvl,1311.0,,,,,,,
Utilities,1460,2.0,AllPub,1459.0,,,,,,,


log transform dependent variable

In [21]:
# Overwrites SalePrice with its natural log in place.
# NOTE: not idempotent: running this cell twice applies the log twice, so reload df_raw before re-running.
df_raw.SalePrice = np.log(df_raw.SalePrice)

code categorical variables

In [25]:
from pandas.api.types import is_string_dtype, is_numeric_dtype
def train_cats(df):
    """Convert every string-typed column of `df` to an ordered categorical, in place.

    Columns that `is_string_dtype` does not recognise as strings are left
    untouched. Nothing is returned; `df` itself is modified.
    """
    string_columns = [label for label, series in df.items() if is_string_dtype(series)]
    for label in string_columns:
        as_category = df[label].astype('category')
        df[label] = as_category.cat.as_ordered()

def apply_cats(df, trn):
    """Re-encode columns of `df` using the category sets already present in `trn`.

    For each column of `df` that also exists in `trn` and is categorical
    there, the column is replaced (in place) by an ordered categorical built
    with exactly the categories of the `trn` column. All other columns are
    left as they are.
    """
    for label, series in df.items():
        if label not in trn.columns:
            continue
        reference = trn[label]
        if reference.dtype.name != 'category':
            continue
        df[label] = pd.Categorical(series, categories=reference.cat.categories, ordered=True)


In [26]:
train_cats(df_raw)

look at fraction of missing values

In [27]:
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

1stFlrSF         0.000000
2ndFlrSF         0.000000
3SsnPorch        0.000000
Alley            0.937671
BedroomAbvGr     0.000000
BldgType         0.000000
BsmtCond         0.025342
BsmtExposure     0.026027
BsmtFinSF1       0.000000
BsmtFinSF2       0.000000
BsmtFinType1     0.025342
BsmtFinType2     0.026027
BsmtFullBath     0.000000
BsmtHalfBath     0.000000
BsmtQual         0.025342
BsmtUnfSF        0.000000
CentralAir       0.000000
Condition1       0.000000
Condition2       0.000000
Electrical       0.000685
EnclosedPorch    0.000000
ExterCond        0.000000
ExterQual        0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
Fence            0.807534
FireplaceQu      0.472603
Fireplaces       0.000000
Foundation       0.000000
FullBath         0.000000
Functional       0.000000
GarageArea       0.000000
GarageCars       0.000000
GarageCond       0.055479
GarageFinish     0.055479
GarageQual       0.055479
GarageType       0.055479
GarageYrBlt      0.055479
GrLivArea   

replace categories with their numeric codes, handle missing continuous values, and split the dependent variable into a separate variable.

In [55]:
def numericalize(df, col, name, max_n_cat):
    """Replace a categorical column by its integer codes shifted up by one.

    Numeric columns are skipped. A non-numeric column is converted when
    `max_n_cat` is None or when it has more than `max_n_cat` categories;
    otherwise it is left for later handling. The result is written to
    `df[name]`; missing values (code -1) therefore become 0.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and len(col.cat.categories) <= max_n_cat:
        return
    df[name] = col.cat.codes + 1

def fix_missing(df, col, name, na_dict):
    """Fill missing values of one numeric column and record the filler used.

    Only numeric columns are handled. The column is processed when it
    contains at least one null or when `name` is already a key of `na_dict`.
    In that case a boolean `<name>_na` indicator column is added to `df`, the
    nulls are filled with `na_dict[name]` if present (otherwise the column
    median), and the filler is stored back under `na_dict[name]`.

    Returns the same `na_dict` object that was passed in (updated in place).
    """
    if not is_numeric_dtype(col):
        return na_dict

    null_mask = pd.isnull(col)
    if not (null_mask.sum() or name in na_dict):
        return na_dict

    df[name + '_na'] = null_mask
    if name in na_dict:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Split off the target and turn the remaining columns into numeric features.

    Parameters
    ----------
    df : DataFrame to process. It is copied (or sampled via `get_sample` when
        `subset` is given), so the caller's frame is not modified here.
    y_fld : name of the target column, or None when there is no target
        (e.g. a test set). A non-numeric target is replaced by its category codes.
    skip_flds : column name or list of column names to drop before processing.
        The caller's list is never modified.
    ignore_flds : columns set aside untouched and re-attached, first, to the result.
    do_scale : when true, `scale_vars(df, mapper)` is called and its result is
        appended to the returned list.
    na_dict : mapping of column name -> filler from a previous call. When given,
        those fillers are reused and `_na` indicator columns are only kept for
        the columns it already lists, so the output columns line up with the
        frame that produced it. The passed dict is copied, not mutated.
    preproc_fn : optional callable applied to the working frame, in place.
    max_n_cat : categorical columns with more than this many categories are
        replaced by integer codes; the rest are one-hot encoded by
        `pd.get_dummies`. None means every categorical column becomes codes.
    subset : if truthy, passed to `get_sample` to take a sample of `df`.
    mapper : forwarded to `scale_vars` when `do_scale` is true.

    Returns
    -------
    list : `[features, y, na_dict]`, plus `mapper` as a fourth item when
        `do_scale` is true. `y` is None when `y_fld` is None.
    """
    if not ignore_flds: ignore_flds=[]
    # Work on a private list: the target name is appended below and must not
    # leak into (or accumulate in) the list object owned by the caller.
    if not skip_flds: skip_flds = []
    elif isinstance(skip_flds, str): skip_flds = [skip_flds]
    else: skip_flds = list(skip_flds)
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = df[y_fld].cat.codes
        y = df[y_fld].values
        skip_flds.append(y_fld)
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    # Remember which columns had fillers on entry so that indicator columns
    # created only for this frame can be removed again below.
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    # Any column still non-numeric at this point is one-hot encoded, with an
    # extra indicator column for missing values.
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [56]:
df, y, nas = proc_df(df_raw, 'SalePrice')

fit model

In [45]:
# Baseline random forest on all processed features, using every available core.
# No random_state is passed, so the fitted forest (and the score below) can differ between runs.
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
# Scored on the same rows the model was fitted on, so this is a training-set figure, not a validation estimate.
m.score(df,y)

0.9750029261566399

Predict on test set

In [66]:
df_raw_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
# Re-encode the test columns with the category sets of the training frame (df_raw).
apply_cats(df_raw_test, df_raw)
# Reuse the fillers recorded on the training data; note that `nas` is rebound to the dict returned for the test frame.
df_test, _, nas = proc_df(df_raw_test, na_dict=nas)

In [69]:
df.shape, df_test.shape, df_raw.shape, df_raw_test.shape

((1460, 83), (1459, 83), (1460, 81), (1459, 80))

In [71]:
pred_test = m.predict(df_test)

Remember to exponentiate the predictions: the model was trained on log(SalePrice), so its outputs are log prices.

In [72]:
pred_test = np.exp(pred_test)

In [78]:
# Stores the predictions as a new column on df_test itself, so from here on df_test has one more column than the frame passed to m.predict above.
df_test['SalePrice']=pred_test

summary of predictions

In [83]:
df_test.SalePrice.describe() 

(count      1459.000000
 mean     175403.350824
 std       72236.881631
 min       51441.181532
 25%      127470.231645
 50%      154906.968489
 75%      207976.187505
 max      545384.353688
 Name: SalePrice, dtype: float64, 180921.19589041095)

summary of training labels

In [89]:
np.exp(y).mean(), np.exp(y).min(), np.exp(y).max()

(180921.19589041095, 34900.00000000001, 754999.9999999999)

In [90]:
csv_fn=f'{PATH}submission1.csv'
df_test[['Id','SalePrice']].to_csv(csv_fn, index=False)

In [None]:
!kaggle competitions submit house-prices-advanced-regression-techniques -f submission1.csv -m "first random forest"

Check whether the categorical variables should be treated as ordered.

In [92]:
df_raw2 = pd.read_csv(f'{PATH}train.csv', low_memory=False)

In [100]:
# Names of all non-numeric columns in the freshly loaded training data.
cat_vars = [col for col in df_raw2 if not is_numeric_dtype(df_raw2[col])]
cat_vars

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [None]:
# Print the category levels of each categorical column of df_raw (built by train_cats)
# so their current ordering can be inspected.
for col in cat_vars:
    print(col, list(df_raw[col].cat.categories))