In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### In the Kaggle course, only a few columns were chosen, so I will start by choosing all the columns (except the sale price, of course) to train models. Let's see what happens 😉

In [None]:
# Load the course's training CSV into a DataFrame and preview its first rows.
train_csv_path = '../input/home-data-for-ml-course/train.csv'
df = pd.read_csv(train_csv_path)
df.head()

In [None]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

In [None]:
# Column filtering, done in one vectorized step instead of dropping columns
# in place from inside a Python loop:
#   1. keep only columns that have at least MIN_NON_NULL non-missing values;
#   2. drop every object-dtype column so only non-object columns remain.
# The result is rebound to `df`, so re-running this cell yields the same frame.
MIN_NON_NULL = 1350  # columns with fewer non-null entries than this are discarded

df = df.loc[:, df.count() >= MIN_NON_NULL].select_dtypes(exclude=['object'])

# Re-inspect dtypes and non-null counts after filtering.
df.info()

In [None]:
# Preview the first rows of the frame after the column filtering above.
df.head()

### Let me first define a function to simply get the mean absolute error between the predicted values of the models that I will use and the actual values in the validation set

In [None]:
from sklearn.metrics import mean_absolute_error 

def get_mae(actual, predicted):
    """Return the mean absolute error between `actual` and `predicted`.

    Thin wrapper around sklearn's `mean_absolute_error`; `actual` is passed as
    the ground truth and `predicted` as the model output.
    """
    mae = mean_absolute_error(y_true=actual, y_pred=predicted)
    return mae

### Now I will get some things ready that I will be using for every variation of models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

In [None]:
# Features: every column from 'Id' through 'YrSold' (a .loc label slice includes both endpoints).
# NOTE(review): this keeps 'Id' as a model feature — presumably just a row identifier; confirm whether it should be dropped.
X = df.loc[:,'Id':'YrSold']
# Target: the sale price the models are trained to predict.
y = df['SalePrice']

### 1. RandomForestRegressor - n_estimators = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]
###    SimpleImputer strategy - mean

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Mean imputation: the fill values are learned from the training rows only and
# then re-used unchanged on the validation rows.
imputer = SimpleImputer(strategy='mean')
train_filled = imputer.fit_transform(train_X)
valid_filled = imputer.transform(valid_X)

# The imputer returns plain arrays, so wrap them back into DataFrames with the
# original column names.
imputed_train_X = pd.DataFrame(train_filled, columns=train_X.columns)
imputed_valid_X = pd.DataFrame(valid_filled, columns=valid_X.columns)

In [None]:
# Coarse sweep: fit one forest per candidate size and report its validation MAE.
tree_counts = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]

for n_trees in tree_counts:
    model = RandomForestRegressor(random_state=42, n_estimators=n_trees)
    model.fit(imputed_train_X, train_y)
    preds = model.predict(imputed_valid_X)
    mae = get_mae(valid_y, preds)
    print('MAE for',n_trees,'trees is',mae)

#### So I see that around 750 trees is where the MAE went the lowest. I will see how the MAE changes from 500 trees to 1000 trees by plotting it

In [None]:
# Collects one validation MAE per forest size for the 500-1000 tree sweep in the next cell.
plot_list = []

In [None]:
# Fine sweep: record the validation MAE for every forest size from 500 to 1000.
# The accumulator is reset here so that re-running this cell never appends to
# results left over from a previous run.
plot_list = []

# warm_start=True keeps the trees that are already fitted and only adds the
# missing ones when n_estimators is raised, so the sweep grows a single forest
# to 1000 trees instead of refitting a 500-1000 tree forest from scratch 501 times.
# NOTE(review): with a fixed integer random_state scikit-learn is designed to
# give the added trees the same seeds a fresh fit would use; spot-check a few
# MAE values against a from-scratch fit on the installed version.
model = RandomForestRegressor(n_estimators=500, random_state=42, warm_start=True)

for i in range(500,1001):
    model.set_params(n_estimators=i)
    model.fit(imputed_train_X,train_y)
    preds = model.predict(imputed_valid_X)
    plot_list.append(get_mae(valid_y,preds))
    print(i,'done')

In [None]:
# Plot validation MAE against forest size for the 500-1000 tree sweep and mark the minimum.
df_to_plot = pd.DataFrame({'mae':plot_list})

# Look the minimum up once instead of rescanning plot_list for every use.
min_mae = min(plot_list)
min_mae_index = plot_list.index(min_mae)
# The sweep starts at 500 trees, so a list position maps to 500 + position.
best_n_trees = 500 + min_mae_index

fig, ax = plt.subplots()
ax.plot(list(range(500,1001)), df_to_plot['mae'])
# Label the lowest point with its (n_estimators, MAE) pair, with the text placed 7 trees to its right.
ax.annotate("({},{})".format(best_n_trees,min_mae),(best_n_trees,min_mae),(best_n_trees+7,min_mae))
ax.plot(best_n_trees, min_mae, 'o')

# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('Number of trees (n_estimators)')
ax.set_ylabel('Validation MAE')
ax.set_title('Random forest validation MAE - mean imputation')

plt.show()

### 2. RandomForestRegressor - n_estimators = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]
###    SimpleImputer strategy - median

In [None]:
# Same 80/20 split as before (identical random_state, so identical rows).
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Median imputation: fill values come from the training rows only and are
# re-used unchanged on the validation rows.
imputer = SimpleImputer(strategy='median')
train_filled = imputer.fit_transform(train_X)
valid_filled = imputer.transform(valid_X)

# Wrap the imputer's array output back into DataFrames with the original column names.
imputed_train_X = pd.DataFrame(train_filled, columns=train_X.columns)
imputed_valid_X = pd.DataFrame(valid_filled, columns=valid_X.columns)

In [None]:
# Coarse sweep on the median-imputed data: one forest per candidate size, validation MAE printed for each.
tree_counts = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]

for n_trees in tree_counts:
    model = RandomForestRegressor(random_state=42, n_estimators=n_trees)
    model.fit(imputed_train_X, train_y)
    preds = model.predict(imputed_valid_X)
    mae = get_mae(valid_y, preds)
    print('MAE for',n_trees,'trees is',mae)

#### Again, I see that around 750 trees is where the MAE went the lowest. I will see how the MAE changes from 500 trees to 1000 trees by plotting it

In [None]:
# Start a fresh list of validation MAEs for the 500-1000 tree sweep on the median-imputed data.
plot_list = []

In [None]:
# Fine sweep: record the validation MAE for every forest size from 500 to 1000.
# The accumulator is reset here so that re-running this cell never appends to
# results left over from a previous run.
plot_list = []

# warm_start=True keeps the trees that are already fitted and only adds the
# missing ones when n_estimators is raised, so the sweep grows a single forest
# to 1000 trees instead of refitting a 500-1000 tree forest from scratch 501 times.
# NOTE(review): with a fixed integer random_state scikit-learn is designed to
# give the added trees the same seeds a fresh fit would use; spot-check a few
# MAE values against a from-scratch fit on the installed version.
model = RandomForestRegressor(n_estimators=500, random_state=42, warm_start=True)

for i in range(500,1001):
    model.set_params(n_estimators=i)
    model.fit(imputed_train_X,train_y)
    preds = model.predict(imputed_valid_X)
    plot_list.append(get_mae(valid_y,preds))
    print(i,'done')

In [None]:
# Plot validation MAE against forest size for the 500-1000 tree sweep and mark the minimum.
df_to_plot = pd.DataFrame({'mae':plot_list})

# Look the minimum up once instead of rescanning plot_list for every use.
min_mae = min(plot_list)
min_mae_index = plot_list.index(min_mae)
# The sweep starts at 500 trees, so a list position maps to 500 + position.
best_n_trees = 500 + min_mae_index

fig, ax = plt.subplots()
ax.plot(list(range(500,1001)), df_to_plot['mae'])
# Label the lowest point with its (n_estimators, MAE) pair, with the text placed 7 trees to its right.
ax.annotate("({},{})".format(best_n_trees,min_mae),(best_n_trees,min_mae),(best_n_trees+7,min_mae))
ax.plot(best_n_trees, min_mae, 'o')

# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('Number of trees (n_estimators)')
ax.set_ylabel('Validation MAE')
ax.set_title('Random forest validation MAE - median imputation')

plt.show()

 ### 3. RandomForestRegressor - n_estimators = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]
###    SimpleImputer strategy - most_frequent

In [None]:
# Same 80/20 split as before (identical random_state, so identical rows).
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Most-frequent imputation: fill values come from the training rows only and
# are re-used unchanged on the validation rows.
imputer = SimpleImputer(strategy='most_frequent')
train_filled = imputer.fit_transform(train_X)
valid_filled = imputer.transform(valid_X)

# Wrap the imputer's array output back into DataFrames with the original column names.
imputed_train_X = pd.DataFrame(train_filled, columns=train_X.columns)
imputed_valid_X = pd.DataFrame(valid_filled, columns=valid_X.columns)

In [None]:
# Coarse sweep on the most-frequent-imputed data: one forest per candidate size, validation MAE printed for each.
tree_counts = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]

for n_trees in tree_counts:
    model = RandomForestRegressor(random_state=42, n_estimators=n_trees)
    model.fit(imputed_train_X, train_y)
    preds = model.predict(imputed_valid_X)
    mae = get_mae(valid_y, preds)
    print('MAE for',n_trees,'trees is',mae)

In [None]:
# Start a fresh list of validation MAEs for the 500-1000 tree sweep on the most-frequent-imputed data.
plot_list = []

In [None]:
# Fine sweep: record the validation MAE for every forest size from 500 to 1000.
# The accumulator is reset here so that re-running this cell never appends to
# results left over from a previous run.
plot_list = []

# warm_start=True keeps the trees that are already fitted and only adds the
# missing ones when n_estimators is raised, so the sweep grows a single forest
# to 1000 trees instead of refitting a 500-1000 tree forest from scratch 501 times.
# NOTE(review): with a fixed integer random_state scikit-learn is designed to
# give the added trees the same seeds a fresh fit would use; spot-check a few
# MAE values against a from-scratch fit on the installed version.
model = RandomForestRegressor(n_estimators=500, random_state=42, warm_start=True)

for i in range(500,1001):
    model.set_params(n_estimators=i)
    model.fit(imputed_train_X,train_y)
    preds = model.predict(imputed_valid_X)
    plot_list.append(get_mae(valid_y,preds))
    print(i,'done')

In [None]:
# Plot validation MAE against forest size for the 500-1000 tree sweep and mark the minimum.
df_to_plot = pd.DataFrame({'mae':plot_list})

# Look the minimum up once instead of rescanning plot_list for every use.
min_mae = min(plot_list)
min_mae_index = plot_list.index(min_mae)
# The sweep starts at 500 trees, so a list position maps to 500 + position.
best_n_trees = 500 + min_mae_index

fig, ax = plt.subplots()
ax.plot(list(range(500,1001)), df_to_plot['mae'])
# Label the lowest point with its (n_estimators, MAE) pair, with the text placed 7 trees to its right.
ax.annotate("({},{})".format(best_n_trees,min_mae),(best_n_trees,min_mae),(best_n_trees+7,min_mae))
ax.plot(best_n_trees, min_mae, 'o')

# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('Number of trees (n_estimators)')
ax.set_ylabel('Validation MAE')
ax.set_title('Random forest validation MAE - most_frequent imputation')

plt.show()

### 4. RandomForestRegressor - n_estimators = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]
###    SimpleImputer strategy - mean (adding a column indicating which column was imputed)

In [None]:
# Same 80/20 split as before (identical random_state, so identical rows).
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

# Names of the feature columns that contain at least one missing value in the training rows.
cols_with_missing = train_X.columns[train_X.isnull().any()].tolist()

# Work on copies so the raw split frames stay untouched.
train_X_m3 = train_X.copy()
valid_X_m3 = valid_X.copy()

# For each such column add a boolean "<col>_was_missing" flag to both frames,
# recording which rows are about to be filled in by the imputer.
for missing_col in cols_with_missing:
    flag_name = missing_col + '_was_missing'
    train_X_m3[flag_name] = train_X_m3[missing_col].isnull()
    valid_X_m3[flag_name] = valid_X_m3[missing_col].isnull()

# Mean imputation, fitted on the training rows only and re-used on the validation rows.
imputer = SimpleImputer(strategy='mean')
train_filled = imputer.fit_transform(train_X_m3)
valid_filled = imputer.transform(valid_X_m3)

# Wrap the imputer's array output back into DataFrames, keeping the extended column names.
imputed_train_X = pd.DataFrame(train_filled, columns=train_X_m3.columns)
imputed_valid_X = pd.DataFrame(valid_filled, columns=valid_X_m3.columns)

In [None]:
# Coarse sweep on the mean-imputed data with missing-value flags: one forest per candidate size, validation MAE printed for each.
tree_counts = [10,25,50,75,100,250,500,750,1000,1250,1500,1750,2000,2500,3500,5000,10000]

for n_trees in tree_counts:
    model = RandomForestRegressor(random_state=42, n_estimators=n_trees)
    model.fit(imputed_train_X, train_y)
    preds = model.predict(imputed_valid_X)
    mae = get_mae(valid_y, preds)
    print('MAE for',n_trees,'trees is',mae)

In [None]:
# Start a fresh list of validation MAEs for the 500-1000 tree sweep on the data with missing-value flags.
plot_list = []

In [None]:
# Fine sweep: record the validation MAE for every forest size from 500 to 1000.
# The accumulator is reset here so that re-running this cell never appends to
# results left over from a previous run.
plot_list = []

# warm_start=True keeps the trees that are already fitted and only adds the
# missing ones when n_estimators is raised, so the sweep grows a single forest
# to 1000 trees instead of refitting a 500-1000 tree forest from scratch 501 times.
# NOTE(review): with a fixed integer random_state scikit-learn is designed to
# give the added trees the same seeds a fresh fit would use; spot-check a few
# MAE values against a from-scratch fit on the installed version.
model = RandomForestRegressor(n_estimators=500, random_state=42, warm_start=True)

for i in range(500,1001):
    model.set_params(n_estimators=i)
    model.fit(imputed_train_X,train_y)
    preds = model.predict(imputed_valid_X)
    plot_list.append(get_mae(valid_y,preds))
    print(i,'done')

In [None]:
# Plot validation MAE against forest size for the 500-1000 tree sweep and mark the minimum.
df_to_plot = pd.DataFrame({'mae':plot_list})

# Look the minimum up once instead of rescanning plot_list for every use.
min_mae = min(plot_list)
min_mae_index = plot_list.index(min_mae)
# The sweep starts at 500 trees, so a list position maps to 500 + position.
best_n_trees = 500 + min_mae_index

fig, ax = plt.subplots()
ax.plot(list(range(500,1001)), df_to_plot['mae'])
# Label the lowest point with its (n_estimators, MAE) pair, with the text placed 7 trees to its right.
ax.annotate("({},{})".format(best_n_trees,min_mae),(best_n_trees,min_mae),(best_n_trees+7,min_mae))
ax.plot(best_n_trees, min_mae, 'o')

# Axis labels and a title so the figure can be read on its own.
ax.set_xlabel('Number of trees (n_estimators)')
ax.set_ylabel('Validation MAE')
ax.set_title('Random forest validation MAE - mean imputation + missing-value flags')

plt.show()

### That's all for now folks!