In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [4]:
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
train_df.dropna(axis=0, subset=['SalePrice'], inplace=True)

In [13]:
y = train_df.SalePrice
X = train_df.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

X = np.array(X)
y = np.array(y)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [17]:
my_imputer = SimpleImputer()
X_train = my_imputer.fit_transform(X_train)
X_test = my_imputer.transform(X_test)

In [19]:
from xgboost import XGBRegressor

model = XGBRegressor()

model.fit(X_train, y_train, verbose=False)

In [20]:
predictions = model.predict(X_test)

from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error: ' + str(mean_absolute_error(predictions, y_test)))

Mean Absolute Error: 18666.320633561645


In [21]:
#N-estimators specifies how many times to go through the modeling cycle described above.
#Underfitting = inaccurate predictions on both training and new data.
#Overfitting = inaccuracte predictions on new data but accurate predictions on training data.
#Modifying the learning rate can affect the fitting of the model.

#Early_stopping_rounds causes the model to stop iterating when the validation score stops improving.

In [22]:
model = XGBRegressor(n_estimators=1000)
model.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_test, y_test)],
    verbose=False
)



In [23]:
#Trick for better XGBoost models:
#--Instead of getting predictions by simply adding up the predictions from each component model,
#--we will multiply the predictions from each model by a small number before adding them in.

#--This means each tree we add to the ensemble helps us less. In practice, this reduces the model's propensity to overfit.

#--In general, a small learning rate (and large number of estimators) will yield more accurate XGBoost models,
#--though, it will also take the model longer to train since it does more iterations through the cycle.

In [24]:
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05
)

model.fit(
    X_train,
    y_train,
    early_stopping_rounds=5,
    eval_set=[(X_test, y_test)],
    verbose=False
)

In [25]:
#NOTE FOR LARGER DATASETS:
#--You can use parallelism to build your models faster.
#--It's common to set the parameter 'n_jobs' equal to the number of cores on your machine.