# XGBoost: Extreme Gradient Boosting

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# read the data
X_full = pd.read_csv('./train.csv', index_col='Id')
X_test_full = pd.read_csv('./test.csv', index_col='Id')

# remove rows with missing target
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# separate target from predictors
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# split dataset into training and validation data
X_train_full, X_valid_full,  y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

# columns with relatively low cardinality
low_cardinality_cols = [col for col in X_train_full.columns if X_train_full[col].dtype == 'object' and X_train_full[col].nunique() < 10]

# numeric columns
numeric_cols = [col for col in X_train_full.columns if X_train_full[col].dtype in ['int64', 'float64']] 

# selected cols
selected_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[selected_cols].copy()
X_valid = X_valid_full[selected_cols].copy()
X_test = X_test_full[selected_cols].copy()

# one hot encode using pandas
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

### Build model with XGBoost

In [5]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

my_model = XGBRegressor(random_state=0)
my_model.fit(X_train, y_train)
preds_1 = my_model.predict(X_valid)
print(f"MAE: {mean_absolute_error(y_valid, preds_1)}")

MAE: 17662.736729452055


### Improve model

In [16]:
model = XGBRegressor(n_estimators=500, learning_rate=0.05, early_stopping_rounds=5, n_jobs=4)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
predictions = model.predict(X_valid)
print(f'MAE: {mean_absolute_error(y_valid, predictions)}')

MAE: 16802.965325342466


### Prediction on test data

In [18]:
predictions_test = model.predict(X_test)
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions_test})
output.to_csv('XGBoost_predictions.csv', index=False)

In [19]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,124070.335938
1,1462,151594.609375
2,1463,189254.25
3,1464,187754.9375
4,1465,196770.6875
