# What Algorithm to choose?

In [58]:
from numpy import loadtxt
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np

# Load the prepared dataset; the CSV column labelled 'Unnamed: 0' becomes the index.
df = pd.read_csv(
    'ml_ready_data.csv',
    index_col='Unnamed: 0',
    low_memory=False,
)

In [59]:
# Force every column to a numeric dtype. With errors='coerce', any value that
# cannot be parsed as a number is replaced by NaN instead of raising.
cols = df.columns
df[cols] = df[cols].apply(lambda column: pd.to_numeric(column, errors='coerce'))

# Building your first Model

In [60]:
# Name of the target column; every remaining column is treated as a feature.
y = 'SalesClosePrice'
feature_names = df.columns.tolist()
feature_names.remove(y)  # raises ValueError if the target column is absent

In [61]:
# Split the data 50/50 into train and test sets. A fixed random_state keeps the
# split (and every metric computed from it) reproducible across kernel restarts.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df[feature_names], df[y], test_size=0.5, random_state=42
)

# Rebuild both splits as DataFrames whose columns follow feature_names.
# The training frame was previously bound to the misspelled name `X_train`,
# so the rebuilt frame was never the one passed on as `Xtrain`.
Xtrain = pd.DataFrame(data=Xtrain, columns=feature_names)
Xtest = pd.DataFrame(data=Xtest, columns=feature_names)
X_train = Xtrain  # alias kept in case other code still refers to the old name

# DMatrix form of the training split, for XGBoost's native API.
dtrain = xgb.DMatrix(Xtrain, label=ytrain)

In [62]:
# Fit an XGBoost regressor on the training data (4 parallel jobs, other settings left at defaults).
model = xgb.XGBRegressor(n_jobs=4)
model.fit(X=Xtrain, y=ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

# Evaluating your model

### What Metrics to use?

* metrics.explained_variance_score(y_true, y_pred)	
    * Explained variance regression score function


* metrics.mean_absolute_error(y_true, y_pred)	       
    * Mean absolute error regression loss

* metrics.mean_squared_error(y_true, y_pred[, …])	    
    * Mean squared error regression loss


* metrics.mean_squared_log_error(y_true, y_pred)	    
    * Mean squared logarithmic error regression loss


* metrics.median_absolute_error(y_true, y_pred)	    
    * Median absolute error regression loss


* metrics.r2_score(y_true, y_pred[, …])	            
    * R^2 (coefficient of determination) regression score function.

In [63]:
from math import sqrt

# Predict inside this cell: `ypred` was otherwise only assigned in a later cell,
# so on a fresh kernel this cell failed with NameError.
ypred = model.predict(Xtest)

# Root mean squared error of the predictions on the test split.
mse = mean_squared_error(ytest, ypred)
rmse = sqrt(mse)
rmse

192107.75845679737

In [64]:
# Predict the target for the test split, and keep a rounded copy of each prediction.
ypred = model.predict(Xtest)
predictions = list(map(round, ypred))

In [65]:
# Coefficient of determination (R^2) of the predictions against the test targets.
r2_score(y_true=ytest, y_pred=ypred)

0.9549179713328426

# Tuning Your Model

In [None]:
# Hold out 20% of the rows as the test set; the remaining 80% feeds the grid search.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df[feature_names], df[y], test_size=0.2, random_state=42
)

# Hyper-parameter grid explored by cross-validation: only max_depth and
# min_child_weight vary; the other settings are pinned to a single value.
tuned_parameters = [{
    'subsample': [1],
    'colsample_bytree': [1],
    'gamma': [0],
    'learning_rate': [0.1],
    'max_depth': [4, 5, 6],
    'min_child_weight': [0.1, 0.5, 0.75],
}]

print("# Tuning hyper-parameters for r2", end="\n\n")

# 5-fold cross-validated grid search scored by R^2.
# Note: this rebinds `model` from the fitted regressor to the GridSearchCV object.
model = GridSearchCV(
    estimator=xgb.XGBRegressor(n_jobs=4),
    param_grid=tuned_parameters,
    cv=5,
    scoring='r2',
)
model.fit(Xtrain, ytrain)

print("Best parameters set found on development set:", end="\n\n")
print(model.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

means = model.cv_results_['mean_test_score']
stds = model.cv_results_['std_test_score']
# One line per candidate: mean CV score, twice its standard deviation, and the parameters.
for mean, std, params in zip(means, stds, model.cv_results_['params']):
    report_line = "%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)
    print(report_line)

# Tuning hyper-parameters for r2



# What is important?

In [None]:
from xgboost import plot_importance
%matplotlib inline
from matplotlib import pyplot as plt

# feature importance
fig, ax = plt.subplots(figsize=(40, 40))
plot_importance(model, color='red', grid=False, show_values=False, ax=ax)
plt.title('importance', fontsize = 36)
plt.yticks(fontsize = 18)
plt.ylabel('features', fontsize = 36)

# What can we afford to lose?