In [61]:
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as MSE
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set()

In [83]:
# Load the regression features and target as an (X, y) pair.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell (and its import) only runs on older scikit-learn releases.
X, y = load_boston(return_X_y=True)

In [3]:
# Shared splitter: 5 shuffled folds with a fixed seed. It is passed as `cv=` by the
# scoring helpers in this notebook so that all models are compared on the same folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [112]:
def regression_model(model):
    """Score a regressor by cross-validated RMSE on the notebook-level data.

    Parameters
    ----------
    model : estimator
        scikit-learn-compatible regressor. It is passed to ``cross_val_score``
        together with the global ``X``, ``y`` and the shared ``kfold`` splitter.

    Returns
    -------
    float
        Mean of the per-fold root mean squared errors.
    """
    # The scorer yields negated MSE (one value per fold of `kfold`).
    neg_mse_per_fold = cross_val_score(
        model, X, y, cv=kfold, scoring='neg_mean_squared_error'
    )

    # Flip the sign back and take the root to get an RMSE for each fold.
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)

    return rmse_per_fold.mean()

In [113]:
# Baseline: XGBoost regressor with the linear booster (gblinear).
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

6.066598868705145

In [114]:
# Comparison: scikit-learn LinearRegression, scored with the same helper.
regression_model(LinearRegression())

4.776220445390916

In [115]:
# Comparison: Lasso with its default hyperparameters.
regression_model(Lasso())

5.246243815193398

In [116]:
# Comparison: Ridge with its default hyperparameters.
regression_model(Ridge())

4.789706788050634

In [117]:
# Comparison: XGBoost regressor with the tree booster (gbtree).
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

3.0782665241356404

In [119]:
def grid_search(params, reg=None):
    """Grid-search `params` for a regressor and print the best setting and its score.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    reg : estimator, optional
        Regressor to tune. When omitted, a fresh
        ``XGBRegressor(booster='gblinear', objective='reg:squarederror')`` is
        created for this call.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Build the default estimator per call instead of once at definition time,
    # so no estimator instance is shared between separate searches.
    if reg is None:
        reg = XGBRegressor(booster='gblinear', objective='reg:squarederror')

    # Search the grid with the shared `kfold` splitter, scored by negated MSE.
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)

    # Fit on the full notebook-level X and y (there is no separate train split here).
    grid_reg.fit(X, y)

    # Report the best parameter combination.
    best_params = grid_reg.best_params_
    print("Best params:", best_params)

    # `best_score_` is the mean negated MSE across folds, so this is the root of
    # the averaged MSE -- not the average of per-fold RMSEs that
    # `regression_model` reports; the two numbers are not directly comparable.
    best_score = np.sqrt(-grid_reg.best_score_)

    # The label is kept as-is for output compatibility; the value is a
    # cross-validated score, not a training-set score.
    print("Training score:", best_score)

In [120]:
# Tune reg_alpha on grid_search's default estimator (the gblinear regressor).
grid_search(params={'reg_alpha':[0.01, 0.1, 0.5, 1, 2, 4]})

Best params: {'reg_alpha': 0.01}
Training score: 6.118521408456118


In [12]:
# Tune reg_lambda on grid_search's default estimator (the gblinear regressor).
grid_search(params={'reg_lambda':[0.01, 0.1, 0.5, 1, 2, 4]})

Best params: {'reg_lambda': 0.01}
Training score: 5.95


In [13]:
# Finer reg_lambda grid restricted to small values (0.01 to 0.2).
grid_search(params={'reg_lambda':[0.01, 0.05, 0.15, 0.2]})

Best params: {'reg_lambda': 0.01}
Training score: 5.95


In [14]:
# Single-value grid: scores feature_selector='shuffle'. No updater is set here,
# so whatever the library default is applies.
grid_search(params={'feature_selector':['shuffle']})

Best params: {'feature_selector': 'shuffle'}
Training score: 6.19


In [15]:
# Compare the random, greedy and thrifty feature selectors with updater='coord_descent'.
grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 'updater':['coord_descent'] })

Best params: {'feature_selector': 'greedy', 'updater': 'coord_descent'}
Training score: 5.67


In [16]:
# Sweep top_k for the greedy and thrifty selectors with updater='coord_descent'.
grid_search(params={'feature_selector':['greedy', 'thrifty'], 'updater':['coord_descent'], 'top_k':[2, 4, 6, 8, 10, 12]})

Best params: {'feature_selector': 'greedy', 'top_k': 12, 'updater': 'coord_descent'}
Training score: 5.67


In [24]:
# Single-combination grid: greedy selector + coord_descent at learning_rate=0.3.
grid_search(params={'feature_selector':['greedy'], 'updater':['coord_descent'], 
                    'learning_rate':[0.3]})

Best params: {'feature_selector': 'greedy', 'learning_rate': 0.3, 'updater': 'coord_descent'}
Training score: 5.55


In [121]:
# XGBoost regressor with the DART booster and otherwise default settings.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))

3.0782664031309923

In [56]:
# Load census_cleaned.csv (path is relative to the working directory).
df_census = pd.read_csv('census_cleaned.csv')
# Every column except the last is used as a feature; the last column is the target.
X_census = df_census.iloc[:, :-1]
y_census = df_census.iloc[:, -1]

In [103]:
def classification_model(model):
    """Score a classifier by cross-validated accuracy on the census data.

    Parameters
    ----------
    model : estimator
        scikit-learn-compatible classifier. It is passed to ``cross_val_score``
        together with the global ``X_census``, ``y_census`` and the shared
        ``kfold`` splitter.

    Returns
    -------
    float
        Mean accuracy across the folds.
    """
    # One accuracy value per fold of `kfold`, averaged into a single number.
    return cross_val_score(
        model, X_census, y_census, cv=kfold, scoring='accuracy'
    ).mean()

In [108]:
# Baseline classifier: XGBoost with the tree booster (gbtree), seeded.
classification_model(XGBClassifier(booster='gbtree', random_state=2))

0.8646848431279569

In [109]:
# Same seed, but with the DART booster and no dropout parameters set.
classification_model(XGBClassifier(booster='dart', random_state=2))

0.8646848431279569

In [106]:
# XGBoost classifier with the linear booster (gblinear); no random_state is passed.
classification_model(XGBClassifier(booster='gblinear'))

0.8084211325977793

In [107]:
# Comparison: scikit-learn LogisticRegression with the iteration cap raised to 1000.
classification_model(LogisticRegression(max_iter=1000))

0.8008968643699182

In [110]:
# DART classifier with one_drop=1; no random_state is passed.
# NOTE(review): one_drop=1 is expected to force at least one tree to be dropped
# per boosting round -- confirm against the XGBoost DART documentation.
classification_model(XGBClassifier(booster='dart', one_drop=1))

0.8629649414080551

In [111]:
# DART regressor with one_drop=1. No objective is passed here, so the
# library's default objective is used.
regression_model(XGBRegressor(booster='dart', one_drop=1))



3.02

In [17]:
# Plain DART regressor (no dropout parameters set): the reference point for
# the DART parameter variations tried in the following cells.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))

3.08

In [48]:
# DART regressor with normalize_type='forest'.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest'))

3.08

In [47]:
# DART regressor with sample_type='weighted'.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted'))

3.08

In [44]:
# DART regressor with one_drop=1 and an explicit squared-error objective.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             one_drop=1))

3.02

In [54]:
# DART regressor with skip_drop=1.
# NOTE(review): presumably this always skips the dropout step, which would explain
# a score equal to the plain DART run -- confirm against the XGBoost docs.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             skip_drop=1))

3.08

In [84]:
# DART regressor with rate_drop=0.2 (the largest dropout rate tried in this notebook).
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             rate_drop=0.2))

8.19

In [96]:
# DART regressor with rate_drop=0.05.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             rate_drop=0.05))

3.68

In [97]:
# DART regressor with rate_drop=0.02.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             rate_drop=0.02))

3.37

In [99]:
# DART regressor with rate_drop=0.001 (the smallest dropout rate tried in this notebook).
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             rate_drop=0.001))

3.07

In [101]:
# Re-run of the DART regressor with one_drop=1 (same arguments as the earlier one_drop cell).
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', 
                             one_drop=1))

3.02