In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as MSE

In [2]:
# Boston housing data, unpacked into features X and target y (return_X_y=True).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably needs an older scikit-learn — confirm the pinned version.
X, y = load_boston(return_X_y=True)

In [3]:
# Shared 5-fold splitter used by every cross-validation helper in this notebook;
# shuffling with a fixed random_state keeps the fold assignment reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

In [4]:
def regression_model(model):
    """Return the mean cross-validated RMSE of ``model`` on the global ``X`` and ``y``.

    The folds come from the module-level ``kfold`` splitter (5 splits), and
    ``X``/``y`` are read from the notebook namespace at call time, so the
    result depends on whichever dataset is currently bound to those names.

    Parameters
    ----------
    model : estimator
        Unfitted regressor handed straight to ``cross_val_score``.

    Returns
    -------
    float
        Mean over the folds of the per-fold root mean squared error.
    """
    # One negated-MSE score per fold of `kfold`
    neg_mse = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kfold)

    # Flip the sign back to MSE, then take the root to get per-fold RMSE
    rmse = np.sqrt(-neg_mse)

    # Average the per-fold RMSE values
    return rmse.mean()

In [5]:
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

6.066598868705145

In [6]:
regression_model(LinearRegression())

4.776220445390916

In [7]:
regression_model(Lasso())

5.246243815193398

In [8]:
regression_model(Ridge())

4.789706788050634

In [9]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

3.0782665241356404

In [10]:
# Synthetic one-feature dataset: each target is its x value scaled by a
# uniform random factor drawn from [-0.2, 0.2).
# NOTE: this rebinds the global X and y that regression_model reads.
X = np.arange(1, 100)
np.random.seed(2)
# A single vectorised draw consumes the seeded global generator in the same
# order as one scalar draw per sample, so the values match the looped version.
y = (X * np.random.uniform(-0.2, 0.2, size=X.shape[0])).reshape(-1, 1)
# Column-vector shapes: (n_samples, 1) for both features and target
X = X.reshape(-1, 1)

In [11]:
regression_model(XGBRegressor(booster='gblinear', objective='reg:squarederror'))

6.208537305892296

In [12]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror'))

7.936201694808057

In [13]:
regression_model(LinearRegression())

6.214962315808842

In [14]:
# Reload the Boston data: the global X and y were overwritten with the
# synthetic dataset in the cells above, and the helpers below read those globals.
X, y = load_boston(return_X_y=True)

In [15]:
def grid_search(params, reg=None):
    """Grid-search ``params`` for ``reg`` on the global ``X``/``y`` and print the winner.

    Parameters
    ----------
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    reg : estimator, optional
        Regressor to tune. When omitted, a new
        ``XGBRegressor(booster='gblinear', objective='reg:squarederror')`` is
        created for this call.

    Returns
    -------
    None
        The best parameter combination and its score are printed, not returned.
    """
    # Build the default estimator per call instead of once at definition time,
    # so separate calls never share a single estimator instance.
    if reg is None:
        reg = XGBRegressor(booster='gblinear', objective='reg:squarederror')

    # Instantiate GridSearchCV as grid_reg, scored on negated MSE over `kfold`
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)

    # Fit grid_reg on the full X and y; the splitting is done by `kfold`
    grid_reg.fit(X, y)

    # Extract and print best params
    best_params = grid_reg.best_params_
    print("Best params:", best_params)

    # best_score_ is a negated MSE, so negate it before taking the root.
    # This is the root of the mean MSE, which is not the same quantity as the
    # mean of per-fold RMSE values that regression_model reports.
    best_score = np.sqrt(-grid_reg.best_score_)

    # Print best score
    print("Best score:", best_score)

In [16]:
grid_search(params={'reg_alpha':[0.001, 0.01, 0.1, 0.5, 1, 5]})

Best params: {'reg_alpha': 0.001}
Best score: 6.117631778928818


In [17]:
grid_search(params={'reg_lambda':[0.001, 0.01, 0.1, 0.5, 1, 5]})

Best params: {'reg_lambda': 0.01}
Best score: 5.94547948977594


In [18]:
grid_search(params={'feature_selector':['shuffle']})

Best params: {'feature_selector': 'shuffle'}
Best score: 6.186757718677648


In [19]:
grid_search(params={'feature_selector':['random', 'greedy', 'thrifty'], 'updater':['coord_descent'] })

Best params: {'feature_selector': 'greedy', 'updater': 'coord_descent'}
Best score: 5.666491009253024


In [20]:
grid_search(params={'feature_selector':['greedy', 'thrifty'], 'updater':['coord_descent'], 'top_k':[2, 4, 6, 8, 10, 12]})

Best params: {'feature_selector': 'greedy', 'top_k': 12, 'updater': 'coord_descent'}
Best score: 5.666491009253024


In [21]:
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror'))

3.0782664031309923

In [22]:
# Load the cleaned census data from the working directory.
df_census = pd.read_csv('census_cleaned.csv')
# Features: every column except the last.
X_census = df_census.iloc[:, :-1]
# Target: the last column (used as y by classification_model).
y_census = df_census.iloc[:, -1]

In [23]:
def classification_model(model):
    """Return the mean cross-validated accuracy of ``model`` on the census data.

    Scores ``model`` on the global ``X_census``/``y_census`` using the
    module-level ``kfold`` splitter (5 splits).

    Parameters
    ----------
    model : estimator
        Unfitted classifier handed straight to ``cross_val_score``.

    Returns
    -------
    float
        Mean accuracy over the folds.
    """
    # One accuracy score per fold of `kfold`
    fold_accuracy = cross_val_score(model, X_census, y_census, scoring='accuracy', cv=kfold)

    # Average the per-fold accuracies
    return fold_accuracy.mean()

In [24]:
classification_model(XGBClassifier(booster='gbtree'))

0.8646848431279569

In [25]:
classification_model(XGBClassifier(booster='dart'))

0.8646848431279569

In [26]:
classification_model(XGBClassifier(booster='gblinear'))

0.8084211325977793

In [27]:
classification_model(LogisticRegression(max_iter=1000))

0.8008968643699182

In [28]:
classification_model(XGBClassifier(booster='dart', one_drop=1))

0.8629649414080551

In [29]:
# NOTE(review): the recorded score is identical to the plain booster='dart' run.
# Presumably rate_drop is left at its default, so no trees are dropped and
# sample_type has nothing to act on — confirm against the XGBoost DART docs.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted'))

3.0782664031309923

In [30]:
# NOTE(review): the recorded score is identical to the plain booster='dart' run.
# Presumably normalize_type only matters once trees are actually dropped
# (rate_drop or one_drop set) — confirm against the XGBoost DART docs.
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest'))

3.0782664031309923

In [31]:
regression_model(XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1))

3.0177109201351824

In [32]:
grid_search(params={'rate_drop':[0.001, 0.01, 0.1, 0.2, 0.4]}, reg=XGBRegressor(booster='dart', objective='reg:squarederror'))

Best params: {'rate_drop': 0.001}
Best score: 3.096546230565424


In [33]:
grid_search(params={'skip_drop':[0.001, 0.01, 0.1, 0.2, 0.4]}, reg=XGBRegressor(booster='dart', objective='reg:squarederror'))

Best params: {'skip_drop': 0.001}
Best score: 3.100397263042665


In [34]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25))

3.0809481082878047

In [35]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=2))

3.0782659029860833

In [36]:
regression_model(XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=1))

3.0782665241356404

In [37]:
regression_model(XGBRFRegressor(objective='reg:squarederror'))

4.437653978420867

In [38]:
regression_model(RandomForestRegressor())

3.199233481145508

In [39]:
classification_model(XGBRFClassifier())

0.8437088401160258

In [40]:
classification_model(RandomForestClassifier())

0.8555328202034789