In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor, XGBClassifier, XGBRFRegressor, XGBRFClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as MSE

In [2]:
# Diabetes regression data: feature matrix X and target vector y used by the helpers below.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [3]:
# One shuffled, seeded 5-fold splitter shared by every cross-validation in this notebook,
# so all models are scored on the same folds.
N_SPLITS, SEED = 5, 2
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

In [4]:
def regression_model(model):
    """Return the mean cross-validated RMSE of ``model``.

    The model is scored on the notebook-level ``X`` and ``y`` (whatever they
    are bound to when the function is called) using the shared ``kfold``
    splitter.

    Args:
        model: Estimator accepted by ``cross_val_score``.

    Returns:
        The RMSE averaged over the folds.
    """
    # scikit-learn reports MSE negated so that "greater is better" holds.
    neg_mse_per_fold = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')

    # Flip the sign back, then take the root to get one RMSE per fold.
    rmse_per_fold = np.negative(neg_mse_per_fold) ** 0.5

    return np.mean(rmse_per_fold)

In [5]:
# XGBoost with the linear booster (gblinear)
xgb_linear = XGBRegressor(booster='gblinear', objective='reg:squarederror')
regression_model(xgb_linear)

55.503630186113604

In [6]:
# Ordinary least squares, for comparison with gblinear
ols = LinearRegression()
regression_model(ols)

55.50927267834351

In [7]:
# Lasso with its default settings
lasso = Lasso()
regression_model(lasso)

62.64900771743497

In [8]:
# Ridge with its default settings
ridge = Ridge()
regression_model(ridge)

58.83525077919004

In [9]:
# XGBoost with the tree booster (gbtree)
xgb_tree = XGBRegressor(booster='gbtree', objective='reg:squarederror')
regression_model(xgb_tree)

65.96608419624594

In [10]:
def grid_search(params, reg=None):
    """Grid-search ``params`` for ``reg`` and print the best setting and its RMSE.

    The search is fitted on the notebook-level ``X`` and ``y`` using the shared
    ``kfold`` splitter, with negative mean squared error as the selection
    metric.

    Args:
        params: Parameter grid passed straight to ``GridSearchCV``.
        reg: Estimator to tune. When omitted, a fresh
            ``XGBRegressor(booster='gblinear', objective='reg:squarederror')``
            is created for this call.

    Returns:
        None. The best parameters and best RMSE are reported with ``print``.
    """
    # Build the default estimator per call instead of in the signature, where it
    # would be created once at definition time and shared by every call.
    if reg is None:
        reg = XGBRegressor(booster='gblinear', objective='reg:squarederror')

    # Instantiate GridSearchCV as grid_reg
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=kfold)

    # Fit grid_reg on the full X and y (no separate train split is used here)
    grid_reg.fit(X, y)

    # Extract best params
    best_params = grid_reg.best_params_

    # Print best params
    print("Best params:", best_params)

    # best_score_ is a negated MSE; flip the sign and take the root to get RMSE
    best_score = np.sqrt(-grid_reg.best_score_)

    # Print best score
    print("Best score:", best_score)

In [11]:
# Candidate reg_alpha values for the default gblinear regressor
alpha_grid = {'reg_alpha': [0.001, 0.01, 0.1, 0.5, 1, 5]}
grid_search(params=alpha_grid)

Best params: {'reg_alpha': 0.001}
Best score: 55.49491862176835


In [12]:
# Candidate reg_lambda values for the default gblinear regressor
lambda_grid = {'reg_lambda': [0.001, 0.01, 0.1, 0.5, 1, 5]}
grid_search(params=lambda_grid)

Best params: {'reg_lambda': 0.001}
Best score: 56.17163548052951


In [13]:
# Single-candidate grid: score the 'shuffle' feature selector on its own
shuffle_grid = {'feature_selector': ['shuffle']}
grid_search(params=shuffle_grid)

Best params: {'feature_selector': 'shuffle'}
Best score: 55.527107408614704


In [14]:
# Remaining feature selectors, all paired with the coord_descent updater
selector_grid = {
    'feature_selector': ['random', 'greedy', 'thrifty'],
    'updater': ['coord_descent'],
}
grid_search(params=selector_grid)

Best params: {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best score: 55.48798105805444


In [15]:
# Vary top_k for the greedy and thrifty selectors under coord_descent
top_k_grid = {
    'feature_selector': ['greedy', 'thrifty'],
    'updater': ['coord_descent'],
    'top_k': [3, 5, 7, 9],
}
grid_search(params=top_k_grid)

Best params: {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best score: 55.478623763746256


In [16]:
# Synthetic one-feature data set: X runs from 1 to 99 and each target is X scaled by
# a uniform random factor in [-0.2, 0.2). This rebinds the notebook-level X and y.
np.random.seed(2)  # fixed seed so the draws are repeatable
X = np.arange(1, 100)

# One draw per element, in order, so the random stream is consumed exactly as a loop would.
y = np.array([i * np.random.uniform(-0.2, 0.2) for i in X]).reshape(-1, 1)

# Turn both into column vectors of shape (n_samples, 1).
X = X.reshape(-1, 1)

In [17]:
# Linear booster on whatever X and y are currently bound to
xgb_linear_synth = XGBRegressor(booster='gblinear', objective='reg:squarederror')
regression_model(xgb_linear_synth)

6.214946302686011

In [18]:
# Tree booster on whatever X and y are currently bound to
xgb_tree_synth = XGBRegressor(booster='gbtree', objective='reg:squarederror')
regression_model(xgb_tree_synth)

9.37235946501318

In [19]:
# Ordinary least squares on whatever X and y are currently bound to
ols_synth = LinearRegression()
regression_model(ols_synth)

6.214962315808842

In [20]:
# Rebind the notebook-level X and y to the diabetes data for the cells that follow.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [21]:
# XGBoost with the dart booster, default dart settings
xgb_dart = XGBRegressor(booster='dart', objective='reg:squarederror')
regression_model(xgb_dart)

65.96444746130739

In [22]:
# Load the census CSV; every column except the last is a feature,
# and the last column is the classification target.
df_census = pd.read_csv('census_cleaned.csv')
X_census, y_census = df_census.iloc[:, :-1], df_census.iloc[:, -1]

In [23]:
def classification_model(model):
    """Return the mean cross-validated accuracy of ``model`` on the census data.

    The model is scored on the notebook-level ``X_census`` and ``y_census``
    using the shared ``kfold`` splitter.

    Args:
        model: Classifier accepted by ``cross_val_score``.

    Returns:
        The accuracy averaged over the folds.
    """
    # One accuracy value per fold
    accuracy_per_fold = cross_val_score(model, X_census, y_census, cv=kfold, scoring='accuracy')

    return np.mean(accuracy_per_fold)

In [24]:
# XGBoost classifier with the tree booster (gbtree)
clf_tree = XGBClassifier(booster='gbtree')
classification_model(clf_tree)

0.8701208195968675

In [25]:
# XGBoost classifier with the dart booster, default dart settings
clf_dart = XGBClassifier(booster='dart')
classification_model(clf_dart)

0.8701208195968675

In [26]:
# XGBoost classifier with the linear booster (gblinear)
clf_linear = XGBClassifier(booster='gblinear')
classification_model(clf_linear)

0.8500354281042902

In [27]:
# Logistic regression with the iteration cap set to 1000
log_reg = LogisticRegression(max_iter=1000)
classification_model(log_reg)

0.8008968643699182

In [28]:
# dart classifier with one_drop enabled
clf_dart_one_drop = XGBClassifier(booster='dart', one_drop=1)
classification_model(clf_dart_one_drop)

0.8718714338474818

In [29]:
# dart regressor with sample_type='weighted'
dart_weighted = XGBRegressor(booster='dart', objective='reg:squarederror', sample_type='weighted')
regression_model(dart_weighted)

65.96444746130739

In [30]:
# dart regressor with normalize_type='forest'
dart_forest_norm = XGBRegressor(booster='dart', objective='reg:squarederror', normalize_type='forest')
regression_model(dart_forest_norm)

65.96444746130739

In [31]:
# dart regressor with one_drop enabled
dart_one_drop = XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1)
regression_model(dart_one_drop)

61.81275131335009

In [32]:
# Candidate rate_drop values for a dart regressor with one_drop enabled
dart_for_rate_drop = XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1)
rate_drop_grid = {'rate_drop': [0.01, 0.1, 0.2, 0.4]}
grid_search(params=rate_drop_grid, reg=dart_for_rate_drop)

Best params: {'rate_drop': 0.2}
Best score: 61.07249602732062


In [33]:
# Candidate skip_drop values for a dart regressor with one_drop enabled
dart_for_skip_drop = XGBRegressor(booster='dart', objective='reg:squarederror', one_drop=1)
skip_drop_grid = {'skip_drop': [0.01, 0.1, 0.2, 0.4]}
grid_search(params=skip_drop_grid, reg=dart_for_skip_drop)

Best params: {'skip_drop': 0.1}
Best score: 62.879753748627635


In [34]:
# gbtree regressor with num_parallel_tree=25
xgb_parallel_25 = XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=25)
regression_model(xgb_parallel_25)

65.96604877151103

In [35]:
# gbtree regressor with num_parallel_tree=5
xgb_parallel_5 = XGBRegressor(booster='gbtree', objective='reg:squarederror', num_parallel_tree=5)
regression_model(xgb_parallel_5)

65.96445649315855

In [36]:
# XGBoost's random-forest regressor
xgb_rf_reg = XGBRFRegressor(objective='reg:squarederror')
regression_model(xgb_rf_reg)

59.447250741400595

In [37]:
# scikit-learn's random-forest regressor with default settings (no random_state is set)
sk_rf_reg = RandomForestRegressor()
regression_model(sk_rf_reg)

59.46563031802505

In [38]:
# XGBoost's random-forest classifier with default settings
xgb_rf_clf = XGBRFClassifier()
classification_model(xgb_rf_clf)

0.856085650471878

In [39]:
# scikit-learn's random-forest classifier with default settings (no random_state is set)
sk_rf_clf = RandomForestClassifier()
classification_model(sk_rf_clf)

0.8555328202034789