# XGBoost Lenzerheide

## Cross-validation in XGBoost example

In [None]:
import xgboost as xgb
import pandas as pd 
class_data = pd.read_csv("insert_csv_file.csv") 
churn_dmatrix = xgb.DMatrix(data=churn_data.iloc[:,:-1], 
 label=churn_data.month_5_still_here) 
params={"objective":"binary:logistic","max_depth":4}  # change learner 
cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=4,
 num_boost_round=10, metrics="error", as_pandas=True) 
print("Accuracy: %f" %((1-cv_results["test-error-mean"]).iloc[-1]))

## Linear base learners example: learning API only

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
boston_data = pd.read_csv("boston_housing.csv")
X, y = boston_data.iloc[:,:-1],boston_data.iloc[:,-1]
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, 
                                                        random_state=123)
DM_train = xgb.DMatrix(data=X_train,label=y_train)
DM_test =  xgb.DMatrix(data=X_test,label=y_test)
params = {"booster":"gblinear","objective":"reg:linear"}
xg_reg = xgb.train(params = params, dtrain=DM_train, num_boost_round=10)
preds = xg_reg.predict(DM_test)

rmse = np.sqrt(mean_squared_error(y_test,preds))

print("RMSE: %f" % (rmse))

## L1 regularization example

In [None]:
import xgboost as xgb
import pandas as pd
boston_data = pd.read_csv("boston_data.csv")
X,y = boston_data.iloc[:,:-1],boston_data.iloc[:,-1] 
boston_dmatrix = xgb.DMatrix(data=X,label=y) 
params={"objective":"reg:linear","max_depth":4} 
l1_params = [1,10,100]
rmses_l1=[]
for reg in l1_params:
   params["alpha"] = reg
   cv_results = xgb.cv(dtrain=boston_dmatrix, params=params,nfold=4, 
                       num_boost_round=10,metrics="rmse",as_pandas=True,seed=123) 
   rmses_l1.append(cv_results["test-rmse-mean"].tail(1).values[0])
print("Best rmse as a function of l1:") 
print(pd.DataFrame(list(zip(l1_params,rmses_l1)), columns=["l1","rmse"]))

## Feature importance

In [None]:
# Create the DMatrix: housing_dmatrix
housing_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective":"reg:linear", "max_depth":4}

# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=housing_dmatrix, num_boost_round=10)

# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()



## Untuned model example

In [None]:
import pandas as pd 
import xgboost as xgb 
import numpy as np 
housing_data = pd.read_csv("ames_housing_trimmed_processed.csv") 
X,y = housing_data[housing_data.columns.tolist()[:-1]], 
       housing_data[housing_data.columns.tolist()[-1]] 
housing_dmatrix = xgb.DMatrix(data=X,label=y) 
untuned_params={"objective":"reg:linear"} 
untuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, 
       params=untuned_params,nfold=4, 
 metrics="rmse",as_pandas=True,seed=123) 
print("Untuned rmse: %f" %((untuned_cv_results_rmse["test-rmse-mean"]).tail(1))

## Tuned model example

In [None]:
import pandas as pd 
import xgboost as xgb 
import numpy as np 
housing_data = pd.read_csv("ames_housing_trimmed_processed.csv") 
X,y = housing_data[housing_data.columns.tolist()[:-1]], 
    housing_data[housing_data.columns.tolist()[-1]] 
housing_dmatrix = xgb.DMatrix(data=X,label=y) 
tuned_params = {"objective":"reg:linear",'colsample_bytree': 0.3, 
 'learning_rate': 0.1, 'max_depth': 5} 
tuned_cv_results_rmse = xgb.cv(dtrain=housing_dmatrix, 
    params=tuned_params, nfold=4, num_boost_round=200, metrics="rmse", 
 as_pandas=True, seed=123) 
print("Tuned rmse: %f" %((tuned_cv_results_rmse["test-rmse-mean"]).tail(1)))

addtional possiblity: tune eta → learning rate

7 tree tunable parameters → see slides part 3 number 6 <br>
3 linear tunable parameters → see slides part 3 number 7

## Grid search

In [None]:
import pandas as pd 
import xgboost as xgb 
import numpy as np 
from sklearn.model_selection import GridSearchCV 
housing_data = pd.read_csv("ames_housing_trimmed_processed.csv") 
X, y = housing_data[housing_data.columns.tolist()[:-1]],  
      housing_data[housing_data.columns.tolist()[-1] 
housing_dmatrix = xgb.DMatrix(data=X,label=y) 
gbm_param_grid = {'learning_rate': [0.01,0.1,0.5,0.9], 
                 'n_estimators': [200], 
 'subsample': [0.3, 0.5, 0.9]} 
gbm = xgb.XGBRegressor() 
grid_mse = GridSearchCV(estimator=gbm,param_grid=gbm_param_grid,  
           scoring='neg_mean_squared_error', cv=4, verbose=1) 
grid_mse.fit(X, y) 
print("Best parameters found: ",grid_mse.best_params_) 
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) 

## Random search

In [None]:
# First example
import pandas as pd 
import xgboost as xgb 
import numpy as np 
from sklearn.model_selection import RandomizedSearchCV 
housing_data = pd.read_csv("ames_housing_trimmed_processed.csv") 
X,y = housing_data[housing_data.columns.tolist()[:-1]], 
     housing_data[housing_data.columns.tolist()[-1]] 
housing_dmatrix = xgb.DMatrix(data=X,label=y) 
gbm_param_grid = {'learning_rate': np.arange(0.05,1.05,.05), 
                 'n_estimators': [200], 
 'subsample': np.arange(0.05,1.05,.05)} 
gbm = xgb.XGBRegressor() 
randomized_mse = RandomizedSearchCV(estimator=gbm, param_distributions=gbm_param_grid,  
                       n_iter=25, scoring='neg_mean_squared_error', cv=4, verbose=1) 
randomized_mse.fit(X, y) 
print("Best parameters found: ",randomized_mse.best_params_) 
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

In [None]:
# second example
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)
# Perform random search: grid_mse
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=gbm, scor-
ing="neg_mean_squared_error", n_iter=5, cv=4, verbose=1)

# Fit randomized_mse to the data
randomized_mse.fit(X, y)
# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

## Pipeline

### Example I

In [4]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor 
import numpy as np 
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import cross_val_score 
names = ["crime","zone","industry","charles","no","rooms", 
       "age", "distance","radial","tax","pupil","aam","lower","med_price"] 

data = pd.read_csv("boston_housing.csv",names=names) 
X, y = data.iloc[:,:-1], data.iloc[:,-1] 
rf_pipeline = Pipeline[("st_scaler",  
               StandardScaler()), 
               ("rf_model",RandomForestRegressor())] 

scores = cross_val_score(rf_pipeline,X,y,     
scoring="neg_mean_squared_error",cv=10) 
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final RMSE:", final_avg_rmse) 

SyntaxError: invalid character in identifier (<ipython-input-4-fef785cb252a>, line 20)

### Preprocessing
- LabelEncoder and OneHotEncoder or in one step → DictVectorizer

In [None]:
# Encoding categorical columns I: LabelEncoder 
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder
# Fill missing values with 0
df.LotFrontage = df.LotFrontage.fillna(value=0)
# Create a boolean mask for categorical columns 
categorical_mask = (df.dtypes == object)  # true → is categorical
# Get list of categorical column names
categorical_columns = df.columns[categorical_mask].tolist()
# Print the head of the categorical columns 
print(df[categorical_columns].head())
# Create LabelEncoder object: le 
le = LabelEncoder()
# Apply LabelEncoder to categorical columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: le.fit_transform(x))
# Print the head of the LabelEncoded categorical columns print(df[categorical_columns].head())



In [None]:
# One Hot Encoding
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
# Create OneHotEncoder: ohe
ohe = OneHotEncoder(categorical_features=categorical_mask, sparse=False)
# Apply OneHotEncoder to categorical columns - output is no longer a dataframe: df_encoded df_encoded = ohe.fit_transform(df)
# Print first 5 rows of the resulting dataset - again, this will no longer be a pandas dataframe print(df_encoded[:5, :])
# Print the shape of the original DataFrame 
print(df.shape)
# Print the shape of the transformed array 
print(df_encoded.shape)

In [None]:
# DictVectorizer (one step)
# Import DictVectorizer
from sklearn.feature_extraction import DictVectorizer
# Convert df into a dictionary: df_dict 
df_dict = df.to_dict("records")
# Create the DictVectorizer object: dv 
dv = DictVectorizer(sparse=False)
# Apply dv on df: df_encoded 
df_encoded = dv.fit_transform(df_dict)
# Print the resulting first five rows print(df_encoded[:5,:])
# Print the vocabulary 
print(dv.vocabulary_)

### Example II

In [None]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
names = ["crime","zone","industry","charles","no","rooms","age", 
        "distance","radial","tax","pupil","aam","lower","med_price"] 
data = pd.read_csv("boston_housing.csv",names=names)
X, y = data.iloc[:,:-1], data.iloc[:,-1]
xgb_pipeline = Pipeline[("st_scaler", StandardScaler()), 
                        ("xgb_model",xgb.XGBRegressor())]
scores = cross_val_score(xgb_pipeline, X, y, 
                         scoring="neg_mean_squared_error",cv=10) 
final_avg_rmse = np.mean(np.sqrt(np.abs(scores)))
print("Final XGB RMSE:", final_avg_rmse)

In [None]:
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer from sklearn.pipeline import Pipeline
# Fill LotFrontage missing values with 0 X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)), 
         ("xgb_model", xgb.XGBRegressor())]
# Create the pipeline: xgb_pipeline 
xgb_pipeline = Pipeline(steps)
# Fit the pipeline 
xgb_pipeline.fit(X.to_dict("records"), y)

In [None]:
# Import necessary modules
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline from sklearn.model_selection import cross_val_score
# Fill LotFrontage missing values with 0 X.LotFrontage = X.LotFrontage.fillna(0)
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)), 
         ("xgb_model", xgb.XGBRegressor(max_depth=2, objective="reg:linear"))]
# Create the pipeline: xgb_pipeline xgb_pipeline = Pipeline(steps)
# Cross-validate the model cross_val_scores = cross_val_score(xgb_pipeline, X.to_dict('records'), y, cv=10, scoring="neg_mean_squared_error")
# Print the 10-fold RMSE
print("10-fold RMSE: ", np.mean(np.sqrt(np.abs(cross_val_scor es))))

### Case Study

In [None]:
# Import necessary modules
from sklearn_pandas import DataFrameMapper from sklearn_pandas import CategoricalImputer
# Check number of nulls in each feature column nulls_per_column = X.isnull().sum() print(nulls_per_column)
# Create a boolean mask for categorical columns categorical_feature_mask = X.dtypes == object
# Get list of categorical column names categorical_columns = X.columns[categorical_feature_mask].tolist()
# Get list of non-categorical column names non_categorical_columns = X.columns[~categorical_feature_mask].tolist()
# Apply numeric imputer 
numeric_imputation_mapper = DataFrameMapper(
[([numeric_feature],Imputer(strategy="median")) for numeric_feature in non_categorical_columns], 
                                            input_df=True, 
                                            df_out=True
                                           )
# Apply categorical imputer categorical_imputation_mapper = DataFrameMapper( 
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns], 
                                                input_df=True, 
                                                df_out=True
                                               )

In [None]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion
# Combine the numeric and categorical transformations numeric_categorical_union = FeatureUnion([ 
                                          ("num_mapper", numeric_imputation_mapper), 
                                          ("cat_mapper", categorical_imputation_mapper) 
                                         ])

In [None]:
# Create full pipeline
pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union), 
                     ("dictifier", Dictifier()), 
                     ("vectorizer", DictVectorizer(sort=False)), 
                     ("clf", xgb.XGBClassifier(max_depth=3)) 
                    ])
# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, kidney_data, y, scoring="roc_auc", cv=3)
# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))

### Tuning XGBoost hyperparameters in a pipeline

In [None]:
import pandas as pd
  ...: import xgboost as xgb
  ...: import numpy as np
  ...: from sklearn.preprocessing import StandardScaler
  ...: from sklearn.pipeline import Pipeline
  ...: from sklearn.model_selection import RandomizedSearchCV
names = ["crime","zone","industry","charles","no", 
  ...: "rooms","age", "distance","radial","tax",
  ...: "pupil","aam","lower","med_price"]
data = pd.read_csv("boston_housing.csv",names=names)
X, y = data.iloc[:,:-1],data.iloc[:,-1] 
xgb_pipeline = Pipeline[("st_scaler",
  ...: StandardScaler()), ("xgb_model",xgb.XGBRegressor())] 
gbm_param_grid = {
  ...:     'xgb_model__subsample': np.arange(.05, 1, .05),
  ...:     'xgb_model__max_depth': np.arange(3,20,1),
  ...:     'xgb_model__colsample_bytree': np.arange(.1,1.05,.05) }
randomized_neg_mse = RandomizedSearchCV(estimator=xgb_pipeline, 
  ...: param_distributions=gbm_param_grid, n_iter=10,
  ...: scoring='neg_mean_squared_error', cv=4)
randomized_neg_mse.fit(X, y)
print("Best rmse: ", np.sqrt(np.abs(randomized_neg_mse.best_score_))
print("Best model: ", randomized_neg_mse.best_estimator_)

In [None]:

# Create the parameter grid
gbm_param_grid = {
    'clf__learning_rate': np.arange(0.05, 1, 0.05), 
    'clf__max_depth': np.arange(3, 10, 1), 
    'clf__n_estimators': np.arange(50, 200, 50)

In [None]:
# possible add-ons
# Perform RandomizedSearchCV
randomized_roc_auc = RandomizedSearchCV(estimator=pipeline, param_distributions=gbm_param_grid, n_iter=2, scoring='roc_auc', cv=2, verbose=1)
# Fit the estimator 
randomized_roc_auc.fit(X, y)
# Compute metrics 
print(randomized_roc_auc.best_score_) 
print(randomized_roc_auc.best_estimator_)