In [1]:
#numpy and pandas 
import numpy as np
import pandas as pd

#statistics of results
from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

## Regression

In [2]:
#import xgboost
import xgboost as xgb

In [3]:
# load dataset
dataset = pd.read_csv('winequalityN.csv')
dataset.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
# find columns with null values
null_cols = dataset.columns[dataset.isnull().any()]
null_cols

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'pH', 'sulphates'],
      dtype='object')

In [5]:
data=dataset.drop(['type'], axis=1)
from sklearn.linear_model import LinearRegression
imputer = LinearRegression()
for col in null_cols:
    if data[col].isnull().sum() == 0:
        continue
    X_train = data.dropna(subset=[col]).drop(null_cols, axis=1)
    y_train = data.dropna(subset=[col])[col]
    X_test = data[data[col].isnull()].drop(null_cols, axis=1)
    imputer.fit(X_train, y_train)
    data.loc[data[col].isnull(), col] = imputer.predict(X_test)
    
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_type = label_encoder.fit_transform(dataset.type)
label_encoder_name_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

print("Mapping of Label Encoded Classes", label_encoder_name_mapping, sep="\n")
print("Label Encoded Target Variable", encoded_type, sep="\n")

Mapping of Label Encoded Classes
{'red': 0, 'white': 1}
Label Encoded Target Variable
[1 1 1 ... 0 0 0]


In [7]:
encoded_type = pd.DataFrame(encoded_type, columns=['type'])
data = pd.concat([encoded_type, data], axis=1)

x=data.drop(['quality'], axis=1)
y=data.quality
data

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,6
1,1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,6
2,1,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,6
3,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6
4,1,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,5
6493,0,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.599552,11.2,6
6494,0,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,6
6495,0,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,5


In [8]:
from sklearn.preprocessing import StandardScaler

other_columns = x.iloc[:, 1:]

# apply StandardScaler() to the other columns
scaler = StandardScaler()
scaled_columns = scaler.fit_transform(other_columns)

# concatenate the scaled columns with the first column
scaled_x = pd.DataFrame(scaled_columns, columns=other_columns.columns)
X = pd.concat([x.type, scaled_x], axis=1)
X

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1,-0.167110,-0.423086,0.284188,3.206763,-0.315180,0.815565,0.959976,2.102214,-1.359601,-0.546005,-1.418558
1,1,-0.707277,-0.240782,0.146468,-0.808139,-0.200990,-0.931107,0.287618,-0.232332,0.508022,-0.277125,-0.831615
2,1,0.681724,-0.362318,0.559628,0.305943,-0.172443,-0.029599,-0.331660,0.134525,0.259006,-0.613225,-0.328521
3,1,-0.012776,-0.666158,0.008748,0.642270,0.055936,0.928254,1.243074,0.301278,-0.176773,-0.882106,-0.496219
4,1,-0.012776,-0.666158,0.008748,0.642270,0.055936,0.928254,1.243074,0.301278,-0.176773,-0.882106,-0.496219
...,...,...,...,...,...,...,...,...,...,...,...,...
6492,0,-0.784444,1.582260,-1.643894,-0.724058,0.969451,0.083090,-1.269422,0.067824,1.441834,0.327857,0.006875
6493,0,-1.015944,1.278419,-1.506174,-0.682017,0.170125,0.477500,-1.145567,0.141195,1.877613,0.459284,0.593818
6494,0,-0.707277,1.035347,-1.299594,-0.660996,0.569788,-0.085943,-1.340197,0.347969,1.255072,1.470599,0.426120
6495,0,-1.015944,1.855716,-1.368454,-0.724058,0.541241,0.083090,-1.269422,0.257923,2.188884,1.201718,-0.244672


In [27]:
from sklearn.preprocessing import LabelEncoder

# create an instance of LabelEncoder
le = LabelEncoder()

# fit the encoder to the target variable
le.fit(y)

# transform the target variable into a numerical representation
y = le.transform(y)

In [28]:
#define model
xgb_model = xgb.XGBRegressor(objective="reg:linear", random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [29]:
#xgb.XGBClassifier is for classification
xgb_model.fit(X_train, y_train)



In [30]:
y_pred = xgb_model.predict(X_test)

In [31]:
mse=mean_squared_error(y_test, y_pred)

In [32]:
#print error
print(np.sqrt(mse))

0.6352450427298683


## Binary classification

In [33]:
#define model
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# train the XGBClassifier using the encoded labels
xgb_model.fit(X_train, y_train)

In [34]:
#predict 
y_pred = xgb_model.predict(X_test)

In [35]:
#accuracy 
accuracy_score(y_test, y_pred)

0.6482051282051282

## Multiclass classification

In [36]:
#show y
y

array([3, 3, 3, ..., 3, 2, 3], dtype=int64)

In [37]:
#multiclass classification 
xgb_multi = xgb.XGBClassifier(objective="multi:softprob", random_state=19)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
xgb_multi.fit(X_train, y_train)

In [None]:
#tasks: implement xgb regression, binary classification, and multi on train_test_split 

In [38]:
y_pred_multi = xgb_multi.predict(X_test)
accuracy_score(y_test, y_pred_multi)

0.6482051282051282

## Cross validation

Cross-validation using `KFold`

In [39]:
# diabetes = load_diabetes()

# X = diabetes.data
# y = diabetes.target

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

scores = []

for train_index, test_index in kfold.split(X):   
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    xgb_model = xgb.XGBRegressor(objective="reg:linear")
    xgb_model.fit(X_train, y_train)
    
    y_pred = xgb_model.predict(X_test)
    scores.append(mean_squared_error(y_test, y_pred))

KeyError: "None of [Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,\n       ...\n       6486, 6487, 6489, 6490, 6491, 6492, 6493, 6494, 6495, 6496],\n      dtype='int32', length=5847)] are in the [columns]"

In [None]:
np.sqrt(scores)

Cross-validation using `cross_val_score`

In [40]:
xgb_model = xgb.XGBRegressor(objective="reg:linear", random_state=42)

scores = cross_val_score(xgb_model, X, y, scoring="neg_mean_squared_error", cv=5)

np.sqrt(-scores)



array([0.84555681, 0.75043489, 0.79659832, 0.70682249, 0.80883464])

## Hyperparameter searching

In [None]:
# diabetes = load_diabetes()

# X = diabetes.data
# y = diabetes.target

# xgb_model = xgb.XGBRegressor()

# params = {
#     "colsample_bytree": uniform(0.7, 0.3),
#     "gamma": uniform(0, 0.5),
#     "learning_rate": uniform(0.03, 0.3), # default 0.1 
#     "max_depth": randint(2, 6), # default 3
#     "n_estimators": randint(100, 150), # default 100
#     "subsample": uniform(0.6, 0.4)
# }

# search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=1, n_jobs=1, return_train_score=True)

# search.fit(X, y)

# report_best_scores(search.cv_results_, 1)

## Early stopping

The number of boosted trees (`n_estimators`) to train is uncapped, rather training continues until validation has not improved in *n* rounds

In [41]:
# cancer = load_breast_cancer()
# X = cancer.data
# y = cancer.target
# if more than one evaluation metric are given the last one is used for early stopping
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, eval_metric="auc")
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

xgb_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])

y_pred = xgb_model.predict(X_test)

accuracy_score(y_test, y_pred)

[0]	validation_0-auc:nan
[1]	validation_0-auc:nan
[2]	validation_0-auc:nan
[3]	validation_0-auc:nan




[4]	validation_0-auc:nan
[5]	validation_0-auc:nan


0.5458461538461539

`xgb_model.fit()` will produce a model from the last iteration, not the best one, so to get the optimum model consider retraining over `xgb_model.best_iteration` rounds.

In [42]:
print("best score: {0}, best iteration: {1}, best ntree limit {2}".format(xgb_model.best_score, xgb_model.best_iteration, xgb_model.best_ntree_limit))

best score: nan, best iteration: 0, best ntree limit 1


## Evaluations

In [None]:
# cancer = load_breast_cancer()

# X = cancer.data
# y = cancer.target

xgb_model = xgb.XGBClassifier(objective="binary:logistic", n_estimators=20, random_state=42, eval_metric=["auc", "error", "error@0.6"])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

y_pred = xgb_model.predict(X_test)

In [None]:
#avgscore
y_pred = xgb_model.predict(X_test)
accuracy_score(y_test, y_pred)

Gaussian Naives-Bayes

In [None]:
# train a Gaussian Naive Bayes classifier on the training set
from sklearn.naive_bayes import GaussianNB
# instantiate the model
gnb = GaussianNB()
# fit the model
gnb.fit(X_train, y_train)

In [None]:
y_pred2 = gnb.predict(X_test)
y_pred2

In [None]:
#accuracy
from sklearn.metrics import accuracy_score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

In [None]:
#Now try the same with the other two datasets