In [1]:
import xgboost as xgb

In [3]:
from sklearn import datasets

iris = datasets.load_iris()

X = iris.data
y = iris.target

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21)

## Building the xgboost model

In [6]:
# convert the dataset to DMatrix format
# Dmatrix is the data structure unique for xgboost algorithm
D_train = xgb.DMatrix(X_train, label=y_train)   # use label for train
D_test = xgb.DMatrix(X_test)  # don't use lable for test data

In [7]:
# initialize a set of parameters

param = {'eta': 0.02, 'max_depth':4, 'objective':'multi:softmax', 'num_class':3}

In [8]:
# model
xgb_model = xgb.train(param, D_train, 20)  # where 20 is the no of iterations you want it to run

In [9]:
# predict
y_xgb_pred = xgb_model.predict(D_test)

In [10]:
# print the accuracy
from sklearn.metrics import accuracy_score
print('The test accuracy for the xgb model on iris dataset is:')
print(accuracy_score(y_test, y_xgb_pred))

The test accuracy for the xgb model on iris dataset is:
0.9111111111111111


#### Try to improve the model by changing the parameters

In [11]:
param = {'eta': 0.3, 'max_depth':3, 'objective':'multi:softmax', 'num_class':3}
# model
xgb_model2 = xgb.train(param, D_train, 20)  # where 20 is the no of iterations you want it to run
# predict
y_xgb_pred2 = xgb_model2.predict(D_test)

In [13]:
print('Accuracy', accuracy_score(y_test, y_xgb_pred2))

Accuracy 0.9333333333333333


# K-Fold Cross Validation

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
# initialize a model of your choice
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# instantiate the model object
knn_model =KNeighborsClassifier(n_neighbors=4)

In [19]:
# pass the the model you want to use and entire train and test data and the cv i.e the split you want
# remember to specify the mean
cross_val_score(knn_model, X, y, cv=5, scoring='accuracy').mean()

0.9733333333333334

Notice how the accuracy improved using cross validation instead of train_test_split
You can reduce/adjust the cv(k value) to see best value

In [21]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

cross_val_score(logreg, X, y, cv=5, scoring='accuracy').mean()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9733333333333334

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
knn_gscv = KNeighborsClassifier()
# already imported in cells above

In [26]:
params = {'n_neighbors':[5, 6, 10, 15, 25],
         'weights':['uniform','distance'],
         'algorithm':['auto', 'ball_tree', 'kd_tree','brute']
         }

In [27]:
grid_knn = GridSearchCV(estimator = knn_gscv, param_grid=params, cv=5, scoring='accuracy')

The gridsearch using cross validation

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [29]:
grid_knn.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [5, 6, 10, 15, 25],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [30]:
print(grid_knn.best_estimator_)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')


In [31]:
# the best parameter values from values you passed
grid_knn.best_params_

{'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform'}

In [33]:
y_pred = grid_knn.predict(X_test)

In [34]:
accuracy_score(y_test, y_pred)

0.9666666666666667

In [32]:
# alternative way to check the score
grid_knn.score(X_test, y_test)

0.9666666666666667