In [1]:
# This is a practice of SciKit-Learn of data science analysis. The data set is from Kaggle.com.

from sklearn import datasets, metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
from pprint import pprint


In [2]:
# Load train data which consists of the features and labels.
train_data = pd.read_csv("./data/train.csv", header = None)
train_label = pd.read_csv("./data/trainLabels.csv", header = None)

# Load test data.
test_data = pd.read_csv("./data/test.csv", header = None)


In [3]:
print(f"shape of training data: {train_data.shape}")
print(f"description of training data:\n{train_data.describe()}")
print(f"the first 5 samples:\n{train_data.head()}")


shape of training data: (1000, 40)
description of training data:
                0            1            2            3            4   \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000   
mean      0.025596    -0.024526    -0.024088    -0.002271     1.092329   
std       1.008282     1.016298     0.979109     0.970575     4.538834   
min      -3.365711    -3.492086    -2.695602    -3.460471   -16.421901   
25%      -0.669010    -0.693937    -0.698830    -0.617557    -1.801997   
50%       0.027895    -0.033194     0.008145     0.002327     0.862818   
75%       0.762520     0.682753     0.661434     0.640743     3.843172   
max       3.326246     3.583870     2.546507     3.088738    17.565345   

                5            6            7            8            9   ...  \
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000  ...   
mean     -0.006250     0.497342    -0.037883     0.026391    -0.003597  ...   
std       0.989128     2.118819

In [4]:
# split into training / test sets
x_train, x_val, y_train, y_val = train_test_split(train_data,
                                                  train_label,
                                                  test_size=0.25,
                                                  random_state=4
                                                 )


In [5]:
# Establish Gradient Boosting Classifier model
classifier = GradientBoostingClassifier(loss = 'deviance',    # loss function to be optimized
                                        learning_rate = 0.1,
                                        n_estimators = 100,   # The number of boosting stages to perform
                                        criterion = 'friedman_mse'
                                       )

# train model on training set
classifier.fit(x_train, y_train)

# predict test set
y_pred = classifier.predict(x_val)

acc = metrics.accuracy_score(y_val, y_pred)
print(f"Accuracy: {acc}")


  y = column_or_1d(y, warn=True)


Accuracy: 0.892


# Random Search with Cross Validation

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['sqrt', 'log2', None]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 50, num = 10)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               }

pprint(random_grid)


{'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, None],
 'max_features': ['sqrt', 'log2', None],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [7]:
# Use the random grid to search for best hyperparameters

# Random search of parameters, using 3 fold cross validation, 
# search across 50 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = classifier,
                               param_distributions = random_grid,
                               n_iter = 50,
                               # scoring='neg_mean_absolute_error',
                               cv = 3,
                               verbose = 2,
                               random_state = 42,
                               n_jobs = -1,
                               return_train_score = True)

# Fit the random search model
rf_random.fit(x_train, y_train);


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   43.1s finished
  y = column_or_1d(y, warn=True)


In [8]:
# dump the random search with cross validation results
print(f"the best score: {rf_random.best_score_}")

print(f"the best parameters:\n{rf_random.best_params_}\n")

print(f"the cross validation result:\n{rf_random.cv_results_}\n")


the best score: 0.872
the best parameters:
{'n_estimators': 900, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None}

the cross validation result:
{'mean_fit_time': array([1.68492198, 0.62570842, 1.41511075, 0.5870928 , 0.98637644,
       0.6369915 , 0.82896058, 1.2124668 , 1.36618821, 0.8067228 ,
       0.9803768 , 0.46770302, 1.29338344, 1.61834979, 0.59351865,
       0.80501548, 0.85136803, 0.81166991, 0.81134693, 0.55818311,
       1.47131817, 0.76717607, 1.36279837, 1.5773836 , 0.59965944,
       0.70703634, 1.59629544, 0.53787533, 0.78666202, 0.84336662,
       0.87835526, 1.57327628, 1.54287728, 0.6748534 , 0.99337943,
       0.63269869, 0.78437527, 0.60032852, 1.34182795, 0.61681326,
       0.83139873, 0.77941664, 0.64904507, 1.03622476, 0.92968551,
       0.84700545, 1.33263516, 0.82904832, 1.2430106 , 0.67020551]), 'std_fit_time': array([0.03711487, 0.01294743, 0.01920036, 0.01375295, 0.02409599,
       0.0157638 , 0.01201591, 0.02472606

In [9]:
# re-build the model using the best parameters
classifier_bestparam = GradientBoostingClassifier(n_estimators = rf_random.best_params_['n_estimators'],
                                                 min_samples_split = rf_random.best_params_['min_samples_split'],
                                                 min_samples_leaf = rf_random.best_params_['min_samples_leaf'],
                                                 max_features = rf_random.best_params_['max_features'],
                                                 max_depth = rf_random.best_params_['max_depth']
                                                 )

# train the model on split data
classifier_bestparam.fit(x_train, y_train)

# predict the test set
y_pred = classifier_bestparam.predict(x_val)

# mean squared error
print(f"mean-squared error: {metrics.mean_squared_error(y_val, y_pred)}\n")

acc = metrics.accuracy_score(y_val, y_pred)
print(f"Accuracy: {acc}")


  y = column_or_1d(y, warn=True)


mean-squared error: 0.1

Accuracy: 0.9


In [10]:
# Use the re-built model on whole training data
classifier_bestparam.fit(train_data, train_label)

# predict the test set
y_pred = classifier_bestparam.predict(test_data)

Id = np.arange(1, 9001)
submit = pd.DataFrame(data = Id, columns = ['Id'])
submit['Solution'] = y_pred

submit.to_csv("PaulChi_Day048_HW.csv", index = False)


  y = column_or_1d(y, warn=True)
