# Using XGBoost on Bike Rentals Dataset

In [73]:
# import pandas and numpy 
import pandas as pd
import numpy as np

In [74]:
# read the raw bike rentals CSV into a DataFrame
df_bikes = pd.read_csv(filepath_or_buffer='bike_rentals.csv')

In [75]:
# preview the top of the frame (n=5 is also the default)
df_bikes.head(n=5)

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [76]:
# describe the data
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; a count below 731 signals missing values in that column.
df_bikes.describe()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,731.0,731.0,730.0,730.0,731.0,731.0,731.0,731.0,730.0,730.0,728.0,726.0,731.0,731.0,731.0
mean,366.0,2.49658,0.5,6.512329,0.028728,2.997264,0.682627,1.395349,0.495587,0.474512,0.627987,0.190476,848.176471,3656.172367,4504.348837
std,211.165812,1.110807,0.500343,3.448303,0.167155,2.004787,0.465773,0.544894,0.183094,0.163017,0.142331,0.077725,686.622488,1560.256377,1937.211452
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.05913,0.07907,0.0,0.022392,2.0,20.0,22.0
25%,183.5,2.0,0.0,4.0,0.0,1.0,0.0,1.0,0.336875,0.337794,0.521562,0.134494,315.5,2497.0,3152.0
50%,366.0,3.0,0.5,7.0,0.0,3.0,1.0,1.0,0.499166,0.487364,0.627083,0.180971,713.0,3662.0,4548.0
75%,548.5,3.0,1.0,9.75,0.0,5.0,1.0,2.0,0.655625,0.608916,0.730104,0.233218,1096.0,4776.5,5956.0
max,731.0,4.0,1.0,12.0,1.0,6.0,1.0,3.0,0.861667,0.840896,0.9725,0.507463,3410.0,6946.0,8714.0


Comparing the mean and median (50%) gives an indication of skewness. As you can see, the mean and median are close to one another for most columns, so
the data is roughly symmetrical.

In [77]:
# gain more info from the data
# Prints the index range, each column's non-null count and dtype, and the
# frame's memory usage.
df_bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    float64
 3   yr          730 non-null    float64
 4   mnth        730 non-null    float64
 5   holiday     731 non-null    float64
 6   weekday     731 non-null    float64
 7   workingday  731 non-null    float64
 8   weathersit  731 non-null    int64  
 9   temp        730 non-null    float64
 10  atemp       730 non-null    float64
 11  hum         728 non-null    float64
 12  windspeed   726 non-null    float64
 13  casual      731 non-null    int64  
 14  registered  731 non-null    int64  
 15  cnt         731 non-null    int64  
dtypes: float64(10), int64(5), object(1)
memory usage: 91.5+ KB


As you can see, .info() gives the number of rows, number of columns, column types, and non-null values. Since the number of non-null
values differs between columns, null values must be present.

In [78]:
# count the missing entries in each column
df_bikes.isnull().sum(axis=0)

instant       0
dteday        0
season        0
yr            1
mnth          1
holiday       0
weekday       0
workingday    0
weathersit    0
temp          1
atemp         1
hum           3
windspeed     5
casual        0
registered    0
cnt           0
dtype: int64

In [79]:
# show every row that has at least one missing value
df_bikes.loc[df_bikes.isna().any(axis='columns')]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
56,57,2011-02-26,1.0,0.0,2.0,0.0,6.0,0.0,1,0.2825,0.282192,0.537917,,424,1545,1969
81,82,2011-03-23,2.0,0.0,3.0,0.0,3.0,1.0,2,0.346957,0.337939,0.839565,,203,1918,2121
128,129,2011-05-09,2.0,0.0,5.0,0.0,1.0,1.0,1,0.5325,0.525246,0.58875,,664,3698,4362
129,130,2011-05-10,2.0,0.0,5.0,0.0,2.0,1.0,1,0.5325,0.522721,,0.115671,694,4109,4803
213,214,2011-08-02,3.0,0.0,8.0,0.0,2.0,1.0,1,0.783333,0.707071,,0.20585,801,4044,4845
298,299,2011-10-26,4.0,0.0,10.0,0.0,3.0,1.0,2,0.484167,0.472846,0.720417,,404,3490,3894
388,389,2012-01-24,1.0,1.0,1.0,0.0,2.0,1.0,1,0.3425,0.349108,,0.123767,439,3900,4339
528,529,2012-06-12,2.0,1.0,6.0,0.0,2.0,1.0,2,0.653333,0.597875,0.833333,,477,4495,4972
701,702,2012-12-02,4.0,1.0,12.0,0.0,0.0,0.0,2,,,0.823333,0.124379,892,3757,4649
730,731,2012-12-31,1.0,,,0.0,1.0,0.0,2,0.215833,0.223487,0.5775,0.154846,439,2290,2729


As you can see from the output, there are null values in the windspeed, humidity, and temperature columns along with the last
row.

In [80]:
# handle null values
# Fill the missing windspeed readings with the column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df_bikes['windspeed'] selection: that
# chained in-place form raises a FutureWarning in pandas 2.x and does not
# update df_bikes at all once Copy-on-Write is enabled.
df_bikes['windspeed'] = df_bikes['windspeed'].fillna(df_bikes['windspeed'].mean())

The median is often a better choice than the mean. The median guarantees that half the data is greater than the given value and half the data is lower. The mean, by contrast, is vulnerable to outliers. Note that the cell above fills the gaps with the mean; replace .mean() with .median() there to use the median instead. Earlier, df_bikes[df_bikes.isna().any(axis=1)] revealed rows 56 and 81 with null values for windspeed. These rows may be displayed using .iloc, short for index location.

In [81]:
# look at positions 56 and 81 (all columns), which previously lacked windspeed
df_bikes.iloc[[56, 81], :]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
56,57,2011-02-26,1.0,0.0,2.0,0.0,6.0,0.0,1,0.2825,0.282192,0.537917,0.190476,424,1545,1969
81,82,2011-03-23,2.0,0.0,3.0,0.0,3.0,1.0,2,0.346957,0.337939,0.839565,0.190476,203,1918,2121


As expected, the null values have been replaced with the windspeed mean (0.190476).
**Tip:**
It's common for users to make mistakes with single or double brackets when using pandas. .iloc uses single brackets for one index as
follows: df_bikes.iloc[56]. Now, .iloc also accepts a list inside brackets to allow multiple indices. Multiple indices require
double brackets as follows: df_bikes.iloc[[56, 81]].

In [82]:
# handling null values in hum column
# Median humidity of each row's season, aligned with df_bikes row by row.
# It is computed once and reused; the original cell evaluated the same
# groupby/transform twice and discarded the first result.
season_hum_median = df_bikes.groupby('season')['hum'].transform('median')

# Only the null humidity entries take their season's median; others are unchanged.
df_bikes['hum'] = df_bikes['hum'].fillna(season_hum_median)

In some cases, it may be advantageous to replace null values with data from specific rows.
When correcting temperature, aside from consulting historical records, taking the mean temperature of the day before and the day after should give a good estimate. To find null values of the 'temp' column, enter the following code:

In [83]:
# rows whose 'temp' reading is missing
df_bikes.loc[df_bikes['temp'].isnull()]

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
701,702,2012-12-02,4.0,1.0,12.0,0.0,0.0,0.0,2,,,0.823333,0.124379,892,3757,4649


As you can see, index 701 contains null values. To find the mean temperature of the day before and the day after the 701 index, complete the following steps:
1. Sum the temperatures in rows 700 and 702 and divide by 2. Do this for the 'temp' and 'atemp' columns
2. Replace the null values

In [84]:
# Estimate the missing readings in row 701 as the average of the neighbouring
# days (rows 700 and 702), for both 'temp' and 'atemp'.
mean_temp = (df_bikes.iloc[700]['temp'] + df_bikes.iloc[702]['temp'])/2
mean_atemp = (df_bikes.iloc[700]['atemp'] + df_bikes.iloc[702]['atemp'])/2

# Assign the filled Series back to each column. Calling
# fillna(..., inplace=True) on a df_bikes[...] selection is chained in-place
# mutation: it raises a FutureWarning in pandas 2.x and leaves df_bikes
# unchanged once Copy-on-Write is enabled.
df_bikes['temp'] = df_bikes['temp'].fillna(mean_temp)
df_bikes['atemp'] = df_bikes['atemp'].fillna(mean_atemp)

In [85]:
# import the datetime module
# (the .dt used below is the pandas Series datetime accessor, not this alias)
import datetime as dt

# Convert the date strings to datetime64 values. The infer_datetime_format
# argument has been dropped: it is deprecated since pandas 2.0, where strict
# format inference is already the default and passing it only raises a warning.
df_bikes['dteday'] = pd.to_datetime(df_bikes['dteday'])

# Rebuild 'mnth' from the parsed dates, which also fills the missing month value.
df_bikes['mnth'] = df_bikes['dteday'].dt.month


As you can see, the month values are all correct, but the year value needs to be changed. The years of the last five rows in the 'dteday' column are all 2012, but the corresponding year provided by the 'yr' column is 1.0.
Why?
The data is normalized, meaning it's converted to values between 0 and 1.
Normalized data is often more efficient because machine learning weights do not have to adjust for different ranges. You can use the .loc method to fill in the correct value. The .loc method is used to locate entries by row and column as follows:

In [86]:
# handling na value in 'yr' column
# Row 730 is dated 2012-12-31 and was the only row missing 'yr';
# 2012 is encoded as 1.0 in this column.
df_bikes.loc[730, 'yr'] = 1.0

In [87]:
# remove the date column now that the month has been extracted from it
df_bikes = df_bikes.drop(columns=['dteday'])

In the bike rentals dataset, df_bikes['cnt'] is the number of bike rentals in a given day. Predicting this column would be of great use to a bike rental company. Our problem is to predict the correct number of bike rentals on a given day based on data such as whether this day is a holiday or working day, forecasted temperature, humidity, windspeed, and so on. According to the dataset, df_bikes['cnt'] is the sum of df_bikes['casual'] and df_bikes['registered']. If df_bikes['registered'] and df_bikes['casual'] were included as input columns, predictions would always be 100% accurate since these columns would always sum to the correct result. Although perfect predictions are ideal in theory, it makes no sense to include input columns that would be unknown in reality. All current columns may be used to predict df_bikes['cnt'] except for 'casual' and 'registered'. Drop the 'casual' and 'registered' columns using the .drop method.

In [88]:
# 'casual' + 'registered' sum to the target 'cnt', so neither may be a predictor
df_bikes = df_bikes.drop(columns=['casual', 'registered'])

### Saving data for future use

In [89]:
# write the cleaned frame to disk without the row index
df_bikes.to_csv(path_or_buf='bike_rentals_cleaned.csv', index=False)

### Declaring predictor and target columns

It's standard to group the predictor columns with a capital X, and the target column as a lowercase y. Since our target column is the last column, splitting the data into predictor and target columns may be done via slicing using index notation:

In [90]:
# Declaring predictor and target columns
# X: every row, every column except the last one.
X = df_bikes.iloc[:,:-1]
# y: every row, last column only (this relies on the target being the last column).
y = df_bikes.iloc[:,-1]

The comma separates columns from rows. The first colon, :, means that all rows are included. After the comma, :-1 means start at the first column and go all the way to the last column without including it. The second -1 takes the last column only.

### Accessing scikit-learn

In [91]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# split into training and test sets; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2,
)

Note the random_state=2 parameter. Whenever you see random_state=2, this means that you are choosing the seed of a pseudorandom
number generator to ensure reproducible results.

In [92]:
# train a linear regression model on the training split (fit returns the estimator)
lin_reg = LinearRegression().fit(X_train, y_train)

# predict the target for the held-out rows
y_pred = lin_reg.predict(X_test)

Compare the predictions with the test set. Scoring the model requires a basis of comparison. The standard for linear regression is the root mean squared error (RMSE). The RMSE requires two pieces: mean_squared_error, the mean of the squares of differences between predicted and actual values, and the square root, to keep the units the same. mean_squared_error may be imported, and the square root may be taken with Numerical Python, popularly known as NumPy, a blazingly fast numerical library that pandas is built on.

In [93]:
# scoring function
from sklearn.metrics import mean_squared_error

# mean squared error on the test set, then its square root to restore the units
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [94]:
# report the test-set RMSE to two decimals
print("RMSE: %0.2f" % rmse)

RMSE: 898.30


It's hard to know whether an error of 898 rentals is good or bad without knowing the expected range of rentals per day.
The .describe() method may be used on the df_bikes['cnt'] column to obtain the range and more:

In [95]:
# summary statistics of the target, to put the RMSE in context
df_bikes.loc[:, 'cnt'].describe()

count     731.000000
mean     4504.348837
std      1937.211452
min        22.000000
25%      3152.000000
50%      4548.000000
75%      5956.000000
max      8714.000000
Name: cnt, dtype: float64

With a range of 22 to 8714, a mean of 4504, and a standard deviation of 1937, an RMSE of 898 isn't bad, but it's not great either.

## XGBoost

In [96]:
import warnings

# Install a blanket "ignore" filter for every warning category.
# NOTE(review): this also hides deprecation warnings from pandas/xgboost.
warnings.simplefilter('ignore')

In [97]:
# import xgboost
from xgboost import XGBRegressor

# train a regressor with default hyperparameters (fit returns the estimator)
xg_reg = XGBRegressor().fit(X_train, y_train)

# predict the target for the held-out rows
y_pred = xg_reg.predict(X_test)

In [98]:
# score the XGBoost predictions against the test set: MSE first, then its root
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

In [99]:
# report the test-set RMSE to two decimals
print("RMSE: %0.2f" % rmse)

RMSE: 693.65


XGBRegressor performs substantially better!

### Cross-validation

One test score is not reliable because splitting the data into different training and test sets would give different results. In effect, splitting
the data into a training set and a test set is arbitrary, and a different random_state will give a different RMSE. One way to address the score discrepancies between different splits is k-fold cross-validation. The idea is to split the data multiple times
into different training sets and test sets, and then to take the mean of the scores. The number of splits, called folds, is denoted by k. It's
standard to use k = 3, 4, 5, or 10 splits.

Cross-validation works by fitting a machine learning model on the first training set and scoring it against the first test set. A different training set and test set are provided for the second split, resulting in a new machine learning model with its own score. A third split results in a new model and scores it against another test set. There is going to be overlap in the training sets, but not the test sets. The number of folds is flexible and depends on the data. Five folds is standard because 20% of the data is held back as the test set each time. With 10 folds, only 10% of the data is held back; however, 90% of the data is available for training and the mean is less vulnerable to outliers. For a smaller dataset, three folds may work better. At the end, there will be k different scores evaluating the model against k different test sets. Taking the mean score of the k folds gives a more reliable score than any single fold. cross_val_score is a convenient way to implement cross-validation. cross_val_score takes a machine learning algorithm as input, along with the predictor and target columns, with optional additional parameters that include a scoring metric and the desired number of folds.

### Cross-Validation with Linear Regression

In [100]:
# cross-validation helper
from sklearn.model_selection import cross_val_score

# estimator to evaluate
model = LinearRegression()

# 10-fold cross-validation over the full X and y; each score is a negated MSE
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=10,
)

Why scoring='neg_mean_squared_error'? Scikit-learn is designed to select the highest score when training models. This works well for accuracy, but not for errors when the lowest is best. By taking the negative of each mean squared error, the lowest ends up being the highest. This is compensated for later with rmse = np.sqrt(-scores), so the final results are positive.

In [101]:
# flip the sign of the negated MSE scores, then take the root to get per-fold RMSE
rmse = np.sqrt(np.negative(scores))

In [102]:
# per-fold RMSE rounded to two decimals, followed by the mean across folds
print('Reg rmse:', rmse.round(2))
print('RMSE mean: %0.2f' % rmse.mean())

Reg rmse: [ 503.82  840.79 1141.08  728.45  640.34  970.22 1133.38 1252.6  1084.62
 1425.34]
RMSE mean: 972.06


Linear regression has a mean error of 972.06. This is somewhat worse than the 898.30 obtained before from the single train/test split. The point here is not whether the score is better or worse. The point is that it's a better estimation of how linear regression will perform on unseen data. Using cross-validation is always recommended for a better estimate of the score.

### Cross-validation with XGBoost

In [103]:
# estimator to evaluate: XGBoost regressor with default hyperparameters
model = XGBRegressor()

# 10-fold cross-validation over the full X and y; each score is a negated MSE
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
    cv=10,
)

In [104]:
# flip the sign of the negated MSE scores, then take the root to get per-fold RMSE
rmse = np.sqrt(np.negative(scores))

In [105]:
# per-fold RMSE rounded to two decimals, followed by the mean across folds
print('Reg rmse:', rmse.round(2))
print('RMSE mean: %0.2f' % rmse.mean())

Reg rmse: [ 658.25  694.13  553.76  673.01  833.37  999.37 1006.6   748.13  899.66
 1730.47]
RMSE mean: 879.68


XGBRegressor wins again, besting linear regression by about 10%.