In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Lasso,Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas / scikit-learn diagnostics
# are hidden as well; consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")
# Seed NumPy's global random generator so code that draws from it is repeatable.
np.random.seed(42)

Data Set Information:

Bike-sharing systems are a new generation of traditional bike rentals in which the whole process of membership, rental, and return has become automatic. Through these systems, a user can easily rent a bike at one location and return it at another. Currently, there are over 500 bike-sharing programs around the world, comprising over 500 thousand bicycles. Today, there is great interest in these systems because of their important role in traffic, environmental, and health issues.

Apart from the interesting real-world applications of bike-sharing systems, the characteristics of the data generated by these systems make them attractive for research. In contrast to other transport services such as buses or subways, the duration of travel and the departure and arrival positions are explicitly recorded in these systems. This feature turns a bike-sharing system into a virtual sensor network that can be used for sensing mobility in the city. Hence, it is expected that most of the important events in the city could be detected by monitoring these data.

cnt: count of total rental bikes, including both casual and registered. This is the target variable predicted in this notebook.

In [18]:
# Read the hourly records and split off the target column `cnt`.
df = pd.read_csv('hour.csv')
y = df.pop('cnt')

# Discard the columns that are not used as model inputs.
# NOTE(review): `casual` and `registered` are described as the components of
# `cnt`, so keeping them would presumably leak the target.
df = df.drop(columns=['instant', 'casual', 'dteday', 'registered'])
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0


In [3]:
# Split BEFORE imputing so that test rows never contribute to the statistics
# used to fill missing values. An explicit random_state (same value as the
# global seed set in the import cell) keeps the split reproducible even when
# cells are executed out of order.
train, test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

# Work on independent copies: later cells add engineered columns to these
# frames, and they should not be tied to `df`.
train = train.copy()
test = test.copy()

# Column means are learned from the training rows only and then applied to
# both splits.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

In [4]:
def fit_predict(train, test, y_train, y_test, scaler = None):
    """Fit a default Ridge model on the training data and print its test MAE.

    Parameters
    ----------
    train, test
        Feature matrices used for fitting and for evaluation.
    y_train, y_test
        Targets matching `train` and `test`.
    scaler : optional
        Object exposing `fit_transform` and `transform`. When given, it is
        fitted on `train` and applied to both matrices before modelling.

    Returns
    -------
    None
        The score is only printed, in the form ``MAE score: <value>``.
    """
    if scaler is not None:
        # Fit the scaler on the training rows only, then reuse it for the test rows.
        train = scaler.fit_transform(train)
        test = scaler.transform(test)

    model = Ridge()
    model.fit(train, y_train)
    mae = mean_absolute_error(y_test, model.predict(test))
    print('MAE score:', mae)

In [5]:
# Baseline: default Ridge on the columns loaded above, with no scaling and no
# additional features. The label and the score are printed on the same line.
print('Baseline', end=' ')
fit_predict(train=train, test=test, y_train=y_train, y_test=y_test)

Baseline MAE score: 104.802725573


In [6]:
def feat_eng(df):
    """Return a new frame with four hand-crafted interaction columns appended.

    The input frame is left untouched; callers must use the returned frame
    (e.g. ``train = feat_eng(train)``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``hum``, ``temp``, ``windspeed`` and ``atemp``.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with these extra columns:
        ``eng1`` = hum / temp, ``eng2`` = windspeed * hum,
        ``eng3`` = temp * hum, ``eng4`` = temp * atemp.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # `assign` builds a new frame instead of writing into the caller's object,
    # which may be a slice of a larger frame.
    # NOTE(review): rows where temp == 0 yield inf/NaN in `eng1`; confirm the
    # data never contains such rows before feeding the result to a model.
    return df.assign(
        eng1=df['hum'] / df['temp'],
        eng2=df['windspeed'] * df['hum'],
        eng3=df['temp'] * df['hum'],
        eng4=df['temp'] * df['atemp'],
    )

# Extend both splits with the same engineered columns.
train, test = feat_eng(train), feat_eng(test)

In [7]:
def get_feat_imp(train,y_train,alpha=0.01):
    """Fit a Lasso model and return its coefficient vector.

    Parameters
    ----------
    train
        Feature matrix the model is fitted on.
    y_train
        Targets matching `train`.
    alpha : float, default 0.01
        Regularisation strength passed to `Lasso`.

    Returns
    -------
    The fitted model's ``coef_`` attribute (one entry per input column).
    """
    # NOTE(review): coefficients are taken from a fit on the features as given;
    # if the columns are on different scales their magnitudes are not directly
    # comparable as "importance".
    model = Lasso(alpha=alpha)
    model.fit(train, y_train)
    coefficients = model.coef_
    return coefficients
# Compare the total number of columns with the number that keep a non-zero
# Lasso coefficient.
fi = get_feat_imp(train, y_train)
print('number of features is {}'.format(len(train.columns)))
print('number of non zero features:', (fi != 0).sum())

number of features is 16
number of non zero features: 16


In [8]:
def create_poly(train,test,degree):
    """Expand both feature matrices with polynomial terms up to `degree`.

    The transformer is fitted on `train` only and then applied to `test`, so
    both outputs share exactly the same fitted transformer.

    Parameters
    ----------
    train, test
        Feature matrices with the same columns.
    degree : int
        Degree passed to `PolynomialFeatures`.

    Returns
    -------
    tuple
        ``(train_poly, test_poly)``, the transformed training and test matrices.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # Only transform the test set: refitting on it (as `fit_transform` would)
    # is the wrong pattern for held-out data.
    test_poly = poly.transform(test)
    return train_poly,test_poly

In [11]:
# Score Ridge on polynomial expansions of degree 1 through 5, printing one
# MAE per degree followed by a separator line.
for degree in range(1, 6):
    train_poly, test_poly = create_poly(train, test, degree)
    print('No feature selection degree', degree)
    fit_predict(train_poly, test_poly, y_train, y_test)
    print('-' * 10)

No feature selection degree 1
MAE score: 103.454477177
----------
No feature selection degree 2
MAE score: 89.5565130706
----------
No feature selection degree 3
MAE score: 77.537496916
----------
No feature selection degree 4
MAE score: 71.9204017223
----------
No feature selection degree 5
MAE score: 345.019388746
----------


In [10]:
# Relative MAE reduction, in percent, of the best polynomial model over the
# baseline. Both scores are hard-coded here.
# NOTE(review): the values look copied from the printed cell outputs and must be
# updated by hand if the scores change; the best degree also appears to have
# been picked using test-set MAE, so this figure is likely optimistic.
original_score = 104.802725573
best_score = 71.9204017223
improvement = np.round(
    100 * (original_score - best_score) / original_score,
    decimals=2,
)
print('overall improvement is {} %'.format(improvement))

overall improvement is 31.38 %
