In [96]:
# read original data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Dataset_dir = "C:/Users/0stix/Datasets/bike_sharing_demand/"

df_orig_train = pd.read_csv(Dataset_dir+"train.csv")
df_orig_test = pd.read_csv(Dataset_dir+"test.csv")
df_orig_sample = pd.read_csv(Dataset_dir+"sampleSubmission.csv")

In [97]:
X = df_orig_train.drop(["casual", "registered", "count"], axis=1).copy()
y = df_orig_train["count"].copy()

In [98]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle


def rmse(y, pred):
    return np.sqrt(mean_squared_error(y, pred))


def evaluate_regr(y, pred):
    dct_ = {'rmsle_val': rmsle(y, pred),
            'rmse_val': rmse(y, pred),
            'mae_val': mean_absolute_error(y, pred)}
    print(dct_.items())

In [99]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

lin_attrib = ['temp','atemp','humidity','windspeed']
cat_attrib = list(set(X)-set(lin_attrib)-set(['datetime']))

class DataFrameEditor(BaseEstimator,TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        df_copy = X.copy()
        df_copy['datetime'] = df_copy.apply(pd.to_datetime)
        df_copy['year'] = df_copy.datetime.apply(lambda x: x.year)
        df_copy['month'] = df_copy.datetime.apply(lambda x: x.month)
        df_copy['day'] = df_copy.datetime.apply(lambda x: x.day)
        df_copy['hour'] = df_copy.datetime.apply(lambda x: x.hour)
        df_copy.drop('datetime', axis=1, inplace=True)
        return df_copy

class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [100]:
lin_pipeline = Pipeline([
    ('selector', DataFrameSelector(lin_attrib)),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attrib)),
    ('1hot_encoder', OneHotEncoder(sparse=False))
])


nd_pipeline = FeatureUnion(transformer_list=[
    ('lin_pl', lin_pipeline),
    ('cat_pl', cat_pipeline)
])

full_pipeline = Pipeline([
    ('df_edt',DataFrameEditor()),
    ('bd_pl',nd_pipeline)
])


X = full_pipeline.fit_transform(X)
y_log = np.log1p(y)

In [101]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

dct_model = {
    'rf_reg':RandomForestRegressor(n_estimators=500),
    'gbm_reg':GradientBoostingRegressor(n_estimators=500),
    'xgb_reg':XGBRegressor(n_estimators=500),
    'lgbm_reg':LGBMRegressor(n_estimators=500)
}

for name_, model_ in dct_model.items():
    print(name_)
    X_train, X_val, y_train_log, y_val_log = train_test_split(X, y_log, test_size=0.3, random_state=0)
    model_.fit(X_train, y_train_log)
    y_pred_log = model_.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val = np.expm1(y_val_log)
    evaluate_regr(y_val, y_pred)

rf_reg
dict_items([('rmsle_val', 1.1743201485317574), ('rmse_val', 156.77034122473484), ('mae_val', 105.19221785243879)])
gbm_reg
dict_items([('rmsle_val', 1.1663981997256434), ('rmse_val', 159.2440777245701), ('mae_val', 107.10088597124299)])
xgb_reg
dict_items([('rmsle_val', 1.2379208888786128), ('rmse_val', 174.34446509629805), ('mae_val', 117.27102496918914)])
lgbm_reg
dict_items([('rmsle_val', 1.1656034728744467), ('rmse_val', 156.29860146654372), ('mae_val', 105.40760586564821)])


In [102]:
X_test = full_pipeline.transform(df_orig_test)
X_test[0]

array([-1.22841418, -1.45029236, -0.30588308,  1.61722711,  1.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        0.        ])

In [103]:
X_test.shape

(6493, 16)

In [104]:
y_pred_log = dct_model['lgbm_reg'].predict(X_test)
y_pred = np.expm1(y_pred_log)
y_pred[0]

50.250460827743694

In [108]:
df_submission = pd.DataFrame()
df_submission['datetime'] = df_orig_test['datetime']
df_submission['count'] = y_pred
df_submission.set_index('datetime',inplace=True)

In [111]:
df_submission.to_csv('submission.csv')

In [5]:
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self,attribute_names):
        self.attribute_names = attribute_names
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


X = df_orig_train.drop(["casual", "registered", "count"], axis=1).copy()
y = df_orig_train["count"].copy()
y_log = np.log1p(y_)

X_["datetime"] = X_.apply(pd.to_datetime)
X_.drop("datetime", axis=1, inplace=True)
X_ = pd.get_dummies(X_, columns=["year", "month", "day", "hour", "holiday", "workingday", "season", "weather"])

NameError: name 'y_' is not defined

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

X_train, X_val, y_train_log, y_val_log = train_test_split(X_, y_log, test_size=0.3, random_state=0)

lr_reg = LinearRegression()
lr_reg.fit(X_train, y_train_log)
y_pred_log = lr_reg.predict(X_val)
y_pred = np.expm1(y_pred_log)
y_val = np.expm1(y_val_log)
evaluate_regr(y_val, y_pred)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


def rmsle(y, pred):
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    squared_error = (log_y - log_pred) ** 2
    rmsle = np.sqrt(np.mean(squared_error))
    return rmsle


def rmse(y, pred):
    return np.sqrt(mean_squared_error(y, pred))


def evaluate_regr(y, pred):
    dct_ = {'rmsle_val': rmsle(y, pred),
            'rmse_val': rmse(y, pred),
            'mae_val': mean_absolute_error(y, pred)}
    print(dct_.items())