In [14]:
# read original data
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Directory containing train.csv, test.csv and sampleSubmission.csv.
# Override with the BIKE_SHARING_DATA_DIR environment variable so the notebook
# runs on machines other than the original author's; the previous hard-coded
# location remains the default. Kept as a plain string (not a Path) so that
# any other code concatenating `Dataset_dir + "<file>"` continues to work.
Dataset_dir = os.environ.get(
    "BIKE_SHARING_DATA_DIR",
    "C:/Users/0stix/Datasets/bike_sharing_demand/",
)

# os.path.join tolerates the directory being given with or without a trailing slash.
df_orig_train = pd.read_csv(os.path.join(Dataset_dir, "train.csv"))
df_orig_test = pd.read_csv(os.path.join(Dataset_dir, "test.csv"))
df_orig_sample = pd.read_csv(os.path.join(Dataset_dir, "sampleSubmission.csv"))

In [15]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


def rmsle(y, pred):
    """Return the root mean squared logarithmic error between `y` and `pred`.

    Both inputs are passed through ``np.log1p`` before differencing, so the
    metric is computed on ``log(1 + value)``.

    Args:
        y: Array-like of observed values.
        pred: Array-like of predicted values, same length as `y`.

    Returns:
        The square root of the mean squared difference of the log1p values.
    """
    log_gap = np.log1p(y) - np.log1p(pred)
    return np.sqrt(np.mean(log_gap ** 2))


def rmse(y, pred):
    """Return the root mean squared error between `y` and `pred`.

    Args:
        y: Array-like of observed values.
        pred: Array-like of predicted values.

    Returns:
        Square root of ``mean_squared_error(y, pred)``.
    """
    mse = mean_squared_error(y, pred)
    return np.sqrt(mse)


def evaluate_regr(y, pred):
    """Print RMSLE, RMSE and MAE for a set of regression predictions.

    The three scores are collected in a dict and its ``items()`` view is
    printed; nothing is returned.

    Args:
        y: Array-like of observed values.
        pred: Array-like of predicted values.
    """
    scores = {}
    scores['rmsle_val'] = rmsle(y, pred)
    scores['rmse_val'] = rmse(y, pred)
    scores['mae_val'] = mean_absolute_error(y, pred)
    print(scores.items())

In [16]:
# Build the feature matrix and the (log-transformed) target from the raw
# training frame. "casual" and "registered" are removed together with the
# target column "count".
X_ = df_orig_train.drop(["casual", "registered", "count"], axis=1).copy()
y_ = df_orig_train["count"].copy()
# The models are trained on log1p(count); predictions are mapped back with expm1.
y_log = np.log1p(y_)

# Parse ONLY the timestamp column. The previous `X_.apply(pd.to_datetime)`
# converted every column of the frame and tried to assign the resulting
# DataFrame to the single "datetime" column.
X_["datetime"] = pd.to_datetime(X_["datetime"])

# Vectorised calendar parts via the .dt accessor (no per-row Python lambdas).
X_["year"] = X_["datetime"].dt.year
X_["month"] = X_["datetime"].dt.month
X_["day"] = X_["datetime"].dt.day
X_["hour"] = X_["datetime"].dt.hour
X_ = X_.drop("datetime", axis=1)

# One-hot encode the calendar parts and the categorical columns.
# NOTE(review): get_dummies only creates columns for values present in this
# frame; a frame encoded separately may end up with a different column set and
# should be reindexed to `X_.columns` before being passed to a fitted model.
X_ = pd.get_dummies(X_, columns=["year", "month", "day", "hour", "holiday", "workingday", "season", "weather"])

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Hold out 30% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train_log, y_val_log = train_test_split(
    X_,
    y_log,
    test_size=0.3,
    random_state=0,
)

# Baseline: ordinary least squares fitted on the log1p-transformed target.
lr_reg = LinearRegression()
lr_reg.fit(X=X_train, y=y_train_log)
y_pred_log = lr_reg.predict(X_val)

# Undo the log1p transform so the metrics are computed on the original scale.
y_pred, y_val = np.expm1(y_pred_log), np.expm1(y_val_log)
evaluate_regr(y_val, y_pred)

dict_items([('rmsle_val', 0.5896341440364988), ('rmse_val', 97.68751161267781), ('mae_val', 63.381932073855516)])


In [20]:
#%%

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Candidate ensemble regressors, each with 500 estimators. A fixed
# random_state is passed so that re-running the cell reproduces the same
# metrics instead of drifting between runs.
dct_model = {
    'rf_reg': RandomForestRegressor(n_estimators=500, random_state=0),
    'gbm_reg': GradientBoostingRegressor(n_estimators=500, random_state=0),
    'xgb_reg': XGBRegressor(n_estimators=500, random_state=0),
    'lgbm_reg': LGBMRegressor(n_estimators=500, random_state=0)
}

# The split is seeded and therefore identical for every model, so it is done
# once here rather than being recomputed on each loop iteration. The same
# holds for the back-transformed validation target.
X_train, X_val, y_train_log, y_val_log = train_test_split(X_, y_log, test_size=0.3, random_state=0)
y_val = np.expm1(y_val_log)

for name_, model_ in dct_model.items():
    print(name_)
    model_.fit(X_train, y_train_log)
    y_pred_log = model_.predict(X_val)
    # Predictions are on the log1p scale; map them back before scoring.
    y_pred = np.expm1(y_pred_log)
    evaluate_regr(y_val, y_pred)

rf_reg
dict_items([('rmsle_val', 0.35350606915581556), ('rmse_val', 50.35391759908012), ('mae_val', 31.162733444217057)])
gbm_reg
dict_items([('rmsle_val', 0.32996155835389473), ('rmse_val', 53.32688476068016), ('mae_val', 32.74322402125631)])
xgb_reg
dict_items([('rmsle_val', 0.3422048283339225), ('rmse_val', 51.73158151916774), ('mae_val', 31.251221714159207)])
lgbm_reg
dict_items([('rmsle_val', 0.3188456499157367), ('rmse_val', 47.21464677592674), ('mae_val', 29.028770412428237)])


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 73 is different from 66)

In [None]:
#%%

# Display summary statistics of the raw training frame.
df_orig_train.describe()

In [None]:
#%%

# Print column names, dtypes and non-null counts of the training feature matrix.
X_train.info()

#%%