In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for input_path in (os.path.join(root, name) for name in names):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle

from sklearn import ensemble, tree, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

%matplotlib inline
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below — consider narrowing the filter.
warnings.filterwarnings('ignore')


# Exploratory Data Analysis

In [None]:
# Load the competition data.
test = pd.read_csv('../input/bike-sharing-demand/test.csv')
train = pd.read_csv('../input/bike-sharing-demand/train.csv')
# Drop the columns that test doesn't have. The target 'count' is NOT dropped
# here; it stays in `train` for the exploratory plots.
train = train.drop(['registered', 'casual'], axis=1)
# Keep the test timestamps for the submission file.
test_datetime = test['datetime']
# Preview the training frame. It must be the last expression of the cell,
# otherwise the notebook does not display it.
train.head()

In [None]:
# Distribution of the target variable 'count'.
sns.distplot(train['count'])

In [None]:
# Outlier handling (currently disabled): the two commented lines below would
# measure the share of rows with count > 900 and keep only rows with count < 900.
# len(train.loc[train['count'] > 900])/len(train)
# train = train.loc[train['count'] <900]
# Take an independent copy of the target column.
labels = train['count'].copy()

In [None]:
# no missing value
# df.isnull().sum()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Scatter each continuous variable against the rental count, one figure each.
for var in ('temp', 'atemp', 'humidity', 'windspeed'):
    sns.scatterplot(data=train, x=var, y='count')
    plt.show()

In [None]:
cates = ['season', 'holiday', 'workingday', 'weather']
# One box plot of 'count' per categorical variable, all clipped to the same
# 0-900 y-range so the figures are comparable.
for var in cates:
    f, ax = plt.subplots(figsize=(16, 8))
    fig = sns.boxplot(data=train, x=var, y='count', ax=ax)
    fig.axis(ymin=0, ymax=900)
    plt.show()

In [None]:
# Correlation matrix of the numeric columns, including the label 'count'.
# 'datetime' is still a string column at this point. DataFrame.corr() on
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping
# them, so drop it explicitly; the result matches what older pandas produced.
corrmat = train.drop(columns=['datetime']).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)

In [None]:
# Zoom in on the k variables most correlated with 'count'.
k = 8
cols = corrmat.nlargest(k, 'count')['count'].index
# rowvar=False treats each column as a variable, which is equivalent to
# transposing the value matrix first.
cm = np.corrcoef(train[cols].values, rowvar=False)
f, ax = plt.subplots(figsize=(12, 9))
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

In [None]:
#  pd.to_datetime(df['datetime'], 
#  format = '%Y-%m-%dT%H:%M:%SZ', 
#  errors = 'coerce')

In [None]:
# Stack train and test under the outer index keys "train" / "test" so that
# feature engineering is applied to both in one pass.
df = pd.concat([train, test], keys=["train", "test"], axis=0)
# pd.to_datetime works on every pandas version, whereas
# astype('datetime64') without a unit raises TypeError on pandas >= 2.0.
df['datetime'] = pd.to_datetime(df['datetime'])
# Calendar features derived from the timestamp.
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['hour'] = df['datetime'].dt.hour
for var in ['season', 'holiday', 'workingday', 'weather', 'year', 'month', 'hour']:
    df[var] = df[var].astype('object')
# temp and atemp are highly correlated, so atemp is removed. The raw timestamp
# and the target are dropped as well (the target was saved in `labels`).
df = df.drop(['datetime', 'atemp', 'count'], axis=1)

In [None]:
# ordinals = ['year', 'month', 'hour']
# ordinal_features = df[ordinals]
# Column groups used by the preprocessing step.
# NOTE(review): 'weather' appears in neither list — confirm it is meant to be
# left out of the model features.
cates = ['season', 'holiday', 'workingday', 'year']
nums = ['temp', 'humidity', 'windspeed', 'month', 'hour']
categorical_features = df.loc[:, cates]
numeric_features = df.loc[:, nums]

In [None]:
# Numeric columns: median imputation followed by standardisation.
nums_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])
# Categorical columns: most-frequent imputation followed by one-hot encoding.
cates_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one_hot', OneHotEncoder()),
])
# Route each column group to its own pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ('num', nums_pipeline, nums),
    ('cate', cates_pipeline, cates),
])

In [None]:

# # create a transformer for the categorical values
# categorical_transformer = Pipeline(steps=[
#    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('one_hot', OneHotEncoder())])

# # create a transformed for the numerical values
# numeric_transformer = Pipeline(steps=[
#     #('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])
# ordinal_transformer
# ordinal_transformer = Pipeline(steps=[
#    # ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('ordinal', OrdinalEncoder())])

In [None]:
# End-to-end model: the preprocessing step followed by a linear regression.
clf = Pipeline([
    ('preprocessor', full_pipeline),
    ('regressor', LinearRegression()),
])
# Split the stacked frame back into its two parts via the outer index key.
df_train, df_test = df.loc['train'], df.loc['test']
print(df_train.shape, len(labels))

### Fit the preprocessing pipeline with and without the linear model

In [None]:
# Fit the full model (preprocessing + linear regression) on the training rows.
clf.fit(df_train, labels)
# Fit the preprocessor on the training rows only, then transform train and
# test together. Fitting on `df` (train + test) would leak test-set statistics
# into the imputers and the scaler. Because `full_pipeline` is the very object
# used as clf's 'preprocessor' step, it would also refit the preprocessor
# underneath the already-trained regressor.
data_prepared = full_pipeline.fit(df_train).transform(df)

## Get the one-hot feature names from the standalone preprocessor

In [None]:
# One-hot encoder inside the categorical branch of the standalone preprocessor.
standalone_one_hot = full_pipeline.transformers_[1][1].named_steps['one_hot']
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever method the installed version provides.
standalone_names = (getattr(standalone_one_hot, 'get_feature_names_out', None)
                    or standalone_one_hot.get_feature_names)
standalone_names(cates)

## Get the one-hot feature names from the fitted model pipeline

In [None]:
# One-hot encoder reached through the fitted model pipeline.
model_one_hot = clf.named_steps['preprocessor'].transformers_[1][1].named_steps['one_hot']
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; use whichever method the installed version provides.
model_names = (getattr(model_one_hot, 'get_feature_names_out', None)
               or model_one_hot.get_feature_names)
model_names(cates)

In [None]:
# Names of the one-hot encoded columns, kept for labelling the prepared matrix.
one_hot_step = clf['preprocessor'].transformers_[1][1]['one_hot']
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2 in favour of
# get_feature_names_out; fall back to the old name only when the new one is
# missing so the cell runs on both old and new releases.
if hasattr(one_hot_step, 'get_feature_names_out'):
    cates_one_hot = one_hot_step.get_feature_names_out(cates)
else:
    cates_one_hot = one_hot_step.get_feature_names(cates)

# Data visualization of time data

In [None]:
# Time features of the training rows side by side with the label.
time = pd.concat([df.loc['train'][['year', 'month', 'hour']], labels], axis=1)
# NOTE(review): this rebinds `cates`, which the preprocessing cells above and
# the H2O cells below also read — cells run after this one see the new list.
cates = ['year', 'month', 'hour']
for var in cates:
    f, ax = plt.subplots(figsize=(16, 8))
    fig = sns.boxplot(data=time, x=var, y='count', ax=ax)
    fig.axis(ymin=0, ymax=900)
    plt.show()

In [None]:
# Reset seaborn's plotting defaults before drawing the grid.
sns.set()
#cols = ['year', 'month','hour', 'count']
# Pairwise relationships between the columns of `time`.
sns.pairplot(time, height=2.5)
plt.show()

In [None]:

# Column names of the prepared matrix: numeric columns first, then the
# one-hot encoded categorical columns.
features = [*nums, *cates_one_hot]
features

In [None]:
# Wrap the transformed matrix in a DataFrame that keeps df's index.
df_prepared = pd.DataFrame(data=data_prepared, columns=features, index=df.index)
# Recover the two partitions through the outer index key.
train_features, test_features = (df_prepared.loc[part] for part in ('train', 'test'))
train_labels = labels

# Model

In [None]:
def get_score(prediction, lables):
    """Print the R2 score and the RMSE for a pair of value arrays.

    Both arguments are forwarded positionally, in this order, to
    ``r2_score`` and ``mean_squared_error``.

    NOTE(review): scikit-learn documents those functions as
    ``(y_true, y_pred)``, so the first argument lands in the ``y_true`` slot
    even though it is named ``prediction``. R2 is not symmetric — confirm that
    callers pass the ground truth first.
    """
    r2 = r2_score(prediction, lables)
    print('R2: {}'.format(r2))
    rmse = np.sqrt(mean_squared_error(prediction, lables))
    print('RMSE: {}'.format(rmse))

def train_test(estimator, x_trn, x_tst, y_trn, y_tst):
    """Print R2 and RMSE of a fitted estimator on the train and validation sets.

    Args:
        estimator: fitted model exposing ``predict``.
        x_trn: training features.
        x_tst: validation features.
        y_trn: training labels.
        y_tst: validation labels.
    """
    prediction_train = estimator.predict(x_trn)
    # Printing estimator
    print(estimator)
    # get_score forwards its arguments positionally to scikit-learn's
    # r2_score(y_true, y_pred). R2 is not symmetric, so the ground truth must
    # be passed first — the same order the H2O section of this notebook uses.
    get_score(y_trn, prediction_train)
    prediction_test = estimator.predict(x_tst)
    # Printing validation scores
    print("Test")
    get_score(y_tst, prediction_test)

In [None]:
### Hold out 10% of the training data as a validation set
x_train, x_val, y_train, y_val = train_test_split(
    train_features,
    train_labels,
    test_size=0.1,
    random_state=200,
)
# x_train_st, x_val_st, y_train_st, y_val_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)

In [None]:
# Baseline: linear regression on the prepared features.
LR = linear_model.LinearRegression()
LR.fit(x_train, y_train)
train_test(LR, x_train, x_val, y_train, y_val)

In [None]:
# Elastic net; alpha and l1_ratio are chosen from these candidate values.
ENSTest = linear_model.ElasticNetCV(
    alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[.01, .1, .5, .9, .99],
)
ENSTest.fit(x_train, y_train)
train_test(ENSTest, x_train, x_val, y_train, y_val)

In [None]:
# Random forest. It is seeded (same seed as the gradient boosting model) so
# that the printed scores, the feature importances and the submission built
# from this model are reproducible across runs.
RDF = ensemble.RandomForestRegressor(
    max_features=8,
    n_estimators=15,
    oob_score=True,
    random_state=0,
).fit(x_train, y_train)
train_test(RDF, x_train, x_val, y_train, y_val)

In [None]:
# Gradient boosting with default hyper-parameters and a fixed seed.
GBest = ensemble.GradientBoostingRegressor(random_state=0)
GBest.fit(x_train, y_train)
train_test(GBest, x_train, x_val, y_train, y_val)


# Cross validation

In [None]:
from sklearn.model_selection import cross_val_score
def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation.

    Args:
        scores: object exposing ``mean()`` and ``std()`` (e.g. a NumPy array).
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
# scores = cross_val_score(RDF, x_train, y_train,
#                              scoring="neg_mean_squared_error", cv=10)
# RDF_scores = np.sqrt(-scores)
# display_scores(RDF_scores)

In [None]:
# scores = cross_val_score(GBest, x_train, y_train,
#                              scoring="neg_mean_squared_error", cv=10)
# GBest_scores = np.sqrt(-scores)
# display_scores(GBest_scores)

In [None]:
# scores = cross_val_score(LR, x_train, y_train,
#                              scoring="neg_mean_squared_error", cv=10)
# LR_scores = np.sqrt(-scores)
# display_scores(LR_scores)

# GridSearch


In [None]:
# param_grid = [
#     # try 12 (3×4) combinations of hyperparameters
#     {'n_estimators': [5, 10, 50, 100], 'max_features': [2, 4, 6, 8]},
#     # then try 6 (2×3) combinations with bootstrap set as False
#     {'bootstrap': [False], 'n_estimators': [10,50,100], 'max_features': [2, 3, 4]},
#   ]
# #forest_reg = ensemble.RandomForestRegressor(random_state=42)
# # train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
# grid_search = GridSearchCV(RDF, param_grid, cv=5,
#                            scoring='neg_mean_squared_error',
#                            return_train_score=True)
# grid_search.fit(x_train, y_train)

In [None]:
# grid_search.best_params_
# grid_search.best_estimator_

In [None]:
# Pair each prepared feature name with the random forest's importance score
# and rank them from most to least important.
columns = pd.DataFrame({'variable': features})
score = pd.DataFrame({'score': RDF.feature_importances_})
importance = pd.concat([score, columns], axis=1).sort_values(by='score', ascending=False)
importance.head(10)

In [None]:
# Bar chart of the ten highest-scoring features.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=importance.head(10), x='score', y='variable', ax=ax)
plt.show()

# H2O Random Forest

In [None]:
import h2o
from h2o.frame import H2OFrame
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

In [None]:
# Predictions of the scikit-learn random forest on the prepared test features;
# these are what the final cell writes to submission.csv.
results1 = RDF.predict(test_features)
# pd.DataFrame({'datetime': test_datetime, 'count': results}).to_csv("submission.csv", index=False)


In [None]:
# Initialize H2O cluster
h2o.init()
# presumably clears objects left on the cluster by earlier runs — TODO confirm
h2o.remove_all()

In [None]:
# Rebuild the labelled train frame and the unlabelled test frame for H2O from
# the engineered frame `df`. Note that this rebinds `train` and `test`.
# .copy() gives an independent frame, so attaching 'count' below is not a
# write on a slice of `df` (which triggers pandas' SettingWithCopyWarning).
train = df.loc['train'].copy()
# Capture the predictor names before the target column is attached.
feature = list(train.columns)
train['count'] = labels
test = df.loc['test'].copy()

In [None]:
# Transform to an H2O frame and convert the columns listed in `cates` to
# factors. The target 'count' is not converted.
h2o_df = H2OFrame(train)
# Iterate over the column names themselves. Wrapping the list in another list
# made the loop run once with the whole list as `var`.
for var in cates:
    h2o_df[var] = h2o_df[var].asfactor()

h2o_df.summary()

In [None]:
# Transform the test data to an H2O frame.
h2o_test = H2OFrame(test)
# Apply the factor conversion to the TEST frame. The loop previously
# re-converted h2o_df, so h2o_test's columns were never converted while the
# model was trained on factor columns.
for var in cates:
    h2o_test[var] = h2o_test[var].asfactor()

h2o_test.summary()

In [None]:
# Name of the response column for the H2O model.
target = 'count'
# Split the labelled H2O frame (ratio 0.8, fixed seed) into a training part
# and a validation part.
train1, valid1 = h2o_df.split_frame(ratios=[0.8], seed=1234)
feature

In [None]:
# H2O random forest trained on the predictor columns in `feature`.
# NOTE(review): balance_classes looks like a classification option while the
# response here is the numeric 'count' — confirm it has any effect.
model = H2ORandomForestEstimator(
    balance_classes=True,
    ntrees=100,
    max_depth=20,
    mtries=-1,
    seed=42,
    score_each_iteration=True,
)
model.train(x=feature, y=target, training_frame=train1)

In [None]:
# Variable-importance plot of the H2O model.
model.varimp_plot()

In [None]:
# Collect actual and predicted counts for both partitions as NumPy arrays.
train_frame = train1.as_data_frame()
valid_frame = valid1.as_data_frame()
train_true = train_frame['count'].values
test_true = valid_frame['count'].values
train_pred = model.predict(train1).as_data_frame()['predict'].values
test_pred = model.predict(valid1).as_data_frame()['predict'].values
 

In [None]:
# Report R2 / RMSE for the H2O model; 'test data' here is the validation split.
for title, actual, predicted in (
    ('train data:', train_true, train_pred),
    ('test data:', test_true, test_pred),
):
    print(title)
    get_score(actual, predicted)

In [None]:
# Predictions of the H2O model on the test frame, as a NumPy array.
# NOTE(review): results2 is not used by the submission cell, which writes results1.
results2 = model.predict(h2o_test).as_data_frame()['predict'].values

In [None]:
# Shut down the H2O instance; any H2O frame or model still needed must be
# converted to pandas / NumPy before this cell runs.
h2o.cluster().shutdown()

In [None]:
# Write the submission file from the scikit-learn random forest predictions.
submission = pd.DataFrame({'datetime': test_datetime, 'count': results1})
submission.to_csv("submission.csv", index=False)