## Goal
Find out how many bicycles need to be ready at a given time in the Washington, D.C. market.

### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline , make_pipeline as mk_pip
from sklearn.preprocessing import KBinsDiscretizer , StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer 
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error

In [None]:
# Default figure size (width, height in inches) for every plot below
plt.rcParams.update({"figure.figsize": (5, 4)})

### Load data

In [None]:
# Read the training set; the first CSV column is used as the index and parsed as dates
df = pd.read_csv(
    '../data/train.csv',
    parse_dates=True,
    index_col=0,
)

In [None]:
# Peek at the first rows of the training frame
df.head()


In [None]:
# Column dtypes and non-null counts of the raw training frame
df.info()

### Add datetime features

In [None]:

# Keep the timestamp as a regular column and split it into calendar parts
timestamps = df.index
df["datetime"] = timestamps
for part in ("year", "month", "day", "weekday", "hour"):
    # each part is read from the DatetimeIndex attribute of the same name
    df[part] = getattr(timestamps, part)

### Exploratory data analysis

In [None]:
# Re-check the frame now that the calendar columns exist
df.info()
# Number of records available for each calendar day
calendar_day = ["year", "month", "day"]
df.groupby(calendar_day)["holiday"].count()

In [None]:
# Plot pairwise correlations between the numeric columns.
# The frame also carries the non-numeric 'datetime' column; recent pandas
# versions no longer drop such columns silently inside corr(), so the frame is
# restricted to numeric dtypes before computing the correlation matrix.
sns.heatmap(df.select_dtypes(include="number").corr())

In [None]:
# Summarize the data per calendar week (median of every column within the week)
weekly_bins = df.resample("W")
b = weekly_bins.median()
b

In [None]:
# Have a look at counts with a moving average over 100 consecutive rows.
# Only the 'count' column is smoothed: averaging the whole frame would also hit
# the non-numeric 'datetime' column (which rolling().mean() cannot aggregate in
# recent pandas versions) and would compute averages that are never plotted.
df["count"].rolling(100).mean().plot(legend=True)

In [None]:
# Check the median count per distinct value of every column, to judge linearity
columns_to_inspect = [name for name in df.columns if name != "datetime"]
for column in columns_to_inspect:
    median_count = df.groupby(column)["count"].median()
    plt.plot(median_count)
    plt.xlabel(column)
    plt.ylabel('Median Count')
    plt.show()


* For some features, a feature expansion might be needed if you want to use linear regression!


### Set target variable

In [None]:
# Target: number of rentals per record
y = df['count']
# Drop the target itself, the raw timestamp and the partial rental counts.
# NOTE(review): assumes the Kaggle train.csv layout, in which 'casual' and
# 'registered' add up to 'count' and are absent from test.csv — keeping either
# one as a feature would leak the target. Confirm against the data.
X = (
    df.drop(columns=["registered", "count", 'datetime'])
    .drop(columns=["casual"], errors="ignore")  # tolerated if the column is absent
)
# Show only the first rows instead of dumping the whole feature matrix
X.head()

### Split into train and test

In [None]:
# Hold out 20 % of the rows for evaluation; the seed keeps the split repeatable
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)
X_train, X_test, y_train, y_test = split_parts


### Run the linreg blindly

In [None]:
# Baseline: ordinary least squares on the untransformed feature columns
linreg=LinearRegression()

In [None]:
# Fit against log1p(count) rather than the raw count
linreg.fit(X_train,np.log1p(y_train))

In [None]:
# R^2 on the held-out split, measured on the same log1p scale used for fitting
linreg.score(X_test,np.log1p(y_test))

* The result is bad. Either do a feature expansion and repeat the linear regression, or go with a random forest regressor.

### RandomForestReg

In [None]:
# Check feature importance:
# first fit a shallow forest on all features, then read its importances
rf = RFR(max_depth=5, random_state=0)

In [None]:
# Fit on the raw counts (no log1p transform in this exploratory step)
rf.fit(X_train, y_train)

In [None]:
# One row per training column with the importance the fitted forest assigned to it
importance_columns = {
    'feature': X_train.columns,
    'importance': rf.feature_importances_,
}
IMP = pd.DataFrame(importance_columns)
IMP

* `day` should be dropped because it has no relevance for the test data (see the test data).
* `windspeed` and `weather` show no meaningful correlation with the count in the training data -> drop them.

### Define preprocessing pipeline

In [None]:
# Building blocks for the column-wise preprocessing; each one is a
# single-step pipeline so more steps can be chained in later.
#a_1=mk_pip(MinMaxScaler() ) #KBinsDiscretizer(n_bins=24)
# a_2: standardise numeric columns
a_2=mk_pip(StandardScaler())
# a_3: discretise a column into 7 bins
a_3=mk_pip(KBinsDiscretizer(n_bins=7))
# a_4: one-hot encode a categorical column
a_4=mk_pip(OneHotEncoder())

In [None]:
# Column-wise preprocessing: scale, bin or pass through the selected columns;
# columns that are not listed here are left out of the transformer's output.
# NOTE(review): the step labelled 'chaps_a_4' applies a_3 (the binning
# pipeline), not a_4 — confirm which encoder was intended for 'weekday'.
scaled_columns = ["atemp", 'humidity']
binned_columns = ['weekday']
untouched_columns = ['holiday', 'workingday', 'year', 'weather', 'season', 'month', 'hour']
feature_transformer = ColumnTransformer(
    transformers=[
        ('chaps_a_2', a_2, scaled_columns),
        ('chaps_a_4', a_3, binned_columns),
        ("do_nothing", 'passthrough', untouched_columns),
    ]
)

### Fit and check the score

In [None]:
# Full model: column preprocessing followed by a random forest.
# The forest is seeded (same seed as the exploratory forest above) so that
# re-running the notebook reproduces the same scores.
pipeline=mk_pip(feature_transformer, RFR(random_state=0))
# Train on log1p(count); predictions are mapped back with expm1 where needed
pipeline.fit(X_train,np.log1p(y_train))
# R^2 on the training data itself — optimistic, not a generalisation estimate
pipeline.score(X_train,np.log1p(y_train))

### Cross-validation-score for train or test

In [None]:
# 5-fold cross-validated R^2 on the training split.
# The target is log1p-transformed here as well, so the score is measured on the
# same scale the pipeline is fitted and scored on in the other cells.
cross_r2_pip = cross_val_score(
    pipeline,
    X_train,
    np.log1p(y_train),
    cv=5,
    scoring='r2',
    verbose=3,
)

In [None]:
# Average R^2 over the cross-validation folds
cross_r2_pip.mean()

In [None]:
### Check the test data

In [None]:
# RMSLE on the held-out split: predictions are made on the log1p scale,
# so map them back to counts before comparing with the true values
test_count_predictions = np.expm1(pipeline.predict(X_test))
test_msle = mean_squared_log_error(y_test, test_count_predictions)
np.sqrt(test_msle)

### Hyperparameter optimization

In [None]:
# What are the relevant parameters?
# get_params() lists every tunable parameter of the pipeline, including the
# '<step>__<param>' names that the grid search below addresses.

pipeline.get_params()

In [None]:
# Search grid for the random-forest step of the pipeline
parameters = {
    "randomforestregressor__n_estimators": [128, 256],  # 2**7 and 2**8 trees
    "randomforestregressor__max_depth": [4, 8, 10, 12, None],
    # 'randomforestregressor__ccp_alpha' is another candidate to add here
}
# Exhaustive search over the grid, scored by 5-fold cross-validated R^2
grid_cv = GridSearchCV(
    pipeline,
    parameters,
    scoring="r2",
    cv=5,
    verbose=3,
)

In [None]:
# Searching for the optimal set of parameters
# (target on the log1p scale; the trailing ';' suppresses the estimator repr)
grid_cv.fit(X_train, np.log1p(y_train));

In [None]:
# Show the parameter combination with the best cross-validated score
grid_cv.best_params_

In [None]:
# Pipeline configured with the best parameter combination found by the search
pipeline_best=grid_cv.best_estimator_

In [None]:
# Check the score
# NOTE(review): this is R^2 on the training data (log1p scale), so it does not
# measure generalisation — compare against a held-out score before concluding.
pipeline_best.score(X_train,np.log1p(y_train))

* Optimization has improved the result to some extent

### Create output with the optimized model

In [None]:
# Load the Kaggle test set and derive the same calendar columns as for training
bike_df = pd.read_csv(
    '../data/test.csv',
    parse_dates=True,
    index_col=0,
)
for part in ("hour", "day", "weekday", "month", "year"):
    # each part is read from the DatetimeIndex attribute of the same name
    bike_df[part] = getattr(bike_df.index, part)


### Create CSV file for the Kaggle competition

In [None]:
# Predict on the log1p scale and convert back to rental counts
predicted_counts = np.expm1(pipeline_best.predict(bike_df))
My = {"datetime": bike_df.index.to_numpy(), "count": predicted_counts}
# Submission table indexed by timestamp, written to disk and displayed
My_prediction = pd.DataFrame(data=My).set_index("datetime")
My_prediction.to_csv("rfra.csv")
My_prediction