In [276]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import r2_score

In [277]:
# Load the prepared dataset; the path is relative to the notebook's working directory.
csv_path = "Datas/data_rudy.csv"
df = pd.read_csv(csv_path)

In [278]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,count,day,month,day_number,year,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,81,0.0,16,Saturday,1,0,2011,0


## Train Test Split

Split temporel des données : avant (train) et après (test) septembre 2012 — en pratique, la coupure est fixée au 2012-08-19 23:00:00 (inclus dans le train).

In [279]:
# Chronological hold-out split: rows up to and including the cutoff are used for
# training, later rows for testing.
# `datetime` is loaded as text (read_csv is called without date parsing), so the
# comparison is lexicographic; it orders correctly because the stamps are
# zero-padded "YYYY-MM-DD HH:MM:SS" strings.
SPLIT_CUTOFF = "2012-08-19 23:00:00"  # last timestamp kept in the training set
NON_FEATURE_COLUMNS = ["datetime", "count"]  # split key and target are not model inputs

# Compute each mask once instead of re-filtering the frame for every output.
in_train = df["datetime"] <= SPLIT_CUTOFF
in_test = df["datetime"] > SPLIT_CUTOFF

X_train = df.loc[in_train].drop(columns=NON_FEATURE_COLUMNS)
X_test = df.loc[in_test].drop(columns=NON_FEATURE_COLUMNS)

# The target is modelled on the log1p scale; np.expm1 maps predictions back to counts.
y_train = np.log1p(df.loc[in_train, "count"])
y_test = np.log1p(df.loc[in_test, "count"])

## Pipelines

In [280]:
# List each training feature's dtype before assigning columns to a preprocessing step.
X_train.dtypes

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
humidity        int64
windspeed     float64
day            object
month           int64
day_number      int64
year            int64
hour            int64
dtype: object

In [281]:
# Columns encoded as categories: one indicator column per distinct value.
one_hot_features = ["weather", "holiday", "workingday", "season", "day", "hour", "month", "year"]
# Columns standardised to zero mean and unit variance.
standard_feature = ["temp", "humidity", "windspeed", "day_number"]

# handle_unknown="ignore": the train/test split is chronological, so the test period
# may contain a category value that never occurs in the training period. With the
# default ("error") predict() would raise; with "ignore" such a value is encoded as
# all zeros for that feature.
one_hot_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
standard_pipeline = make_pipeline(StandardScaler())

# Each sub-pipeline is applied to its own column list and the outputs are concatenated.
# Columns listed in neither group would be dropped (the transformer's default remainder).
processor = make_column_transformer(
    (one_hot_pipeline, one_hot_features),
    (standard_pipeline, standard_feature),
)

In [282]:
# Full model: preprocessing followed by a random forest regressor.
# make_pipeline names the final step "randomforestregressor", which is the prefix
# to use when addressing these hyper-parameters in a parameter grid.
model = make_pipeline(
    processor,
    RandomForestRegressor(
        # Ensemble size and reproducibility.
        n_estimators=150,
        random_state=1,
        # No bootstrap resampling, so max_samples has no effect here.
        bootstrap=False,
        max_samples=None,
        # Split quality and the number of features considered at each split.
        criterion='squared_error',
        max_features='log2',
        # Tree-size limits.
        max_depth=10,
        max_leaf_nodes=None,
        min_samples_split=9,
        min_samples_leaf=6,
        min_weight_fraction_leaf=0.0,
        min_impurity_decrease=0,
        # No cost-complexity pruning.
        ccp_alpha=0.0,
    ),
)

## Optimisation des hyper-paramètres

In [283]:
from sklearn.model_selection import GridSearchCV, learning_curve, StratifiedKFold

In [284]:
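# NOTE: this grid search is disabled and kept for reference only. Before re-enabling it:
#   - StratifiedKFold only supports classification targets; with this continuous
#     target use KFold or TimeSeriesSplit instead.
#   - max_features="auto" is no longer accepted by recent scikit-learn releases.
# params = {"randomforestregressor__n_estimators": [100,150,200,300],
#                "randomforestregressor__max_depth": [None,1,3,5,10],
#                "randomforestregressor__max_features": ["auto", "sqrt", "log2"],
#                "randomforestregressor__min_samples_split": [2,3,5]}
# grid_model = GridSearchCV(model, params, scoring="r2", cv=StratifiedKFold(5))
# grid_model.fit(X_train, y_train)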

In [285]:
# rfr = grid_model.best_estimator_

## Scoring du modèle

In [286]:
# Fit on the training period, then predict the held-out period.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# The model is trained on a log1p-transformed target and y_test is log1p-transformed
# as well, so this R2 is measured on the log scale, not on raw counts.
test_r2 = r2_score(y_test, y_pred)
print("Score R2 :", test_r2)

Score R2 : 0.6361641375996762
