In [207]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler, RobustScaler, OneHotEncoder
from sklearn.metrics import r2_score

In [208]:
df = pd.read_csv("Datas/data_rudy.csv")

In [209]:
df.head(1)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,humidity,windspeed,count,day,month,day_number,year,hour
0,2011-01-01 00:00:00,1,0,0,1,9.84,81,0.0,16,Saturday,1,0,2011,0


## Train Test Split

Split des données avant (train) et après(test) Septembre 2012

In [210]:
df=df.sort_values(by="datetime")
X_train = df[df.datetime <=  "2012-08-19 23:00:00"].drop(["datetime","count"], axis=1)

X_test = df[df.datetime >  "2012-08-19 23:00:00"].drop(["datetime","count"], axis=1)
y_train = np.log1p(df[df.datetime <=  "2012-08-19 23:00:00"]["count"])
y_test = np.log1p(df[df.datetime >  "2012-08-19 23:00:00"]["count"])

## Pipelines

In [211]:
X_train.dtypes

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
humidity        int64
windspeed     float64
day            object
month           int64
day_number      int64
year            int64
hour            int64
dtype: object

In [212]:
one_hot_features = ["weather", "holiday", "workingday", "season", "hour", "month", "day"]
standard_feature = ["day_number"]
# "temp", "humidity", "windspeed", 
# , "year"

one_hot_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
standard_pipeline = make_pipeline(StandardScaler())

processor = make_column_transformer((one_hot_pipeline, one_hot_features), (standard_pipeline, standard_feature))

In [213]:
model = make_pipeline(processor, RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='squared_error',
                      max_depth=10, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0, min_samples_leaf=6,
                      min_samples_split=9, min_weight_fraction_leaf=0.0,
                      n_estimators=150, random_state=1))

## Optimisation des hyper-paramètres

In [214]:
from sklearn.model_selection import learning_curve, StratifiedKFold

In [215]:
# train_size, train_score, val_score = learning_curve(model, X_train, y_train,train_sizes=np.linspace(0.1,1,20),
#                                                      scoring="r2", shuffle=True)

# plt.plot(train_size, train_score.mean(axis=1), label="Train score")
# plt.plot(train_size, val_score.mean(axis=1), label="Test score")
# plt.legend()
# plt.title("Learning curve of Random Forest Regressor")

## Scoring du modèle

In [216]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Score R2 :", r2_score(y_test, y_pred))

Score R2 : 0.648475672073461


In [217]:
X_train.head(1)

Unnamed: 0,season,holiday,workingday,weather,temp,humidity,windspeed,day,month,day_number,year,hour
0,1,0,0,1,9.84,81,0.0,Saturday,1,0,2011,0


In [218]:
pred = np.array([[4, 1, 0, 1, 25, 30, 20, "Monday", 4, 4100, 2022, 10],
                [4, 1, 0, 1, 25, 30, 20, "Monday", 4, 4100, 2011, 10],
                [4, 1, 0, 1, 25, 30, 20, "Monday", 4, 4100, 2012, 10]])
data_pred = pd.DataFrame(pred, columns=list(X_train.columns))
for col in list(data_pred.columns):
    try:
        data_pred[col] = data_pred[col].astype(float)
    except:
        pass
print(np.exp(model.predict(data_pred)))

[133.67228556 117.09892974 151.72126496]


In [219]:
data_pred.dtypes

season        float64
holiday       float64
workingday    float64
weather       float64
temp          float64
humidity      float64
windspeed     float64
day            object
month         float64
day_number    float64
year          float64
hour          float64
dtype: object

In [220]:
X_test.dtypes

season          int64
holiday         int64
workingday      int64
weather         int64
temp          float64
humidity        int64
windspeed     float64
day            object
month           int64
day_number      int64
year            int64
hour            int64
dtype: object