In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None



In [2]:
df = pd.read_csv('/Users/sebastienvallin/code/Sebastien01/velib_prediction/raw_data/historique_velib_v2.csv',
                index_col=0)
print(f'Shape : {df.shape}')
df.head()

  mask |= (ar1 == a)


Shape : (2719493, 16)


Unnamed: 0,station_id,docks_available,is_installed,is_returning,is_renting,mechanical_available,ebike_available,time,name,lat,lon,capacity,temp2m,probarain,weather,wind10m
0,213688169,31,True,True,True,3,1,2022-03-19 17:26:50,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,15.0,0.0,0.0,26.0
1,516709288,19,True,True,True,0,1,2022-03-19 17:26:50,Charonne - Robert et Sonia Delauney,48.855908,2.392571,20,15.0,0.0,0.0,26.0
2,36255,14,True,True,True,0,7,2022-03-19 17:26:50,Toudouze - Clauzel,48.879296,2.33736,21,15.0,0.0,0.0,26.0
3,37815204,22,True,True,True,3,4,2022-03-19 17:26:50,Mairie du 12ème,48.840855,2.387555,30,15.0,0.0,0.0,26.0
4,100769544,0,True,True,True,22,23,2022-03-19 17:26:50,Harpe - Saint-Germain,48.851519,2.34367,45,15.0,0.0,0.0,26.0


## Pipelines

In [3]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

In [4]:
def binarize(df):
    for col in ['is_installed', 'is_returning', 'is_renting']:
        df[col] = df[col].map({True : 1, False: 0})
    return df
        

def get_time_info(df):

    df['time'] = pd.to_datetime(df['time'])
    df['month'] = df['time'].dt.month
    df['hour'] = df['time'].dt.hour
    df['day'] = df['time'].dt.dayofweek
    df['minute'] = df['time'].dt.minute
    return df        

def drop_bad_ids(df):
    bad_ids = df[df.is_installed==0].station_id.unique()
    return df[ ~ df.station_id.isin(bad_ids)]
    
binarize_fun = FunctionTransformer(lambda df: binarize(df))
get_time_fun = FunctionTransformer(lambda df: get_time_info(df))

cat_pipe = Pipeline([('drop_bad_ids', FunctionTransformer(lambda df: drop_bad_ids(df))),
                     ('binarize', FunctionTransformer(lambda df: binarize(df))),
                     ('get_time', FunctionTransformer(lambda df: get_time_info(df)))
                    ])

In [49]:
copy = df.copy()
copy = cat_pipe.transform(copy)

In [52]:
copy = copy.query('station_id.between(9000,30551)') 
dum = pd.get_dummies(copy.station_id)
X = pd.concat([copy, dum], axis=1)

# Baseline

In [53]:
to_drop_cols = ['docks_available', 'station_id', 'mechanical_available',
                'ebike_available', 'time', 'name', 'lat', 'lon','is_installed', 'is_returning', 'is_renting']

y = X.docks_available
X = X.drop(columns=to_drop_cols)

In [54]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75)

In [56]:
rf = RandomForestRegressor().fit(X_train,y_train)
rf.score(X_test,y_test)

0.9696899386990312

In [62]:
test = X_test.copy()
test = X_test[X_test.index == 976218]
test.temp2m = 13
test.probarain = 10
test.weather = 3
test.wind10m = 10
test.month = 4
test.hour = 3
test.day = 5
test.minute = 45

rf.predict(test)

array([13.22])

In [22]:
linear_model = LinearRegression().fit(X_train, y_train)
linear_model.score(X_test, y_test)

0.4553930925266737

In [58]:
X_test

Unnamed: 0,capacity,temp2m,probarain,weather,wind10m,month,hour,day,minute,29327,30550
2212723,21,13.0,50.0,10.0,24.0,4,17,3,56,1,0
2543745,21,5.0,0.0,0.0,6.0,4,4,6,29,1,0
1414543,21,3.0,50.0,20.0,20.0,4,19,4,52,1,0
976218,24,18.0,20.0,5.0,7.0,3,14,1,11,0,1
2198569,24,14.0,60.0,210.0,26.0,4,15,3,24,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2634200,24,13.0,0.0,2.0,10.0,4,20,6,38,0,1
2129785,24,10.0,30.0,10.0,19.0,4,3,3,2,0,1
699473,21,9.0,0.0,0.0,9.0,3,5,6,58,1,0
1771535,24,8.0,10.0,3.0,13.0,4,11,0,29,0,1


In [None]:
cv_score = cross_val_score(RandomForestRegressor(), X_train, y_train, cv=5, n_jobs=-1)
cv_score

# Clustering

In [139]:
station_df = pd.read_csv('/Users/sebastienvallin/code/Sebastien01/velib_prediction/velib_prediction/data/stations_info.csv')

_ = df.copy()
_ = cat_pipe.transform(_)
_.set_index('time', inplace=True)

stacked_df = _.groupby([pd.Grouper('station_id'),
                        pd.Grouper(level='time',freq='H')]).agg({'docks_available':'mean',
                                                                     'is_returning':min,
                                                                     'is_renting':min,
                                                                     'mechanical_available':'mean',
                                                                     'ebike_available':'mean',
                                                                     'capacity':max})

stacked_df['hour'] = stacked_df.index.droplevel(0).hour

In [91]:
daily_var = [stacked_df.query(f'station_id=={station}')['docks_available'].var() 
             for station in station_df.station_id]

daily_mean = [stacked_df.query(f'station_id=={station}')['docks_available'].mean() 
             for station in station_df.station_id]

morning_var = [stacked_df.query(f'station_id=={station} & hour.between(6,12,"left")')['docks_available'].var() 
               for station in station_df.station_id]

morning_mean = [stacked_df.query(f'station_id=={station} & hour.between(6,12,"left")')['docks_available'].mean() 
               for station in station_df.station_id]

eve_var = [stacked_df.query(f'station_id=={station} & hour.between(15,20,"left")')['docks_available'].var() 
               for station in station_df.station_id]

eve_mean = [stacked_df.query(f'station_id=={station} & hour.between(15,20,"left")')['docks_available'].mean() 
               for station in station_df.station_id]

In [94]:
featured_df = pd.DataFrame({'daily_var' :daily_var,
                            'daily_mean':daily_mean,
                            'morning_var': morning_var,
                            'morning_mean':morning_mean,
                            'eve_var': eve_var,
                            'eve_mean':eve_mean})

In [95]:
featured_df

Unnamed: 0,daily_var,daily_mean,morning_var,morning_mean,eve_var,eve_mean
0,37.072147,27.394755,33.931845,27.339947,53.114592,27.228741
1,11.233045,15.409248,14.156277,14.713624,5.642849,16.983844
2,15.145877,16.411318,2.814302,18.845238,5.533049,16.994898
3,69.282559,9.648033,85.191252,11.367725,46.418528,14.821429
4,134.929816,19.623879,97.780517,25.211640,119.859361,12.060374
...,...,...,...,...,...,...
1448,17.178306,13.185300,9.307595,14.951720,5.081446,14.588435
1449,27.624635,17.777433,14.444111,19.722222,8.548856,19.892007
1450,20.850604,30.720324,22.149156,31.246693,4.228131,32.676871
1451,21.633496,14.978778,7.900305,16.828042,19.609429,13.875000


In [99]:
station_df = pd.concat((station_df,featured_df),axis=1)

In [102]:
station_df.set_index('station_id', inplace=True)

In [105]:
station_df

Unnamed: 0_level_0,name,lat,lon,capacity,daily_var,daily_mean,morning_var,morning_mean,eve_var,eve_mean
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
213688169,Benjamin Godard - Victor Hugo,48.865983,2.275725,35,37.072147,27.394755,33.931845,27.339947,53.114592,27.228741
516709288,Charonne - Robert et Sonia Delauney,48.855908,2.392571,20,11.233045,15.409248,14.156277,14.713624,5.642849,16.983844
36255,Toudouze - Clauzel,48.879296,2.337360,21,15.145877,16.411318,2.814302,18.845238,5.533049,16.994898
37815204,Mairie du 12ème,48.840855,2.387555,30,69.282559,9.648033,85.191252,11.367725,46.418528,14.821429
100769544,Harpe - Saint-Germain,48.851519,2.343670,45,134.929816,19.623879,97.780517,25.211640,119.859361,12.060374
...,...,...,...,...,...,...,...,...,...,...
216473073,Tardieu - Chappe,48.884147,2.341845,18,17.178306,13.185300,9.307595,14.951720,5.081446,14.588435
27414924,Sambre et Meuse - Villette,48.874406,2.373807,25,27.624635,17.777433,14.444111,19.722222,8.548856,19.892007
213692227,Square Louise Michel,48.884687,2.344139,36,20.850604,30.720324,22.149156,31.246693,4.228131,32.676871
39202840,Rochechouart - Martyrs,48.882194,2.340550,20,21.633496,14.978778,7.900305,16.828042,19.609429,13.875000


In [103]:
from sklearn.decomposition import PCA

In [108]:
station_df.drop(columns='name', inplace=True)

In [112]:
station_df = station_df.dropna()

In [125]:
pca = PCA(n_components=3).fit(station_df)

In [126]:
ar = pca.transform(station_df)
ar.shape

(1444, 3)

In [132]:
import plotly.graph_objects as go
import numpy as np


fig = go.Figure(data=[go.Scatter3d(
    x=ar[:,0],
    y=ar[:,1],
    z=ar[:,2],
    mode='markers',
    marker=dict(
        size=12,
        color=z,                # set color to an array/list of desired values
        colorscale='Viridis',   # choose a colorscale
        opacity=0.8
    )
)])

fig.show()

In [133]:
docks_model = { 'Mairie du 9ème' : 'artefact_docks.joblib',
                'Geoffroy - Mairie' : 'mairie_neuf_docks.joblib'}

meca_model = {'Mairie du 9ème' : 'artefact_meca.joblib',
              'Geoffroy - Mairie': 'mairie_neuf_meca.joblib'}

stations = ['Mairie du 9ème', 'Geoffroy - Mairie']

In [137]:
for i in docks_model:
    f"{i}_docks" = 2
    

SyntaxError: cannot assign to f-string expression (3582578886.py, line 2)

In [141]:
station_df.query(f'station_id == 36151')

Unnamed: 0,station_id,name,lat,lon,capacity
573,36151,Favart - Italiens,48.871446,2.33829,17
