# Script test and exploration of data

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import pandas as pd

import datetime

# import numpy as np
# import pyarrow

## Test script

### Test fetch_data

In [None]:
import fetch_data as fd

In [None]:
fd.get_statuses()

### Test collect_statuses

In [None]:
from daily_update import collect_statuses

In [None]:
collect_statuses("2022-05-12")

### Test Slack message

In [None]:
from daily_update import slack_message

slack_message("test_example", 120000)

## Load 'historique_stations' data

In [None]:
file_path = os.path.join("..", "data", "historique_stations_2022-04-30_233325.csv")

In [None]:
def get_df(file_path):
    """Load a header-less 'historique_stations' CSV export into a DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame with the columns ``date`` (parsed as datetime), ``capacity``,
        ``available_mechanical``, ``available_electrical``, ``stationCode``
        (always read as text), ``station_geo`` and ``operative``.
    """
    column_names = [
        "date",
        "capacity",
        "available_mechanical",
        "available_electrical",
        "stationCode",
        "station_geo",
        "operative",
    ]
    df = pd.read_csv(
        file_path,
        parse_dates=[0],
        header=None,
        names=column_names,
        # Station codes are identifiers, not quantities: without an explicit
        # dtype, a file holding only numeric codes is inferred as int64 and
        # string filters such as `isin(['21209'])` silently match nothing.
        dtype={"stationCode": str},
    )
    return df

In [None]:
df = get_df(file_path)
df

### Extract specific stations

In [None]:
# NB; strings since some station names are not int(!)

ref_ids = [
    #"Molière - République",
    '21209',
    #"Jean Marin Naudin - Stalingrad",
    '22202',
    #"Arthur Auger - Jean Jaurès",
    '21205',
    #"Marne - Germain Dardan"
    '21212',
]

In [None]:
df[df["stationCode"].isin(ref_ids)]

## Load status data

In [None]:
status_path = os.path.join("..", "data", "station_status_2022-05-01_080604.csv")

In [None]:
def get_status_df(file_path):
    """Load a header-less 'station_status' CSV export into a DataFrame.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        DataFrame indexed by ``date`` (parsed as datetime) with the columns
        ``station_code`` (always read as text), ``available_mechanical``,
        ``available_electrical`` and ``operative``.
    """
    column_names = [
        "date",
        "station_code",
        "available_mechanical",
        "available_electrical",
        "operative",
    ]
    df = pd.read_csv(
        file_path,
        parse_dates=[0],
        header=None,
        names=column_names,
        index_col="date",
        # Keep station codes as text: dtype inference would turn numeric codes
        # into integers (dropping leading zeros) and break comparisons against
        # string codes such as '21209'.
        dtype={"station_code": str},
    )
    return df

In [None]:
status_df = get_status_df(status_path)

status_df

## Data cleaning and collection

### Process 2022-04-30

In [None]:
from daily_update import get_historique_file


# os.listdir() returns names in arbitrary order, while the positional slices
# below choose a different reader configuration per group of files. Sorting the
# timestamped file names makes those slices deterministic (chronological).
data_path = sorted(os.listdir("../data"))

process_04_30 = [os.path.join("..", "data", file_name) for file_name in data_path if file_name.startswith("historique_stations_2022-04")]

print(process_04_30)

# Create full histo df: the export layout changed twice during the day, hence
# three groups of files read with different `get_historique_file` flags.
histo_df = pd.concat([get_historique_file(file_path, has_name=True) for file_path in process_04_30[:5]], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_name=True, has_code=True) for file_path in process_04_30[5:9]]+[histo_df], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_code=True) for file_path in process_04_30[9:]]+[histo_df], axis=0)

# Re-assign instead of sorting in place so that re-running the cell is explicit.
histo_df = histo_df.sort_index()

# Disabled on purpose: how the 2022-04-30 summary file was produced.
#histo_df = histo_df.drop_duplicates()
#histo_df.to_parquet(os.path.join("..", "data", "Summary_2022-04-30.parquet"))

In [None]:
len(histo_df)

In [None]:

len(histo_df)

In [None]:
histo_df

In [None]:
# Build the path with os.path.join: the previous "..\data\Summary_..." literal
# relied on invalid escape sequences ("\d", "\S") and only resolved on Windows.
aux_df = pd.read_parquet(os.path.join("..", "data", "Summary_2022-04-30.parquet"))

In [None]:
aux_df.equals(histo_df)

### Process 2022-05-01

In [None]:
from daily_update import get_historique_file


# Sorted so that files are always concatenated in the same (chronological) order.
data_path = sorted(os.listdir("../data"))

histo_05_01 = [os.path.join("..", "data", file_name) for file_name in data_path if file_name.startswith("historique_stations_2022-05-01")]

status_05_01 = [os.path.join("..", "data", file_name) for file_name in data_path if file_name.startswith("station_status_2022-05-01")]

# NOTE(review): this frame used to be bound to `histo_df` and was overwritten
# by the next statement, so the 'historique' rows never reached the summary.
# It is kept under its own name here; confirm whether it should be merged into
# the file written below (column layouts may differ).
historique_05_01_df = pd.concat([get_historique_file(file_path, has_code=True) for file_path in histo_05_01], axis=0)

histo_df = pd.concat([get_status_df(file_path) for file_path in status_05_01], axis=0)

histo_df = histo_df.sort_index()

# os.path.join instead of a "..\data\..." literal (invalid "\d"/"\S" escapes,
# Windows-only separator).
histo_df.to_parquet(os.path.join("..", "data", "Summary_2022-05-01.parquet"))

### Other dates

In [None]:
import daily_update as du

In [None]:
# Build the daily summaries for 2022-05-02 through 2022-05-09.
for day_number in range(2, 10):
    date_str = f"2022-05-{day_number:02d}"
    du.collect_statuses(date_str)
    print("Completed: ", date_str)

# Explo data

## First start

In [None]:
# Ignore first days, with different format.
# The listing is sorted because later cells address summaries by position
# (summaries[0], summaries[-1], summaries[0:3]) and os.listdir() gives no
# ordering guarantee; the dated file names sort chronologically.
data_path = sorted(os.listdir("../data"))

summaries = [os.path.join("..", "data", file_name) for file_name in data_path
                if file_name.startswith("Summary_2022-05")]

print(summaries)

In [None]:
data_df = pd.concat([pd.read_parquet(file_path) for file_path in summaries],
                    axis=0)
data_df

Oups ! Dans le résultat ci-dessus, il s'est passé des choses horribles : à vue de nez, un échange entre `available_mechanical` et `operative`, ou quelque chose dans ce goût-là ! Il va être nécessaire d'étudier le comportement de plus près !

In [None]:
pd.read_parquet(summaries[0])

In [None]:
pd.read_parquet(summaries[-1])

On voit le problème ci-dessus (`available_mechanical` semble avoir pris la place de `operative`)

In [None]:
pd.read_parquet(summaries[2]).drop_duplicates()

In [None]:
pd.read_parquet(summaries[3])

Le problème apparaît dans `summaries[3]` ! 

NB : 
* comme les données ont été récoltées par tranche de 10 min et que le code n'a pas changé à minuit, il va sans doute être nécessaire d'enquêter de plus près !
* on voit ci-dessus une grosse duplication de lignes pour la station `21110`, il serait sans doute judicieux de faire un "drop duplicates". Voire de faire le "drop duplicates" dans le code de création des "summaries" !

In [None]:
pd.read_parquet(summaries[4]).drop_duplicates()

## Keeping only very first days

In [None]:
data_df = pd.concat([pd.read_parquet(file_path) for file_path in summaries[0:3]],
                    axis=0)
data_df

In [None]:
station_code = '21209' #"Molière - République"

In [None]:
aux_df = data_df[data_df["station_code"]==station_code]

In [None]:
aux_df["date"]

In [None]:
aux_df

## Fastparquet VS pyarrow...

In [None]:
# Summary file generated using `fastparquet` instead of `pyarrow`.
# Path built with os.path.join so the cell is not tied to Windows separators.
aux_df = pd.read_parquet(os.path.join("..", "data", "Summary_2022-05-14.parquet"))

In [None]:
aux_df.columns

In [None]:
aux_df.sort_values(by="time", inplace=True)

In [None]:
non_operative = aux_df[aux_df["operative"]==False]
non_operative

In [None]:
aux = non_operative["stationCode"].unique()
print("Total nbr non-operative: ", len(aux))
aux

In [None]:
print("Total number of stations: ", len(aux_df["stationCode"].unique()))

## Back to investigation of specific station

In [None]:
station_code = '21209' #"Molière - République"

current_df = aux_df[aux_df["stationCode"]==station_code]
current_df

In [None]:
print("Initial nbr of rows: ", len(current_df))
print("Dropping duplicates: ", len(current_df.drop_duplicates()))

In [None]:
# Station fully operational!
current_df["operative"].sum()

In [None]:
import sys
sys.path.append("..")

from daily_update import get_status_df

def extract_enrich_data(file_path):
    """Load one status file and tag every row with the timestamp in its file name.

    Args:
        file_path: Path to a file named
            ``station_status_<YYYY-MM-DD_HHMMSS>.<extension>``.

    Returns:
        The DataFrame returned by ``get_status_df(file_path)`` with an extra
        ``file_time`` column holding the ``datetime`` parsed from the file name.

    Raises:
        ValueError: If the file name does not start with ``station_status_`` or
            its timestamp part does not match ``%Y-%m-%d_%H%M%S``.
    """
    prefix = "station_status_"

    # Strip the directory and the extension explicitly rather than relying on
    # fixed slice offsets, so an unexpected name fails with a clear message.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    if not stem.startswith(prefix):
        raise ValueError(
            "Unexpected status file name (expected '{}<timestamp>'): {}".format(prefix, file_path)
        )

    file_time = datetime.datetime.strptime(stem[len(prefix):], '%Y-%m-%d_%H%M%S')

    df = get_status_df(file_path)

    # Scalar assignment: the same timestamp is broadcast to every row.
    df["file_time"] = file_time

    return df

In [None]:
# os.path.join instead of the "..\data" literal (invalid "\d" escape, Windows
# only); sorted so the files are concatenated in chronological order.
data_path = sorted(os.listdir(os.path.join("..", "data")))
date_str = "2022-05-14"

prefix_str = "station_status_{}".format(date_str)
status_day = [os.path.join("..", "data", file_name)
                for file_name in data_path
                if file_name.startswith(prefix_str)]

print("status_day: ", status_day)

histo_df = pd.concat([extract_enrich_data(file_path) for file_path in status_day], axis=0)

In [None]:
histo_df

In [None]:
station_code = '21209' #"Molière - République"

current_df = histo_df[histo_df["stationCode"]==station_code]
current_df

In [None]:
# Non-degenerated!
current_df.groupby("file_time")["stationCode"].count().unique()

## Testing daily_update script

# More explo

## Defining graph functions

In [26]:
date_str = "2022-05-11"

file_name = "Summary_{}.parquet".format(date_str)
file_path = os.path.join("data", file_name)

aux_df = pd.read_parquet(file_path)

In [27]:
aux_df

Unnamed: 0_level_0,time,stationCode,operative,available_mechanical,available_electrical
file_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-05-11 00:02:38+02:00,2022-05-10 21:14:00+00:00,16107,True,1.0,3.0
2022-05-11 00:02:38+02:00,2022-05-10 21:13:00+00:00,11104,True,6.0,6.0
2022-05-11 00:02:38+02:00,2022-05-10 21:11:00+00:00,9020,True,0.0,2.0
2022-05-11 00:02:38+02:00,2022-05-10 21:14:00+00:00,12109,True,23.0,6.0
2022-05-11 00:02:38+02:00,2022-05-10 21:12:00+00:00,5001,True,0.0,1.0
...,...,...,...,...,...
2022-05-11 23:52:39+02:00,2022-05-11 21:15:00+00:00,18026,True,8.0,15.0
2022-05-11 23:52:39+02:00,2022-05-11 21:13:00+00:00,18024,True,12.0,18.0
2022-05-11 23:52:39+02:00,2022-05-11 21:17:00+00:00,15056,True,22.0,1.0
2022-05-11 23:52:39+02:00,2022-05-11 21:17:00+00:00,8004,True,1.0,0.0


In [5]:
station_code = '21209' #"Molière - République"

current_df = aux_df[aux_df["stationCode"]==station_code]

In [81]:
import plotly.express as px


# `markers` is a boolean switch in plotly express; the previous "dot" string
# only worked because any non-empty string is truthy.
fig = px.line(
    current_df.reset_index(),
    x="file_time",
    y="available_mechanical",
    markers=True,
    title="Available mechanical bikes",
)

fig.show()

In [63]:
import plotly.graph_objects as go

def plot_bikes(date_str, station_code, start_time="07:25", end_time="10:05", data_dir="data"):
    """Plot mechanical and electrical bike availability for one station.

    Reads ``Summary_<date_str>.parquet`` from ``data_dir``, keeps the rows whose
    ``stationCode`` equals ``station_code``, restricts them to the time window
    ``[start_time, end_time]`` of that day (by slicing the frame's index) and
    shows a Plotly figure with one trace per bike type.

    Args:
        date_str: Day to plot, as used in the summary file names
            (e.g. ``"2022-05-11"``).
        station_code: Station identifier, compared as-is with ``stationCode``.
        start_time: Start of the window as ``"HH:MM"``. The default ``"07:25"``
            leaves a 5-minute margin before 07:30.
        end_time: End of the window as ``"HH:MM"``. The default ``"10:05"``
            leaves a 5-minute margin after 10:00.
        data_dir: Directory holding the summary parquet files.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    file_name = "Summary_{}.parquet".format(date_str)
    file_path = os.path.join(data_dir, file_name)

    aux_df = pd.read_parquet(file_path)

    current_df = aux_df[aux_df["stationCode"]==station_code]

    start_str = "{} {}".format(date_str, start_time)
    end_str = "{} {}".format(date_str, end_time)
    # Label slicing with bounds that are not exact index entries needs a
    # monotonic index; sort first so an unsorted summary cannot raise KeyError.
    my_df = current_df.sort_index().loc[start_str: end_str]

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=my_df.index, y=my_df["available_mechanical"],
                    mode='lines+markers', name="mech"))
    fig.add_trace(go.Scatter(x=my_df.index, y=my_df["available_electrical"],
                    mode='lines+markers', name="elec"))

    # A figure should be readable on its own when the notebook is skimmed.
    fig.update_layout(
        title="Station {} - {} ({} to {})".format(station_code, date_str, start_time, end_time),
        xaxis_title="Time",
        yaxis_title="Available bikes",
    )

    fig.show()

## Exploration by station

### Molière - République (Montrouge): 21209

In [71]:
# Mercredi
plot_bikes("2022-05-11", "21209")

In [72]:
# Jeudi
plot_bikes("2022-05-12", "21209")

In [73]:
# Vendredi
plot_bikes("2022-05-13", "21209")

In [66]:
# Samedi
plot_bikes("2022-05-14", "21209")

In [67]:
# Dimanche (pas de données)
plot_bikes("2022-05-15", "21209")

In [68]:
# Lundi
plot_bikes("2022-05-16", "21209")

In [70]:
# mardi
plot_bikes("2022-05-17", "21209")

### Center of Paris: Filles Saint-Thomas - Place de la Bourse (2009)

In [80]:
# Friday
plot_bikes("2022-05-13", "2009")

In [79]:
# Saturday
plot_bikes("2022-05-14", "2009")

In [78]:
# Sunday (missing data)
plot_bikes("2022-05-15", "2009")

In [76]:
# Monday
plot_bikes("2022-05-16", "2009")

In [75]:
# Tuesday
plot_bikes("2022-05-17", "2009")

### Edge of Paris: Porte de Vanves - 14138

In [82]:
# Friday
plot_bikes("2022-05-13", "14138")

In [83]:
# Saturday
plot_bikes("2022-05-14", "14138")

In [84]:
# Sunday (missing data)
plot_bikes("2022-05-15", "14138")

In [85]:
# Monday
plot_bikes("2022-05-16", "14138")

In [86]:
# Tuesday
plot_bikes("2022-05-17", "14138")