# Generate daily data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

import datetime

## Regular case

In [3]:
from src.daily_update import collect_statuses

In [10]:
# Days to (re)collect. The elements must be Timestamps because the loop calls
# `.strftime` on them; a plain list of strings would raise AttributeError.
# Wider range used previously:
#date_range = pd.date_range("2022-05-11", "2022-05-17")
date_range = pd.date_range("2022-05-17", "2022-05-17")

for day in date_range:
    # collect_statuses is called with the day formatted as a YYYY-MM-DD string.
    date_str = day.strftime("%Y-%m-%d")

    collect_statuses(date_str)
    print("Completed: ", date_str)

Completed:  2022-05-16
Completed:  2022-05-17


## Load 'historique_stations' data

In [None]:
file_path = os.path.join("..", "data", "historique_stations_2022-04-30_233325.csv")

In [None]:
def get_df(file_path):
    """Load a headerless 'historique_stations' CSV export into a DataFrame.

    Parameters
    ----------
    file_path : str, path object or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line, with columns ``date`` (parsed as datetime),
        ``capacity``, ``available_mechanical``, ``available_electrical``,
        ``stationCode`` (always ``str``), ``station_geo`` and ``operative``.
    """
    column_names = [
        "date",
        "capacity",
        "available_mechanical",
        "available_electrical",
        "stationCode",
        "station_geo",
        "operative",
    ]
    df = pd.read_csv(
        file_path,
        parse_dates=[0],
        header=None,
        names=column_names,
        # Station codes are identifiers and are compared against string ids
        # further down; without an explicit dtype a file holding only numeric
        # codes would be inferred as int and never match those strings.
        dtype={"stationCode": str},
    )
    return df

In [None]:
df = get_df(file_path)
df

### Extract specific stations

In [None]:
# NB: station codes are kept as strings, since some of them are not integers (!)

ref_ids = [
    # "Molière - République"
    '21209',
    # "Jean Marin Naudin - Stalingrad"
    '22202',
    # "Arthur Auger - Jean Jaurès"
    '21205',
    # "Marne - Germain Dardan"
    '21212',
]

In [None]:
df[df["stationCode"].isin(ref_ids)]

## Load status data

In [None]:
status_path = os.path.join("..", "data", "station_status_2022-05-01_080604.csv")

In [None]:
def get_status_df(file_path):
    """Read a headerless 'station_status' CSV export, indexed by its date column.

    The first CSV column is parsed as a datetime and becomes the index
    (named ``date``); the remaining columns are ``station_code``,
    ``available_mechanical``, ``available_electrical`` and ``operative``.
    """
    status_columns = [
        "date",
        "station_code",
        "available_mechanical",
        "available_electrical",
        "operative",
    ]
    read_options = dict(
        parse_dates=[0],
        header=None,
        names=status_columns,
        index_col="date",
    )
    return pd.read_csv(file_path, **read_options)

In [None]:
status_df = get_status_df(status_path)

status_df

## Data cleaning and collection

### Process 2022-04-30

In [10]:
from daily_update import get_historique_file


# os.listdir() returns entries in arbitrary order; sort the listing so that
# the positional slices below always pick the same files on every run.
data_path = sorted(os.listdir("../data"))

process_04_30 = [
    os.path.join("..", "data", file_name)
    for file_name in data_path
    if file_name.startswith("historique_stations_2022-04")
]

print(process_04_30)

# Create full histo df.
# The files are read in three batches, each with different
# get_historique_file flags: the first five with has_name, the next four with
# has_name and has_code, the rest with has_code only.
histo_df = pd.concat([get_historique_file(file_path, has_name=True) for file_path in process_04_30[:5]], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_name=True, has_code=True) for file_path in process_04_30[5:9]]+[histo_df], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_code=True) for file_path in process_04_30[9:]]+[histo_df], axis=0)

histo_df = histo_df.sort_index()

# Uncomment to de-duplicate and persist the daily summary:
#histo_df.drop_duplicates(inplace=True)
#histo_df.to_parquet(os.path.join("..", "data", "Summary_2022-04-30.parquet"))

['..\\data\\historique_stations_2022-04-30_2237.csv', '..\\data\\historique_stations_2022-04-30_225150.csv', '..\\data\\historique_stations_2022-04-30_225702.csv', '..\\data\\historique_stations_2022-04-30_225841.csv', '..\\data\\historique_stations_2022-04-30_230847.csv', '..\\data\\historique_stations_2022-04-30_231344.csv', '..\\data\\historique_stations_2022-04-30_232351.csv', '..\\data\\historique_stations_2022-04-30_233210.csv', '..\\data\\historique_stations_2022-04-30_233236.csv', '..\\data\\historique_stations_2022-04-30_233325.csv', '..\\data\\historique_stations_2022-04-30_233532.csv', '..\\data\\historique_stations_2022-04-30_233739.csv', '..\\data\\historique_stations_2022-04-30_233946.csv', '..\\data\\historique_stations_2022-04-30_234018.csv', '..\\data\\historique_stations_2022-04-30_235025.csv', '..\\data\\historique_stations_2022-04-30_235943.csv']


In [11]:
len(histo_df)

22976

In [9]:

len(histo_df)

8963

In [None]:
histo_df

In [None]:
# Reload the previously saved summary. The path is built with os.path.join
# rather than a backslash literal, whose "\d" and "\S" are invalid escape
# sequences and which only resolves on Windows.
aux_df = pd.read_parquet(os.path.join("..", "data", "Summary_2022-04-30.parquet"))

In [None]:
aux_df.equals(histo_df)

### Process 2022-05-01

In [None]:
from daily_update import get_historique_file


# Sorted so the file order is the same on every run (os.listdir() order is arbitrary).
data_path = sorted(os.listdir("../data"))

histo_05_01 = [
    os.path.join("..", "data", file_name)
    for file_name in data_path
    if file_name.startswith("historique_stations_2022-05-01")
]

status_05_01 = [
    os.path.join("..", "data", file_name)
    for file_name in data_path
    if file_name.startswith("station_status_2022-05-01")
]

histo_frames = [get_historique_file(file_path, has_code=True) for file_path in histo_05_01]
status_frames = [get_status_df(file_path) for file_path in status_05_01]

# Combine both sources. Previously the status frames were assigned to
# `histo_df` on their own, discarding the historique rows loaded just before.
# NOTE(review): assumes get_historique_file returns the same columns and index
# as get_status_df - confirm before regenerating the summary.
histo_df = pd.concat(histo_frames + status_frames, axis=0).sort_index()

# os.path.join instead of a backslash literal ("\d" and "\S" are invalid
# escape sequences, and the literal only resolves on Windows).
histo_df.to_parquet(os.path.join("..", "data", "Summary_2022-05-01.parquet"))

## Process other dates

In [3]:
# Ignore first days, with different format
data_path = os.listdir("../data")

summaries = [os.path.join("..", "data", file_name) for file_name in data_path 
                if file_name.startswith("Summary_2022-05")]

print(summaries)

['..\\data\\Summary_2022-05-01.parquet', '..\\data\\Summary_2022-05-02.parquet', '..\\data\\Summary_2022-05-03.parquet', '..\\data\\Summary_2022-05-04.parquet', '..\\data\\Summary_2022-05-05.parquet', '..\\data\\Summary_2022-05-06.parquet', '..\\data\\Summary_2022-05-07.parquet', '..\\data\\Summary_2022-05-08.parquet', '..\\data\\Summary_2022-05-09.parquet', '..\\data\\Summary_2022-05-12.parquet', '..\\data\\Summary_2022-05-13.parquet', '..\\data\\Summary_2022-05-14.parquet']


In [4]:
# Read every daily summary, then stack them row-wise into a single frame.
summary_frames = [pd.read_parquet(summary_path) for summary_path in summaries]
data_df = pd.concat(summary_frames, axis=0)
data_df

  data_df = pd.concat([pd.read_parquet(file_path) for file_path in summaries],


Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-05-01 06:06:00+00:00,16107,1,3.0,1.0
2022-05-01 06:06:00+00:00,11104,4,10.0,1.0
2022-05-01 06:06:00+00:00,9020,0,0.0,1.0
2022-05-01 06:06:00+00:00,12109,11,8.0,1.0
2022-05-01 06:06:00+00:00,5001,30,1.0,1.0
...,...,...,...,...
2022-05-09 21:10:00+00:00,10026,1,10.0,2.0
2022-05-09 21:10:00+00:00,17010,1,18.0,2.0
2022-05-09 21:10:00+00:00,18023,1,7.0,6.0
2022-05-09 21:10:00+00:00,10026,1,10.0,1.0


Oups ! Dans le tableau ci-dessus, il s'est passé des choses horribles : à vue de nez, un échange entre `available_mechanical` et `operative`, ou quelque chose dans ce goût-là ! Il va être nécessaire d'étudier le comportement de plus près !

In [5]:
pd.read_parquet(summaries[0])

Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-05-01 06:06:00+00:00,16107,1,3,True
2022-05-01 06:06:00+00:00,11104,4,10,True
2022-05-01 06:06:00+00:00,9020,0,0,True
2022-05-01 06:06:00+00:00,12109,11,8,True
2022-05-01 06:06:00+00:00,5001,30,1,True
...,...,...,...,...
2022-05-01 21:59:00+00:00,18024,3,3,True
2022-05-01 21:59:00+00:00,8004,0,1,True
2022-05-01 21:59:00+00:00,15056,16,2,True
2022-05-01 21:59:00+00:00,4104,3,9,True


In [6]:
pd.read_parquet(summaries[-1])

Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-12 06:40:00+00:00,21110,False,0.0,0.0
2018-05-21 05:30:00+00:00,17033,False,0.0,0.0
2020-06-29 12:52:00+00:00,19045,False,0.0,0.0
2020-08-14 02:12:00+00:00,31707,False,0.0,0.0
2021-02-11 09:14:00+00:00,16004,False,0.0,0.0
...,...,...,...,...
2022-05-09 21:10:00+00:00,10026,True,10.0,2.0
2022-05-09 21:10:00+00:00,17010,True,18.0,2.0
2022-05-09 21:10:00+00:00,18023,True,7.0,6.0
2022-05-09 21:10:00+00:00,10026,True,10.0,1.0


On voit le problème ci-dessus (`available_mechanical` semble avoir pris la place de `operative`)

In [7]:
pd.read_parquet(summaries[2]).drop_duplicates()

Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-05-02 22:08:00+00:00,16107,3,1,True
2022-05-02 22:08:00+00:00,11104,0,2,True
2022-05-02 22:08:00+00:00,9020,2,2,True
2022-05-02 22:08:00+00:00,12109,14,3,True
2022-05-02 22:08:00+00:00,5001,23,4,True
...,...,...,...,...
2022-05-03 21:54:00+00:00,18026,11,18,True
2022-05-03 21:54:00+00:00,18024,7,14,True
2022-05-03 21:54:00+00:00,13123,5,2,True
2022-05-03 21:54:00+00:00,4005,9,1,True


In [8]:
pd.read_parquet(summaries[3])

Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-06 15:54:00+00:00,13122,0,1.0,0.0
2022-05-03 22:04:00+00:00,16107,1,3.0,1.0
2022-05-03 22:04:00+00:00,41401,19,7.0,1.0
2022-05-03 22:04:00+00:00,33103,0,2.0,1.0
2022-05-03 22:04:00+00:00,20105,0,0.0,1.0
...,...,...,...,...
2022-05-04 21:54:00+00:00,26004,1,1.0,1.0
2022-05-04 21:54:00+00:00,32602,1,8.0,8.0
2022-05-04 21:54:00+00:00,20034,1,0.0,15.0
2022-05-04 21:54:00+00:00,11007,1,34.0,7.0


Le problème apparaît dans `summaries[3]` ! 

NB : 
* comme les données ont été récoltées par tranche de 10 min et que le code n'a pas changé à minuit, il va sans doute être nécessaire d'enquêter de plus près !
* on voit ci-dessus une grosse duplication de lignes pour la station `21110`, il serait sans doute judicieux de faire un "drop duplicate". Voire même de faire le "drop duplicate" dans le code de création des "summaries" !

In [9]:
pd.read_parquet(summaries[4]).drop_duplicates()

Unnamed: 0_level_0,station_code,available_mechanical,available_electrical,operative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-12 06:40:00+00:00,21110,False,0.0,0.0
2018-05-21 05:30:00+00:00,17033,False,0.0,0.0
2020-01-17 09:15:00+00:00,23204,False,0.0,0.0
2020-06-29 12:52:00+00:00,19045,False,0.0,0.0
2020-08-14 02:12:00+00:00,31707,False,0.0,0.0
...,...,...,...,...
2022-05-05 20:58:00+00:00,9114,True,4.0,3.0
2022-05-05 20:58:00+00:00,14010,True,10.0,8.0
2022-05-05 20:58:00+00:00,1006,True,11.0,0.0
2022-05-05 20:58:00+00:00,11101,True,28.0,14.0
