# Exploration of data

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to them are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd

# import numpy as np
# import pyarrow

In [None]:
#!conda install -c conda-forge pyarrow

## Test script

In [None]:
import fetch_data as fd

In [None]:
# Call get_statuses() from the fetch_data module imported above.
# NOTE(review): what it fetches/returns is not visible from this notebook.
fd.get_statuses()

## Load 'historique_stations' data

In [None]:
# Relative location of the sample 'historique_stations' CSV file.
file_path = os.path.join(
    "..",
    "data",
    "historique_stations_2022-04-30_233325.csv",
)

In [None]:
def get_df(file_path):
    """Load a header-less 'historique_stations' CSV file into a DataFrame.

    The columns are named, in file order: date, capacity,
    available_mechanical, available_electrical, stationCode, station_geo,
    operative.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per CSV line. The first column (``date``) is parsed as
        datetime and ``stationCode`` is always read as text.
    """
    column_names = [
        "date",
        "capacity",
        "available_mechanical",
        "available_electrical",
        "stationCode",
        "station_geo",
        "operative",
    ]
    return pd.read_csv(
        file_path,
        header=None,
        names=column_names,
        parse_dates=[0],
        # Station codes are identifiers, not numbers. With dtype inference a
        # file containing only numeric codes would yield ints, which never
        # match string ids such as '21209' in an ``isin`` filter.
        dtype={"stationCode": str},
    )

In [None]:
# Load the sample 'historique_stations' file and display the resulting frame.
df = get_df(file_path)
df

### Extract specific stations

In [None]:
# Station codes are kept as strings because not every code is an integer.
# The trailing comment on each line is the label attached to that code.
ref_ids = [
    "21209",  # Molière - République
    "22202",  # Jean Marin Naudin - Stalingrad
    "21205",  # Arthur Auger - Jean Jaurès
    "21212",  # Marne - Germain Dardan
]

In [None]:
# Keep only the rows whose station code is one of the reference ids.
is_ref_station = df["stationCode"].isin(ref_ids)
df.loc[is_ref_station]

## Load status data

In [None]:
# Relative location of the sample 'station_status' CSV file.
status_path = os.path.join(
    "..",
    "data",
    "station_status_2022-05-01_080604.csv",
)

In [None]:
def get_status_df(file_path):
    """Load a header-less 'station_status' CSV file, indexed by date.

    The columns are named, in file order: date, station_code,
    available_mechanical, available_electrical, operative. The first column
    (``date``) is parsed as datetime and used as the index.

    Parameters
    ----------
    file_path : str or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The file content, indexed by ``date``.
    """
    status_columns = [
        "date",
        "station_code",
        "available_mechanical",
        "available_electrical",
        "operative",
    ]
    return pd.read_csv(
        file_path,
        names=status_columns,
        header=None,
        index_col="date",
        parse_dates=[0],
    )

In [None]:
# Load the sample 'station_status' file and display the resulting frame.
status_df = get_status_df(status_path)

status_df

## Data cleaning and collection

In [None]:
def get_historique_file(input_path, has_name=False, has_code=False):
    """Read one header-less 'historique' CSV file, indexed by date.

    The column layout depends on the two flags: the optional ``stationName``
    and ``stationCode`` columns (in that order) sit between
    ``available_electrical`` and ``station_geo``.

    Parameters
    ----------
    input_path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    has_name : bool, default False
        Whether the file contains a ``stationName`` column.
    has_code : bool, default False
        Whether the file contains a ``stationCode`` column.

    Returns
    -------
    pandas.DataFrame or None
        The file content indexed by ``date`` (first column, parsed as
        datetime). If a ``KeyError`` is raised while reading, a message is
        printed and ``None`` is returned.
    """
    optional_cols = [
        name
        for name, present in (("stationName", has_name), ("stationCode", has_code))
        if present
    ]
    col_names = (
        ["date", "capacity", "available_mechanical", "available_electrical"]
        + optional_cols
        + ["station_geo", "operative"]
    )

    try:
        return pd.read_csv(
            input_path,
            names=col_names,
            header=None,
            index_col="date",
            parse_dates=[0],
        )
    except KeyError as err:
        print("Something wrong in '{}'. Error details:".format(input_path))
        print(err)
        return None

### Process 2022-04-30

In [None]:
# os.listdir() returns entries in arbitrary order, but the cell that builds
# histo_df slices this list by position to pick the column layout of each
# file. Sort the names so the order is deterministic; with the
# YYYY-MM-DD_HHMMSS suffix used in the file names, lexicographic order is also
# chronological.
data_path = sorted(os.listdir("../data"))

process_04_30 = [
    os.path.join("..", "data", file_name)
    for file_name in data_path
    if file_name.startswith("historique_stations_2022-04")
]

print(process_04_30)

In [None]:
# Create the full 'historique' frame for April.
# The files come in three column layouts, selected by position in the list:
# name only, then name + code, then code only.
name_only_frames = [
    get_historique_file(csv_path, has_name=True)
    for csv_path in process_04_30[:5]
]
name_and_code_frames = [
    get_historique_file(csv_path, has_name=True, has_code=True)
    for csv_path in process_04_30[5:9]
]
code_only_frames = [
    get_historique_file(csv_path, has_code=True)
    for csv_path in process_04_30[9:]
]

# Each newer group is stacked in front of what has been accumulated so far.
histo_df = pd.concat(name_only_frames, axis=0)
histo_df = pd.concat(name_and_code_frames + [histo_df], axis=0)
histo_df = pd.concat(code_only_frames + [histo_df], axis=0)

# Order the rows by their date index.
histo_df = histo_df.sort_index()

In [None]:
histo_df.to_parquet("..\data\Summary_2022-04-30.parquet")

In [None]:
histo_df

In [None]:
aux_df=pd.read_parquet("..\data\Summary_2022-04-30.parquet")

In [None]:
# Round-trip check: the frame read back from parquet should equal the one
# that was written.
aux_df.equals(histo_df)

### Process 2022-05-01

In [None]:
# os.listdir() returns entries in arbitrary order; sort them so the file
# lists derived from this listing are built in a deterministic order.
data_path = sorted(os.listdir("../data"))

In [None]:
# Paths of the 'historique_stations' files recorded on 2022-05-01.
histo_05_01_names = [
    name for name in data_path
    if name.startswith("historique_stations_2022-05-01")
]
histo_05_01 = [os.path.join("..", "data", name) for name in histo_05_01_names]

In [None]:
# Paths of the 'station_status' files recorded on 2022-05-01.
status_05_01_names = [
    name for name in data_path
    if name.startswith("station_status_2022-05-01")
]
status_05_01 = [os.path.join("..", "data", name) for name in status_05_01_names]

In [None]:
# NOTE(review): this result is overwritten by the next assignment before it
# is used, so the 'historique' files do not end up in the saved summary.
# Confirm whether that is intended.
histo_df = pd.concat([get_historique_file(file_path, has_code=True) for file_path in histo_05_01], axis=0)

# Stack all status files of the day.
histo_df = pd.concat([get_status_df(file_path) for file_path in status_05_01], axis=0)

# Order the rows by their date index (reassignment instead of inplace=True, so
# re-running the cell does not depend on earlier mutation).
histo_df = histo_df.sort_index()

# Build the path with os.path.join rather than a literal "..\data\..." string:
# "\d" and "\S" are invalid escape sequences, and a backslash is only a path
# separator on Windows. The summaries are later looked up in "../data".
histo_df.to_parquet(os.path.join("..", "data", "Summary_2022-05-01.parquet"))

### Other dates

In [None]:
import daily_update as du

In [None]:
# Run du.collect_statuses() for each day from 2022-05-02 to 2022-05-09.
days = ["{:02d}".format(day_number) for day_number in range(2, 10)]
for day in days:
    date_str = "2022-05-" + day
    du.collect_statuses(date_str)
    print("Completed: ", date_str)

# Explo data

In [None]:
# Ignore first days, with different format: only the May summaries are kept.
# os.listdir() returns entries in arbitrary order; sort the names so that
# positional lookups such as summaries[3] always refer to the same file.
data_path = sorted(os.listdir("../data"))

summaries = [
    os.path.join("..", "data", file_name)
    for file_name in data_path
    if file_name.startswith("Summary_2022-05")
]

print(summaries)

In [None]:
# Read every selected summary file and stack them into a single frame.
summary_frames = [pd.read_parquet(summary_path) for summary_path in summaries]
data_df = pd.concat(summary_frames, axis=0)
data_df

Oups ! Dans le tableau ci-dessus, il s'est passé des choses horribles : à vue de nez, un échange entre `available_mechanical` et `operative`, ou quelque chose dans ce goût-là ! Il va être nécessaire d'étudier le comportement de plus près !

In [None]:
# Inspect the first summary file of the list.
pd.read_parquet(summaries[0])

In [None]:
# Inspect the last summary file of the list.
pd.read_parquet(summaries[-1])

On voit le problème ci-dessus (`available_mechanical` semble avoir pris la place de `operative`)

In [None]:
# Inspect the summary at index 2, with fully duplicated rows removed.
pd.read_parquet(summaries[2]).drop_duplicates()

In [None]:
# Inspect the summary at index 3.
pd.read_parquet(summaries[3])

Le problème apparaît dans `summaries[3]` ! 

NB : 
* comme les données ont été récoltées par tranche de 10 min et que le code n'a pas changé à minuit, il va sans doute être nécessaire d'enquêter de plus près !
* on voit ci-dessus une grosse duplication de lignes pour la station `21110` : il serait sans doute judicieux de faire un `drop_duplicates()`, voire même de le faire directement dans le code de création des « summaries » !

In [None]:
# Inspect the summary at index 4, with fully duplicated rows removed.
pd.read_parquet(summaries[4]).drop_duplicates()