# Generate daily summaries

In [2]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd

In [None]:
from src.daily_update import collect_statuses

### Process 2022-04-30

In [None]:
from src.daily_update import get_historique_file


# List the raw files in lexicographic order. os.listdir() returns entries in
# arbitrary order, while the positional slices below decide which parsing
# options are applied to which files, so the ordering must be deterministic.
# NOTE(review): assumes lexicographic order of the date-stamped file names is
# the order the slice boundaries (5 and 9) were chosen for — confirm.
data_path = sorted(os.listdir("data"))

process_04_30 = [os.path.join("data", file_name) for file_name in data_path if file_name.startswith("historique_stations_2022-04")]

#print(process_04_30)

# Create full histo df. The files are read with three different option sets,
# selected by position in the sorted list:
#   [:5]  -> has_name=True
#   [5:9] -> has_name=True, has_code=True
#   [9:]  -> has_code=True
histo_df = pd.concat([get_historique_file(file_path, has_name=True) 
                        for file_path in process_04_30[:5]], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_name=True, has_code=True) 
                        for file_path in process_04_30[5:9]]+[histo_df], axis=0)

histo_df = pd.concat([get_historique_file(file_path, has_code=True) 
                        for file_path in process_04_30[9:]]+[histo_df], axis=0)

# Order rows by index after stacking the three groups.
histo_df.sort_index(inplace=True)

#histo_df.drop_duplicates(inplace=True)

In [None]:
# Move the index back into a regular column so later cells can treat it as data.
histo_df = histo_df.reset_index()

In [None]:
# How many rows have no 'stationCode'?
histo_df["stationCode"].isna().sum()

#### Fill in missing `stationCode`

In [None]:
# Load the station reference table. Column names are supplied explicitly and
# the frame is indexed by 'stationName'.
# The path is built with os.path.join (as in the other cells) instead of a
# hard-coded Windows separator, so the cell also runs on POSIX systems.
station_ref = pd.read_csv(os.path.join("data", "station_info_2022-05-15.csv"),
                            names=["stationCode", "stationName", "capacity", "station_geo", "operative"],
                            index_col="stationName")

# Keep only 'stationCode' and 'station_geo'.
station_ref.drop(columns=["capacity", "operative"], inplace=True)

In [None]:
# Join on 'station_geo' rather than 'stationName': several stations can share
# the same name.
# NOTE(review): the merge uses pandas' default join type (inner), so rows whose
# 'station_geo' has no match in station_ref are dropped — confirm this is intended.
prod_df = pd.merge(histo_df, station_ref, left_on="station_geo", right_on="station_geo")

In [None]:
# Sanity check: count rows where the reference code ('stationCode_y') is missing.
prod_df["stationCode_y"].isna().sum()

In [None]:
# Count rows where the original and the reference codes disagree.
# Only rows where both codes are defined are compared: a missing value never
# compares equal to anything, so a plain `!=` over the whole frame would also
# count every row whose original 'stationCode' is missing.
both_defined = prod_df["stationCode_x"].notna() & prod_df["stationCode_y"].notna()
(prod_df.loc[both_defined, "stationCode_x"] != prod_df.loc[both_defined, "stationCode_y"]).sum()

In [None]:
# Display an existing summary file for comparison.
# The path is built with os.path.join instead of a hard-coded Windows
# separator, so the cell also runs on POSIX systems.
pd.read_parquet(os.path.join("data", "Summary_2022-05-17.parquet"))

In [None]:
# Discard the original code column, keep the reference one under the name
# 'stationCode', and index the frame by 'file_time' (renamed from 'date').
histo_df = (
    prod_df
    .drop(columns=["stationCode_x"])
    .rename(columns={"date":"file_time", "stationCode_y": "stationCode"})
    .set_index("file_time")
)



#### Save to parquet, under standard format

In [None]:
# Write only the standard summary columns to the parquet file.
save_cols = [
    "stationCode",
    "operative",
    "available_mechanical",
    "available_electrical",
]
save_path = os.path.join("data", "Summary_2022-04-30.parquet")

histo_df.loc[:, save_cols].to_parquet(save_path)

In [None]:
# Reload the file to check what was written.
# The path is built with os.path.join instead of a hard-coded Windows
# separator, so the cell also runs on POSIX systems.
pd.read_parquet(os.path.join("data", "Summary_2022-04-30.parquet"))


### Process 2022-05-01

In [None]:
from src.daily_update import get_historique_file, get_status_df


data_path = os.listdir("data")

# Two kinds of raw files exist for this day: 'historique_stations_*' and
# 'station_status_*'.
histo_05_01 = [
    os.path.join("data", name)
    for name in data_path
    if name.startswith("historique_stations_2022-05-01")
]
status_05_01 = [
    os.path.join("data", name)
    for name in data_path
    if name.startswith("station_status_2022-05-01")
]

# Each kind is read with its own loader, then stacked.
histo_df0 = pd.concat(
    [get_historique_file(path, has_code=True) for path in histo_05_01],
    axis=0,
)
histo_df1 = pd.concat(
    [get_status_df(path, has_header=False) for path in status_05_01],
    axis=0,
)

# Combine both sources and order the rows by index before saving.
histo_df = pd.concat([histo_df0, histo_df1], axis=0).sort_index()

save_path = os.path.join("data", "Summary_2022-05-01.parquet")
histo_df.to_parquet(save_path)

In [None]:
#save_path = os.path.join("data", "Summary_2022-05-01.parquet")

#pd.read_parquet(save_path).reset_index().rename(columns={"date":"file_time"}).set_index("file_time").to_parquet(save_path)

In [None]:
# Reload the saved 2022-05-01 summary to inspect it.
save_path = os.path.join("data", "Summary_2022-05-01.parquet")
pd.read_parquet(save_path)

### Process 2022-05-02 and 2022-05-03 (no headers!)

In [None]:
date_range = pd.date_range("2022-05-02", "2022-05-03")

# Run the daily collection for each day; the files for these days have no header row.
for day in date_range:
    date_str = day.date().isoformat()
    collect_statuses(date_str, has_header=False)
    print("Completed: ", date_str)

### Process 2022-05-04, with its specific issue

The data from 2022-05-04 has a specific issue: partway through the day, some columns appear to be permuted (apparently `available_mechanical` and `operative`, or something similar).

The change happens between:
* `station_status_2022-05-04_213207.csv`
* `station_status_2022-05-04_213253.csv`

NB: to find where the change happened, look for a sudden jump in the seconds component of the file-name timestamps (here from `07` to `53`).

In [None]:
from src.daily_update import extract_enrich_data

# Sort the listing so the frames are concatenated in file-name order.
# os.listdir() returns entries in arbitrary order, and later cells rely on row
# order (e.g. `.iloc[-5:]` and time-window slicing).
data_path = sorted(os.listdir("data"))

status_05_04 = [os.path.join("data", file_name) for file_name in data_path if file_name.startswith("station_status_2022-05-04")]

# Last file before the column change; files are split by comparing their paths
# as strings against this cutoff.
cutoff_05_04 = os.path.join("data", "station_status_2022-05-04_213207.csv")

part_0 = [status for status in status_05_04 if status <= cutoff_05_04]
part_1 = [status for status in status_05_04 if status > cutoff_05_04]

# Both parts are header-less; they are kept in separate frames so their columns
# can be inspected independently.
histo_df0 = pd.concat([extract_enrich_data(file_path, has_header=False) for file_path in part_0], axis=0)

histo_df1 = pd.concat([extract_enrich_data(file_path, has_header=False) for file_path in part_1], axis=0)

Checking which column corresponds to mechanical and which one corresponds to electrical...

In [None]:
# Last five rows of station 12109 in the first part of the day.
histo_df0.loc[histo_df0["stationCode"] == "12109"].tail(5)

In [None]:
# Display the frame built from the second group of files (part_1).
histo_df1

##### Concatenate dataframes!

In [None]:
from src.utils import permute_cols_names

# NOTE(review): the column permutation of the second part is currently
# disabled — confirm whether histo_df1 needs it before saving.
#histo_df1 = permute_cols_names(histo_df1)

# Stack both parts of the day, first part on top.
histo_df = pd.concat([histo_df0, histo_df1])

save_path = os.path.join("data", "Summary_2022-05-04.parquet")
histo_df.to_parquet(save_path)

#### Quick checks

In [None]:
# Reload the saved 2022-05-04 summary to inspect it.
save_path = os.path.join("data", "Summary_2022-05-04.parquet")
pd.read_parquet(save_path)

In [None]:
# Rows of station 12109 in the hour around the column change.
station_12109 = histo_df[histo_df["stationCode"] == "12109"]
station_12109.loc["2022-05-04 21:00":"2022-05-04 22:00"]

### Process 2022-05-05 to 2022-05-09

In [None]:
date_range = pd.date_range("2022-05-05", "2022-05-09")

for day in date_range:
    date_str = day.date().isoformat()

    # Run the daily collection on the header-less files.
    # Presumably this writes data/Summary_<date>.parquet, which is read back below — verify.
    collect_statuses(date_str, has_header=False)
    print("Completed: ", date_str)

    # Re-open the day's summary, apply permute_cols_names and overwrite the file.
    save_path = os.path.join("data", f"Summary_{date_str}.parquet")
    aux_df = pd.read_parquet(save_path)

    permuted_df = permute_cols_names(aux_df)
    permuted_df.to_parquet(save_path)

#### Quick checks

In [None]:
# Reload the saved 2022-05-09 summary to inspect it.
save_path = os.path.join("data", "Summary_2022-05-09.parquet")
aux_df = pd.read_parquet(save_path)

# Display the reloaded frame.
aux_df

### Process 2022-05-10 (from no headers to built-in headers)

There is a specific change on 2022-05-10:
* until `station_status_2022-05-10_111159.csv`, the csv file has no header (and there is a permutation among columns...)
* from `station_status_2022-05-10_111230.csv`, the csv file has a header!

In [None]:
from src.daily_update import extract_enrich_data


# Sort the listing so the frames are concatenated in file-name order.
# os.listdir() returns entries in arbitrary order, and nothing downstream
# re-sorts the rows before the time-window check and the save.
data_path = sorted(os.listdir("data"))

status_05_10 = [os.path.join("data", file_name) for file_name in data_path if file_name.startswith("station_status_2022-05-10")]

# Last header-less file; files are split by comparing their paths as strings
# against this cutoff.
cutoff_05_10 = os.path.join("data", "station_status_2022-05-10_111159.csv")

part_0 = [status for status in status_05_10 if status <= cutoff_05_10]
part_1 = [status for status in status_05_10 if status > cutoff_05_10]

# First part: no header row in the CSV files.
histo_df0 = pd.concat([extract_enrich_data(file_path, has_header=False) for file_path in part_0], axis=0)

# Second part: read with the loader's default header handling.
histo_df1 = pd.concat([extract_enrich_data(file_path) for file_path in part_1], axis=0)

In [None]:
from src.utils import permute_cols_names

# Apply the column permutation to the header-less part, then rename 'date' to 'time'.
permuted_df0 = permute_cols_names(histo_df0)
histo_df0 = permuted_df0.rename(columns={"date":"time"})

In [None]:
# Display the first (header-less) part after the column fix.
histo_df0

In [None]:
# Display the second part, for comparison with histo_df0.
histo_df1

In [None]:
# Stack both parts of the day, first part on top, and display the result.
histo_df = pd.concat([histo_df0, histo_df1])

histo_df

#### Quick checks: continuity

In [None]:
# Stations that can be used for this check:
# 21209 - Montrouge, Molière - République
# 2009 - Bourse
# 14138 - Porte de Vanves

# Rows of the chosen station in the hour around the switch to headers.
station_14138 = histo_df[histo_df["stationCode"] == "14138"]
station_14138.loc["2022-05-10 11:00":"2022-05-10 12:00"]

#### Saving file

In [None]:
# Write the combined 2022-05-10 frame to its summary file.
save_path = os.path.join("data", "Summary_2022-05-10.parquet")
histo_df.to_parquet(save_path)

### Regular case (from 2022-05-11 onwards)

In [None]:
# Wider range, kept for reference:
#date_range = pd.date_range("2022-05-11", "2022-05-17")
date_range = pd.date_range("2022-05-17", "2022-05-17")

# Regular case: run the daily collection with its default options.
for day in date_range:
    date_str = day.date().isoformat()
    collect_statuses(date_str)
    print("Completed: ", date_str)

### Compression for 2022-05-18

NB: this should only need to be run once; it is kept here for reference.

In [3]:
date_str = "2022-05-18"

file_name = f"Summary_{date_str}.parquet"
file_path = os.path.join("data", file_name)

# Read the summary and write it back to the same path with brotli compression.
aux_df = pd.read_parquet(file_path)
aux_df.to_parquet(file_path, compression="brotli")