In [31]:
from openpyxl import load_workbook
import pandas as pd
import os

In [32]:
def extract_ridership(col_lst) -> dict:
    """ Extracts the ridership values from a list of list and puts it in a dictionary.
        This function already partially cleans the data by removing values that are not integers (i.e, str and None types)

    Args:
        col_lst (list): This is a list of list that contains ridership values for each station—grouped by columns.

    Returns:
        dict: This a dictionary containing the ridership numbers as its values and the station names as its keys
    """
    
    station_riders_dict = {}
    
    for col in col_lst:
        
        value_list = []
        str_list = []
        
        for cell in col:
            if isinstance(cell, int):
                value_list.append(cell)
            
            if isinstance(cell, str):
                str_list.append(cell)   
        
        station_name = str_list[0]
        
        station_riders_dict[station_name] = value_list
    
    return station_riders_dict

In [33]:
def extract_hours(row_lst) -> list:
    """ Extracts the hour values from a list of list and puts it in a list.
        This function only extracts the hour values that have ridership values associated with it.

    Args:
        row_list (list): This is a list of list that contains values for all stations—grouped by rows.

    Returns:
        list: This a list containing the hours that have values associated with it (i.e., non-empty entries)
    """
    
    hours_list = []
    
    for row in row_lst:
        if isinstance(row[5], int):
            hours_list.append(row[0])
    
    return hours_list

In [34]:
def generate_date(row_lst, excel_file) -> list:
    """Generates date associated with the entries from the table.
       The assumption of this function is that if the cell contains the string, "Entry",
       then it means that following set of values belong to a new day/date.
        
    Args:
        row_lst (list): This is a list of list that contains values for all stations—grouped by rows.

    Returns:
        list: This is a list containing the dates that have values associated with it (i.e., non-empty entries)
    """
    
    dates_list = []
    day = 0
    year_month = excel_file.split(".")[0]
    
    for row in row_lst:
        if row[1] == "Entry":
            day += 1
        
        if isinstance(row[1], int):
            dates_list.append(f"{year_month}-{day}")
        
    return dates_list

In [None]:
def troubleshoot():
    return None

In [55]:
#  list of all available excel files from "data" folder
excel_file_list = os.listdir("data")

hours_list = []
dates_list = []

for excel_file in excel_file_list:
    
    try:
        daily = load_workbook(filename=f"data/{excel_file}")["Daily"]
        
        daily_total_row = daily.max_row
        
        daily_col_range = daily.iter_cols(min_col=2, max_col=27, min_row=1, max_row=daily_total_row, values_only=True)
        daily_row_range = daily.iter_rows(min_row=1, max_row=daily_total_row, min_col=1, max_col=27, values_only=True)
        
        daily_cols = [col for col in daily_col_range]
        daily_rows = [row for row in daily_row_range]
        
        for hour in extract_hours(daily_rows):
            hours_list.append(hour)
        
        for date in generate_date(daily_rows, excel_file):
            dates_list.append(date)
    except ValueError:
        pass
    
    

In [57]:
print(len(hours_list))
print(len(dates_list))
print(len(hours_list)-len(dates_list))

26585
26578
7
