In [1]:
# Imports the required Python libraries and 
# sets global variables for the assignment
import calendar
import datetime
from pathlib import PurePosixPath

today = datetime.date.today()
current_year = today.year
days_in_year = 365

if calendar.isleap(current_year):
    days_in_year +=1

hours_in_year = days_in_year * 24

In [2]:
# Creates paths for the external, interim, processed, and raw directories
# Use these paths when creating new paths

root_data_dir = PurePosixPath('/data')
external_data_dir = root_data_dir.joinpath('external')
interim_data_dir = root_data_dir.joinpath('interim')
processed_data_dir = root_data_dir.joinpath('processed')
raw_data_dir = root_data_dir.joinpath('raw')

print('Root Data Directory: {}'.format(root_data_dir))
print('External Data Directory: {}'.format(external_data_dir))
print('Interim Data Directory: {}'.format(interim_data_dir))
print('Processed Data Directory: {}'.format(processed_data_dir))
print('Raw Data Directory: {}'.format(raw_data_dir))

Root Data Directory: /data
External Data Directory: /data/external
Interim Data Directory: /data/interim
Processed Data Directory: /data/processed
Raw Data Directory: /data/raw


In [3]:
acs_summary_file_dirs = set()
pums_dirs = set()
tiger_dirs = set()

# TODO: Create and add the paths for this data set

In [4]:
# Will generate directories to reduce repeated code
# This function is continiously used throughout assignment
def DirGenerator(root, dir_names, add_to=None):
    paths = [root.joinpath(directory) for directory in dir_names]
    # Appends to a set for refrence
    if add_to is not None:
        for path in paths:
            add_to.add(format(path))

In [5]:
# Creates initial directories
census_data_dir = external_data_dir.joinpath('census')
acs_summary_data_dir = census_data_dir.joinpath('acs-summaryfile')
pums_data_dir = census_data_dir.joinpath('pums')
tiger_data_dir = census_data_dir.joinpath('tiger')

# Generates the directories
years = ['2015', '2016', '2017', '2018', '2019']       
DirGenerator(acs_summary_data_dir, years, add_to=acs_summary_file_dirs)

years = ['2015', '2016', '2017', '2018', '2019', '2020']
DirGenerator(pums_data_dir, years, add_to=pums_dirs)
DirGenerator(tiger_data_dir, years, add_to=tiger_dirs)

# Should output sorted directories from 2015 to present 
sorted(list(acs_summary_file_dirs)), sorted(list(pums_dirs)), sorted(list(tiger_dirs))

(['/data/external/census/acs-summaryfile/2015',
  '/data/external/census/acs-summaryfile/2016',
  '/data/external/census/acs-summaryfile/2017',
  '/data/external/census/acs-summaryfile/2018',
  '/data/external/census/acs-summaryfile/2019'],
 ['/data/external/census/pums/2015',
  '/data/external/census/pums/2016',
  '/data/external/census/pums/2017',
  '/data/external/census/pums/2018',
  '/data/external/census/pums/2019',
  '/data/external/census/pums/2020'],
 ['/data/external/census/tiger/2015',
  '/data/external/census/tiger/2016',
  '/data/external/census/tiger/2017',
  '/data/external/census/tiger/2018',
  '/data/external/census/tiger/2019',
  '/data/external/census/tiger/2020'])

In [6]:
# Code below is used to build dict structure {month: [days]}
# Helpful for passing to function DirGenerator
import calendar

# Collapses dates in calander dates list
def flatten(dates):
    for i in dates:
        if isinstance(i, list):
            for j in flatten(i):
                yield j
        else:
            yield i
            

cal = calendar.Calendar()
year = 2020

# Creates dict with {month: []} pairs
dates = list(flatten(cal.yeardatescalendar(year)))
months = set(str(date.month) for date in dates)
month_days = {month: [] for month in months}

# Adds list of values for each month's days within month_days
for date in dates:
    if str(date.day) not in month_days[str(date.month)]:
        month_days[str(date.month)].append(str(date.day))

# Creates padding with 0s (e.g., 1 to 01)
padded_dict = {}
for month, days in month_days.items():
    days = [str(day.zfill(2)) for day in days]
    padded_dict[str(month.zfill(2))] = list(days)
    
month_days = padded_dict

In [7]:
forecast_dirs = set()

# TODO: Create and add the paths for this data set

# Creates initial directories
nwc_wpc_data_dir = external_data_dir.joinpath('nwc-wpc')
forecasts_data_dir = nwc_wpc_data_dir.joinpath('forecasts')
for2020_data_dir = forecasts_data_dir.joinpath('2020')

In [8]:
# Creates directories for months
DirGenerator(for2020_data_dir, months)

# Creates directories for days in months dirs
parent = format(for2020_data_dir)
for month, days in month_days.items():
    child = PurePosixPath((f'{parent}/{month}'))
    DirGenerator(child, days, add_to=forecast_dirs)

# Should have 365 directories (366 if leap year)
len(forecast_dirs)

366

In [9]:
inventory_dirs = set()
expenses_dirs = set()

# TODO: Create and add the paths for this data set

In [10]:
# Creates initial directories
raw_expenses_data_dir = raw_data_dir.joinpath('expenses')
raw_inventory_data_dir = raw_data_dir.joinpath('inventory')

in_location_dirs = [raw_inventory_data_dir.joinpath('bwi'),
                    raw_inventory_data_dir.joinpath('cmh'),
                    raw_inventory_data_dir.joinpath('den'),
                    raw_inventory_data_dir.joinpath('oma'),
                    raw_inventory_data_dir.joinpath('sfo')]

ex_location_dirs = [raw_expenses_data_dir.joinpath('bwi'),
                    raw_expenses_data_dir.joinpath('cmh'),
                    raw_expenses_data_dir.joinpath('den'),
                    raw_expenses_data_dir.joinpath('oma'),
                    raw_expenses_data_dir.joinpath('sfo')]

In [11]:
# Creates directories for months in inventory location dirs
for location_dir in in_location_dirs:
    DirGenerator(location_dir, months)

# Creates directories for months in expenses location dirs
for location_dir in ex_location_dirs:
    DirGenerator(location_dir, months)

# Creates directories for days in inventory
for PosixPath in in_location_dirs:
    # Formats parent path
    parent = f'{format(PosixPath)}/2020'
    for month, days in month_days.items():
        child = PurePosixPath((f'{parent}/{month}'))
        DirGenerator(child, days, add_to=inventory_dirs)

# Creates directories for days in expenses
for PosixPath in ex_location_dirs:
    # Formats parent path
    parent = f'{format(PosixPath)}/2020'
    for month, days in month_days.items():
        child = PurePosixPath((f'{parent}/{month}'))
        DirGenerator(child, days, add_to=expenses_dirs)

In [12]:
# Should have 1825 directories (1830 if leap year)
len(inventory_dirs), len(expenses_dirs) 

(1830, 1830)

In [13]:
sales_dirs = set()

# TODO: Create and add the paths for this data set

In [14]:
# Creates initial directories
raw_sales_data_dir = raw_data_dir.joinpath('sales')

sales_location_dirs = [raw_sales_data_dir.joinpath('bwi'),
                       raw_sales_data_dir.joinpath('cmh'),
                       raw_sales_data_dir.joinpath('den'),
                       raw_sales_data_dir.joinpath('oma'),
                       raw_sales_data_dir.joinpath('sfo')]

In [15]:
# Creates directories for months in sales location dirs
for location_dir in sales_location_dirs:
    DirGenerator(location_dir, months)

# Creates directories for days in sales
for PosixPath in sales_location_dirs:
    # Formats parent path
    parent = f'{format(PosixPath)}/2020'
    for month, days in month_days.items():
        child = PurePosixPath((f'{parent}/{month}'))
        DirGenerator(child, days, add_to=sales_dirs)

# Lists hours in a day
hours = [str(hour).zfill(2) for hour in range(24)]

# Creates directories for hours in sales
hourly_sales_dirs = set()
for sales_day_dir in sales_dirs:
    parent = PurePosixPath(sales_day_dir)
    DirGenerator(parent, hours, add_to=hourly_sales_dirs) 

In [16]:
sales_dirs = hourly_sales_dirs

# Should have 43,800 directories (43,920 if leap year)
len(sales_dirs) 

43920

In [17]:
modeling_data_dirs = set()

# TODO: Create and add the paths for this data set

In [18]:
year = 2020
week_starter = 0

# Creates dict {month: [week_start_date]} for year
month_weeks = {str(month+1): [] for month in range(12)}
for i in range(12):
    month = i+1
    for date in cal.itermonthdays4(year, month):
        if date[0] == year and date[3] == week_starter:
            if str(date[2]) not in month_weeks[str(date[1])]:
                month_weeks[str(date[1])].append(str(date[2]))
            
# Creates padding with 0s (e.g., 1 to 01)
padded_dict = {}
for month, weeks in month_weeks.items():
    weeks = [str(week.zfill(2)) for week in weeks]
    padded_dict[str(month.zfill(2))] = list(weeks)
    
month_weeks = padded_dict

In [19]:
# Creates initial directory
modeling_data_dir = processed_data_dir.joinpath('modeling')

# Creates directories for months
DirGenerator(modeling_data_dir, months)

# Creates directories for days in months dirs
parent = format(modeling_data_dir)
for month, weeks in month_weeks.items():
    child = PurePosixPath((f'{parent}/{month}'))
    DirGenerator(child, weeks, add_to=modeling_data_dirs)

In [20]:
# Should have 52 directories
len(modeling_data_dirs)

52

In [21]:
inventory_request_dirs = set()

# TODO: Create and add the paths for this data set

In [22]:
# Creates initial directory
inventory_data_dir = processed_data_dir.joinpath('inventory')
requests_data_dir = processed_data_dir.joinpath('requests')
requests2020_data_dir = processed_data_dir.joinpath('2020')

# Generates the month directories
DirGenerator(requests2020_data_dir, month_days.keys(), add_to=inventory_request_dirs)

In [23]:
 # Should output 12 directories
sorted(list(inventory_request_dirs))

['/data/processed/2020/01',
 '/data/processed/2020/02',
 '/data/processed/2020/03',
 '/data/processed/2020/04',
 '/data/processed/2020/05',
 '/data/processed/2020/06',
 '/data/processed/2020/07',
 '/data/processed/2020/08',
 '/data/processed/2020/09',
 '/data/processed/2020/10',
 '/data/processed/2020/11',
 '/data/processed/2020/12']

In [24]:
financials_dirs = set()

# TODO: Create and add the paths for this data set

In [25]:
# Creates initial directory
financials_data_dir = processed_data_dir.joinpath('financials')
quarterly_data_dir = financials_data_dir.joinpath('quarterly')
quarterly2020_data_dir = quarterly_data_dir.joinpath('2020')

quarters = ['01', '02', '03', '04']

# Generates the quarterly directories
DirGenerator(quarterly2020_data_dir, quarters, add_to=financials_dirs)

In [26]:
# Should output four quarterly directories
sorted(list(financials_dirs)) 

['/data/processed/financials/quarterly/2020/01',
 '/data/processed/financials/quarterly/2020/02',
 '/data/processed/financials/quarterly/2020/03',
 '/data/processed/financials/quarterly/2020/04']