# Data Staging
### Extract, Transform, Load (ETL)

**Create Dataframe for Facility Locations in Ontario**


In [204]:
import pandas as pd

Extract Facility Location

In [205]:
# Read the file containing information about recreation centre locations and types
# into the facility location dataframe.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column never ends up with mixed types.
# NOTE(review): the echoed source line in this cell's saved output looks like the
# tail of a pandas mixed-dtype warning - confirm it disappears after re-running.
df = pd.read_csv('./ODRSF_v1.0.csv', encoding='cp1252', low_memory=False)

  df = pd.read_csv('./ODRSF_v1.0.csv', encoding='cp1252')


Transform Facility Dataframe

In [206]:
# Columns that the staging step does not need.
# 'Unit' is listed last: it is removed because almost all of its values are null.
unused_columns = [
    'Index',
    'Facility_Name',
    'Source_Facility_Type',
    'Provider',
    'City',
    'Source_Format_Address',
    'CSD_UID',
    'PR_UID',
    'Latitude',
    'Longitude',
    'Unit',
]
df = df.drop(unused_columns, axis=1)

In [207]:
# '..' marks an empty value in the facility file.
# Empty street numbers become the string '-1' so the column can later be cast
# to an integer type.
df.loc[df['Street_No'] == '..', 'Street_No'] = '-1'

# Every other address field gets an empty string in place of the marker.
# Each column is handled exactly once.
for address_column in ['Street_Name', 'Street_Type', 'Street_Direction', 'Postal_Code']:
    df.loc[df[address_column] == '..', address_column] = ''

In [208]:
# Cast the Street_No column to integers; all other columns keep their dtype
df = df.astype({'Street_No': int})

In [209]:
# Upper-case every value in the province/territory column
prov_terr_upper = df['Prov_Terr'].str.upper()
df['Prov_Terr'] = prov_terr_upper

In [210]:
# Keep only the rows whose province/territory is Ontario ('ON')
is_ontario = df['Prov_Terr'] == 'ON'
df = df.loc[is_ontario]

In [211]:
# Distinct CSD_Name values remaining in the facility dataframe
unique_cities = df.loc[:, 'CSD_Name'].unique()

**Create Dataframe for City Demographics in Ontario**

In [212]:
# Load the Ontario demographics file into its own dataframe
demographic_df = pd.read_csv(
    './Ontario_Demographics.csv',
    encoding='cp1252',
)

In [213]:
# Demographic columns that are not used by the rest of the notebook
unused_demographic_columns = [
    'CENSUS_YEAR',
    'DGUID',
    'ALT_GEO_CODE',
    'GEO_LEVEL',
    'TNR_SF',
    'TNR_LF',
    'DATA_QUALITY_FLAG',
    'SYMBOL',
    'CHARACTERISTIC_ID',
    'CHARACTERISTIC_NOTE',
    'C2_COUNT_MEN+',
    'C3_COUNT_WOMEN+',
    'C10_RATE_TOTAL',
    'C11_RATE_MEN+',
    'C12_RATE_WOMEN+',
    'SYMBOL.1',
    'SYMBOL.2',
    'SYMBOL.3',
    'SYMBOL.4',
    'SYMBOL.5',
]
demographic_df = demographic_df.drop(unused_demographic_columns, axis=1)

In [214]:
# Keep only the part of GEO_NAME before the first comma; the rest is not needed
# and dropping it makes later matching easier
geo_name_parts = demographic_df['GEO_NAME'].str.split(',')
demographic_df['GEO_NAME'] = geo_name_parts.str[0]

In [215]:
# Keep a demographic row only when its city also appears in the facility
# location dataframe
in_facility_cities = demographic_df['GEO_NAME'].isin(unique_cities)
demographic_df = demographic_df.loc[in_facility_cities]

In [216]:
characteristic_names = demographic_df['CHARACTERISTIC_NAME']

# Each pattern is a regex alternation "start marker|end marker".
age_marker_pattern = 'Total - Age groups of the population - 100% data|100 years and over'
ethnicity_marker_pattern = 'Total visible minority population|Total - Ethnic or cultural origin for the population in private households - 25% sample data'

# Index labels of the rows that start and end each city's age information
is_age_marker = characteristic_names.str.contains(age_marker_pattern)
age_indexes = demographic_df[is_age_marker].index

# Index labels of the rows that start and end each city's ethnicity information
is_ethnicity_marker = characteristic_names.str.contains(ethnicity_marker_pattern)
ethnicity_indexes = demographic_df[is_ethnicity_marker].index

In [217]:
selected_age_dfs = []
selected_ethnicity_dfs = []

# age_indexes / ethnicity_indexes hold index *labels*, while slicing a dataframe
# with integers (frame[a:b]) selects rows by *position*. demographic_df has been
# row-filtered without resetting its index, so labels and positions can differ.
# Translate the labels to positions once and slice with .iloc.
age_positions = demographic_df.index.get_indexer(age_indexes)
ethnicity_positions = demographic_df.index.get_indexer(ethnicity_indexes)

# The loop assumes the markers alternate start, end, start, end, ... and that
# the ethnicity markers line up pair-for-pair with the age markers.
for i in range(0, len(age_positions), 2):
    # Age rows: everything after the start marker, up to and including the end marker
    age_rows = demographic_df.iloc[age_positions[i] + 1: age_positions[i + 1] + 1]
    # Ethnicity rows: strictly between the start and end markers
    ethnicity_rows = demographic_df.iloc[ethnicity_positions[i] + 1: ethnicity_positions[i + 1]]
    selected_age_dfs.append(age_rows)
    selected_ethnicity_dfs.append(ethnicity_rows)

# Resulting long-format dataframes, re-indexed from 0
result_age_df = pd.concat(selected_age_dfs, ignore_index=True)
result_ethnicity_df = pd.concat(selected_ethnicity_dfs, ignore_index=True)

In [218]:
# Treat empty/null counts as 0
result_age_df['C1_COUNT_TOTAL'] = result_age_df['C1_COUNT_TOTAL'].fillna(0)
# Number of nulls left in the column (shown as the cell output)
result_age_df['C1_COUNT_TOTAL'].isna().sum()

0

In [219]:
# Treat empty/null counts as 0
result_ethnicity_df['C1_COUNT_TOTAL'] = result_ethnicity_df['C1_COUNT_TOTAL'].fillna(0)
# Number of nulls left in the column (shown as the cell output)
result_ethnicity_df['C1_COUNT_TOTAL'].isna().sum()

0

In [220]:
# Distinct city names, in order of first appearance in the age results
cities = result_age_df.loc[:, 'GEO_NAME'].unique()

Reshape Age Dataframe (one row per city, one column per age group)

In [221]:
# Distinct age-range labels; each one becomes a column of the new dataframe
ages = result_age_df['CHARACTERISTIC_NAME'].unique()

# Start with an empty frame that has one column per age range,
# then place a 'City' column in front of them
age_df = pd.DataFrame(columns=ages)
age_df.insert(0, 'City', '')

In [222]:
# Build the wide age dataframe: one row per city, one column per age range.
# Counts are consumed from result_age_df in row order: the first
# len(age_columns) counts fill the first city, the next block the second, etc.
# NOTE(review): this relies on every city having the same age ranges in the same
# order and on city names being unique; rows beyond the expected count are ignored.
age_columns = age_df.columns[1:]
expected_age_rows = len(cities) * len(age_columns)
if len(result_age_df) < expected_age_rows:
    raise ValueError(
        f'result_age_df has {len(result_age_df)} rows, expected at least '
        f'{expected_age_rows} ({len(cities)} cities x {len(age_columns)} age ranges)'
    )

# A single reshape replaces one scalar .loc write per cell
age_counts = result_age_df['C1_COUNT_TOTAL'].to_numpy()[:expected_age_rows]
age_df = pd.DataFrame(
    age_counts.reshape(len(cities), len(age_columns)),
    columns=age_columns,
)
# The city names go in the first column
age_df.insert(0, 'City', cities)

Reshape Ethnicity Dataframe (one row per city, one column per ethnicity)

In [223]:
# Distinct ethnicity labels; each one becomes a column of the new dataframe
ethnicities = result_ethnicity_df['CHARACTERISTIC_NAME'].unique()

# Start with an empty frame that has one column per ethnicity,
# then place a 'City' column in front of them
ethnicity_df = pd.DataFrame(columns=ethnicities)
ethnicity_df.insert(0, 'City', '')

In [224]:
# Build the wide ethnicity dataframe: one row per city, one column per ethnicity.
# Counts are consumed from result_ethnicity_df in row order: the first
# len(ethnicity_columns) counts fill the first city, the next block the second, etc.
# NOTE(review): this relies on every city having the same ethnicity rows in the
# same order and on city names being unique; rows beyond the expected count are ignored.
ethnicity_columns = ethnicity_df.columns[1:]
expected_ethnicity_rows = len(cities) * len(ethnicity_columns)
if len(result_ethnicity_df) < expected_ethnicity_rows:
    raise ValueError(
        f'result_ethnicity_df has {len(result_ethnicity_df)} rows, expected at least '
        f'{expected_ethnicity_rows} ({len(cities)} cities x {len(ethnicity_columns)} ethnicities)'
    )

# A single reshape replaces one scalar .loc write per cell
ethnicity_counts = result_ethnicity_df['C1_COUNT_TOTAL'].to_numpy()[:expected_ethnicity_rows]
ethnicity_df = pd.DataFrame(
    ethnicity_counts.reshape(len(cities), len(ethnicity_columns)),
    columns=ethnicity_columns,
)
# The city names go in the first column
ethnicity_df.insert(0, 'City', cities)

In [225]:
# Write both wide dataframes to CSV, leaving out the index column
for frame, csv_path in ((ethnicity_df, 'teste.csv'), (age_df, 'testa.csv')):
    frame.to_csv(csv_path, index=False)