In [1]:
# connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# import packages
import pandas as pd
import numpy as np

### How should the output df looks like?
#### Columns:
 - Year,
 - Population_In_Housing_Units,
 - Owner_Occupied_Population,
 - Renter-Occupied-Population,
 - OOP_Moved_From_Different_County,
 - OOP_Moved_From_Different_State,
 - OOP_Moved_From_Abroad,
 - ROP_Moved_From_Different_County,
 - ROP_Moved_From_Different_State,
 - ROP_Moved_From_Abroad.

In [40]:
def clean_column_names(df: pd.DataFrame) -> pd.DataFrame:
  # Rename the columns
  new_columns = []
  for col in df.columns:
    split = col.split('!!')
    if len(split) > 2:
      new_column = split[-2] + ' ' + split[-1]
    else:
      new_column = split[-1]
    new_columns.append(new_column)

  df.columns = new_columns
  df= df.rename(columns={"Label (Grouping)": "Label",
                         "Total Estimate": "Total Population",
                         "Moved; from different county, same state Estimate": "From Different County",
                         "Moved; from different  state Estimate": "From Different State",
                         "Moved; from abroad Estimate": "From Abroad"
                         })

  # filter out columns that are not requried
  df = df[["Label", "Total Population", "From Different County", "From Different State", "From Abroad"]]

  # filter our rows that are not requried
  df = df.iloc[61:64]

  # simplify the label
  df.loc[61]['Label'] = "Population 1 year and over in housing units"
  df.loc[62]['Label'] = "Householder lived in owner-occupied housing units"
  df.loc[63]['Label'] = "Householder lived in renter-occupied housing units"

  return df

In [41]:
def convert_population_to_int(df: pd.DataFrame) -> pd.DataFrame:
  df["Total Population"] = df["Total Population"].str.replace(',', '', regex=True).astype(int)
  df["From Different County"] = df["From Different County"].str.replace('%', '', regex=True).astype(float) / 100
  df["From Different State"] = df["From Different State"].str.replace('%', '', regex=True).astype(float) / 100
  df["From Abroad"] = df["From Abroad"].str.replace('%', '', regex=True).astype(float) / 100
  return df

def calculate_population_from_percentages(df: pd.DataFrame) -> pd.DataFrame:
  df["From Diferent County Population"] = (df["From Different County"] * df["Total Population"]).astype(int)
  df["From Diferent State Population"] = (df["From Different State"] * df["Total Population"]).astype(int)
  df["From Abroad Population"] = (df["From Abroad"] * df["Total Population"]).astype(int)
  return df


In [42]:
def build_data_entry(df: pd.DataFrame, year: int) -> pd.DataFrame:
  Population_In_Housing_Units = int(df[df["Label"]=="Population 1 year and over in housing units"]["Total Population"].iloc[0])
  Owner_Occupied_Population = int(df[df["Label"]=="Householder lived in owner-occupied housing units"]["Total Population"].iloc[0])
  Renter_Occupied_Population = int(df[df["Label"]=="Householder lived in renter-occupied housing units"]["Total Population"].iloc[0])
  OOP_Moved_From_Different_County = int(df[df["Label"]=="Householder lived in owner-occupied housing units"]["From Diferent County Population"].iloc[0])
  OOP_Moved_From_Different_State = int(df[df["Label"]=="Householder lived in owner-occupied housing units"]["From Diferent State Population"].iloc[0])
  OOP_Moved_From_Abroad = int(df[df["Label"]=="Householder lived in owner-occupied housing units"]["From Abroad Population"].iloc[0])
  ROP_Moved_From_Different_County = int(df[df["Label"]=="Householder lived in renter-occupied housing units"]["From Diferent County Population"].iloc[0])
  ROP_Moved_From_Different_State = int(df[df["Label"]=="Householder lived in renter-occupied housing units"]["From Diferent State Population"].iloc[0])
  ROP_Moved_From_Abroad = int(df[df["Label"]=="Householder lived in renter-occupied housing units"]["From Abroad Population"].iloc[0])

  data = {'Year': [year],
          'Population_In_Housing_Units': [Population_In_Housing_Units],
          'Owner_Occupied_Population': [Owner_Occupied_Population],
          'Renter-Occupied-Population': [Renter_Occupied_Population],
          'OOP_Moved_From_Different_County': [OOP_Moved_From_Different_County],
          'OOP_Moved_From_Different_State': [OOP_Moved_From_Different_State],
          'OOP_Moved_From_Abroad': [OOP_Moved_From_Abroad],
          'ROP_Moved_From_Different_County': [ROP_Moved_From_Different_County],
          'ROP_Moved_From_Different_State': [ROP_Moved_From_Different_State],
          'ROP_Moved_From_Abroad': [ROP_Moved_From_Abroad]}

  new_df = pd.DataFrame(data)
  return new_df

In [51]:
def process_population_data(city: str, start_year: int, end_year: int) -> pd.DataFrame:
  result = pd.DataFrame()
  for year in range(start_year, end_year + 1):
    print(f"Processing year: {year}")
    # build the file name
    file_name = f'{city}-{year}.csv'

    # read the csv into pandas data frame
    population_df = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/raw/Population/{city}/{file_name}')

    # process data
    population_df = clean_column_names(population_df)
    population_df = convert_population_to_int(population_df)
    population_df = calculate_population_from_percentages(population_df)
    new_df = build_data_entry(population_df, year)

    result = pd.concat([result, new_df])
    result.reset_index(drop=True, inplace=True)
  return result

In [52]:
result = process_population_data('NewYork', 2010, 2022)

result

Processing year: 2010
Processing year: 2011
Processing year: 2012
Processing year: 2013
Processing year: 2014
Processing year: 2015
Processing year: 2016
Processing year: 2017
Processing year: 2018
Processing year: 2019
Processing year: 2020
Processing year: 2021
Processing year: 2022


Unnamed: 0,Year,Population_In_Housing_Units,Owner_Occupied_Population,Renter-Occupied-Population,OOP_Moved_From_Different_County,OOP_Moved_From_Different_State,OOP_Moved_From_Abroad,ROP_Moved_From_Different_County,ROP_Moved_From_Different_State,ROP_Moved_From_Abroad
0,2010,18324602,10196421,8128181,112160,61178,30589,203204,162563,121922
1,2011,18414253,10024811,8389442,100248,60148,30074,226514,176178,125841
2,2012,18563646,10124897,8438749,101248,50624,40499,202529,168774,118142
3,2013,19304795,10618214,8686581,127418,74327,42472,225851,173731,112925
4,2014,19446773,10535713,8911060,115892,63214,42142,204954,169310,124754
5,2015,19531390,10592400,8938990,127108,74146,42369,214535,169840,116206
6,2016,19519193,10570716,8948477,158560,73995,42282,214763,161072,134227
7,2017,19693150,10864167,8828983,162962,76049,43456,203066,176579,132434
8,2018,19345504,10760287,8585217,172164,86082,32280,231800,163119,111607
9,2019,18615239,10196915,8418324,152953,71378,30590,218876,151529,109438


In [53]:
target = [
    ("NewYork", 2010, 2022),
    ("LosAngeles", 2010, 2022),
    ("Chicago", 2010, 2022),
    ("Dallas", 2010, 2022),
    ("Seattle", 2010, 2022)
]

for city, start_year, end_year in target:
  print
  print(f"Processing city: {city}")
  result = process_population_data(city, start_year, end_year)
  result.to_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Population_{city}_{start_year}_{end_year}.csv', index=False)

Processing city: NewYork
Processing year: 2010
Processing year: 2011
Processing year: 2012
Processing year: 2013
Processing year: 2014
Processing year: 2015
Processing year: 2016
Processing year: 2017
Processing year: 2018
Processing year: 2019
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing city: LosAngeles
Processing year: 2010
Processing year: 2011
Processing year: 2012
Processing year: 2013
Processing year: 2014
Processing year: 2015
Processing year: 2016
Processing year: 2017
Processing year: 2018
Processing year: 2019
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing city: Chicago
Processing year: 2010
Processing year: 2011
Processing year: 2012
Processing year: 2013
Processing year: 2014
Processing year: 2015
Processing year: 2016
Processing year: 2017
Processing year: 2018
Processing year: 2019
Processing year: 2020
Processing year: 2021
Processing year: 2022
Processing city: Dallas
Processing year: 2010
Processing year: 2