In [1]:
# import libraries
import numpy as np
import pandas as pd
from google.colab import drive
import re

In [2]:
# Mount Google Drive at /content/drive so the files under /content/drive/MyDrive can be read and written.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def read_txt_into_df(file_path, header_lines=10, footer_lines=6):
  """Parse a monthly new-home-permit text report into a DataFrame.

  A record starts on a line beginning with a 3-digit CSA code followed by a
  5-digit CBSA code. Lines that follow a record and do not start a new one are
  treated as a continuation of that record (e.g. a wrapped "Name" field) and
  are joined to it with a single space.

  Args:
    file_path: Path of the text report to read.
    header_lines: Number of leading lines to discard before looking for records.
    footer_lines: Number of trailing lines to discard.

  Returns:
    A DataFrame with the columns "CSA", "CBSA", "Name" (strings as they appear
    in the file) and seven count/percent columns converted with pd.to_numeric.
  """
  column_names = ["CSA", "CBSA", "Name", "Total", "1 Unit", "2 Units", "3 & 4 Units", "5 Units or more", "Structures with 5 Units or more", "Monthly Coverage Percent"]
  numeric_columns = column_names[3:]

  # A line opens a new record when it starts with "<3 digits> <5 digits>".
  record_start = re.compile(r'\d{3}\s+\d{5}')
  # Full row layout once wrapped lines have been joined: codes, name, seven integers.
  row_pattern = re.compile(r'(\d{3})\s+(\d{5})\s+(.+?)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')

  with open(file_path, 'r') as file:
    lines = file.readlines()
  # Explicit end index instead of a negative slice so footer_lines=0 keeps every line.
  body = lines[header_lines:max(len(lines) - footer_lines, 0)]

  # Stitch wrapped records back together.
  records = []
  current = None
  for line in body:
    if record_start.match(line):
      if current is not None:
        records.append(current.strip())
      current = line.strip()
    elif current is not None:
      current += " " + line.strip()
    # Otherwise: text before the first record (blank lines, column captions).
    # It belongs to no record, so it is skipped rather than emitted as a row.
  if current is not None:
    records.append(current.strip())

  rows = []
  for record in records:
    match = row_pattern.match(record)
    if match:
      rows.append(match.groups())
      continue
    # Fallback for records the row pattern cannot match (e.g. not exactly seven
    # plain-integer fields): everything after the two codes up to the first
    # token that starts with a digit is the name; the remaining tokens are kept
    # as-is. No field-count validation is done here.
    parts = record.split()
    name_parts = []
    for part in parts[2:]:
      if re.match(r'\d+', part):
        break
      name_parts.append(part)
    numeric_data = parts[2 + len(name_parts):]
    rows.append((parts[0], parts[1], ' '.join(name_parts)) + tuple(numeric_data))

  df = pd.DataFrame(rows, columns=column_names)
  df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
  return df

In [4]:
# Try the text parser on a single monthly file (the 2010-01 report) and display the parsed frame.
file_path = '/content/drive/MyDrive/MADS Capstone Team 23/Data/raw/NewHomePermit/NewHomePermit-2010-01.txt'
df = read_txt_into_df(file_path)
df

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 & 4 Units,5 Units or more,Structures with 5 Units or more,Monthly Coverage Percent
0,999,10180,"Abilene, TX",18,18,0,0,0,0,96
1,184,10420,"Akron, OH",21,21,0,0,0,0,71
2,999,10500,"Albany, GA",21,11,2,3,5,1,92
3,104,10580,"Albany-Schenectady-Troy, NY",48,48,0,0,0,0,69
4,999,10740,"Albuquerque, NM",100,90,0,0,10,1,100
...,...,...,...,...,...,...,...,...,...,...
360,999,49420,"Yakima, WA",12,12,0,0,0,0,35
361,564,49620,"York-Hanover, PA",27,27,0,0,0,0,66
362,566,49660,"Youngstown-Warren-Boardman, OH-PA",7,7,0,0,0,0,80
363,472,49700,"Yuba City, CA",8,8,0,0,0,0,82


In [8]:
# Load the 2019-11 report, which is stored as an .xls workbook instead of .txt.
# skiprows=7 discards the first seven rows of the sheet; the next row becomes the column header.
new_df = pd.read_excel('/content/drive/MyDrive/MADS Capstone Team 23/Data/raw/NewHomePermit/NewHomePermit-2019-11.xls', skiprows=7)
# Drop the first row below the header (presumably a blank spacer row; confirm against the workbook).
new_df = new_df[1:]

In [10]:
# Ad-hoc check on new_df: for each city, take the 'Total' of the first row whose
# 'Name' contains the city name (case-insensitive).
cities_to_find = ['New York', 'Chicago', 'Los Angeles', 'Dallas', 'Seattle']
totals = []
for city in cities_to_find:
  total = new_df.loc[new_df['Name'].str.contains(city, case=False), 'Total'].values
  # None marks a city with no matching row.
  totals.append(total[0] if total.size > 0 else None)
totals

[6967.0, 1628.0, 2033.0, 5734.0, 2803.0]

In [12]:
def get_total_permits(df, cities=None):
  """Look up the 'Total' value for a list of cities in a permit DataFrame.

  For each city, the first row whose 'Name' contains the city name
  (case-insensitive; the name is interpreted as a regular expression, which is
  the pandas default for str.contains) supplies the value.

  Args:
    df: DataFrame with at least the columns 'Name' and 'Total'.
    cities: Optional iterable of city names. Defaults to New York, Chicago,
      Los Angeles, Dallas and Seattle, in that order.

  Returns:
    A list with one entry per city, in the order given: the matching 'Total'
    value, or None when no row matches.
  """
  if cities is None:
    cities = ['New York', 'Chicago', 'Los Angeles', 'Dallas', 'Seattle']

  totals = []
  for city in cities:
    # na=False makes rows with a missing Name count as "no match"; without it the
    # mask can contain NaN, which .loc refuses to use as a boolean indexer.
    is_match = df['Name'].str.contains(city, case=False, na=False)
    matched_totals = df.loc[is_match, 'Total'].values
    totals.append(matched_totals[0] if matched_totals.size > 0 else None)
  return totals

In [13]:
# One row per tracked city. The row order must stay the same as the city order
# used inside get_total_permits, because the monthly totals are attached by position.
newhome_df = pd.DataFrame({
    'city': ['New York', 'Chicago', 'Los Angeles', 'Dallas', 'Seattle']
})
new_columns = {}

for year in range(2010, 2024):
  for month in range(1, 13):
    print(f"processing: {year}-{month}")
    period = f"{year}-{month:02d}"
    # Reports up to and including 2019-10 are read as .txt; later ones as .xls.
    is_text_report = (year, month) <= (2019, 10)
    file_path = '/content/drive/MyDrive/MADS Capstone Team 23/Data/raw/NewHomePermit/NewHomePermit-' + period + ('.txt' if is_text_report else '.xls')
    if is_text_report:
      df = read_txt_into_df(file_path)
    else:
      df = pd.read_excel(file_path, skiprows=7)[1:]  # skip the empty line
    totals = get_total_permits(df)
    new_columns[period] = totals

# Attach all monthly columns in a single concat rather than growing the frame inside the loop.
newhome_df = pd.concat([newhome_df, pd.DataFrame(new_columns)], axis=1)
newhome_df


processing: 2010-1
processing: 2010-2
processing: 2010-3
processing: 2010-4
processing: 2010-5
processing: 2010-6
processing: 2010-7
processing: 2010-8
processing: 2010-9
processing: 2010-10
processing: 2010-11
processing: 2010-12
processing: 2011-1
processing: 2011-2
processing: 2011-3
processing: 2011-4
processing: 2011-5
processing: 2011-6
processing: 2011-7
processing: 2011-8
processing: 2011-9
processing: 2011-10
processing: 2011-11
processing: 2011-12
processing: 2012-1
processing: 2012-2
processing: 2012-3
processing: 2012-4
processing: 2012-5
processing: 2012-6
processing: 2012-7
processing: 2012-8
processing: 2012-9
processing: 2012-10
processing: 2012-11
processing: 2012-12
processing: 2013-1
processing: 2013-2
processing: 2013-3
processing: 2013-4
processing: 2013-5
processing: 2013-6
processing: 2013-7
processing: 2013-8
processing: 2013-9
processing: 2013-10
processing: 2013-11
processing: 2013-12
processing: 2014-1
processing: 2014-2
processing: 2014-3
processing: 2014-4


Unnamed: 0,city,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,...,2023-03,2023-04,2023-05,2023-06,2023-07,2023-08,2023-09,2023-10,2023-11,2023-12
0,New York,874,934,1453,1072,1352,2706,1358,1499,1430,...,5633.0,2514.0,4618.0,3046.0,3026.0,4372.0,1954.0,3105.0,2103.0,2867.0
1,Chicago,345,370,561,543,924,734,631,651,577,...,1754.0,1467.0,1053.0,1397.0,967.0,1417.0,1330.0,1368.0,987.0,895.0
2,Los Angeles,433,869,968,893,612,1018,1053,1673,596,...,1851.0,2447.0,2638.0,3180.0,2484.0,3532.0,1870.0,1630.0,2601.0,2265.0
3,Dallas,1565,1382,2296,1481,1264,1683,1897,1810,1641,...,4291.0,4793.0,6860.0,5532.0,5954.0,5334.0,4509.0,6137.0,5146.0,3759.0
4,Seattle,1190,681,686,680,647,804,716,1327,748,...,1426.0,993.0,2336.0,1294.0,1068.0,1751.0,1271.0,1611.0,1096.0,1592.0


In [20]:
# Write the city-by-month table to Drive; index=False leaves the row index out of the CSV.
# NOTE(review): the file name says 2010_2024, but the aggregation loop iterates
# range(2010, 2024), so the monthly columns run from 2010-01 through 2023-12.
newhome_df.to_csv('/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/NewHomeBuilt/newhome_permits_2010_2024.csv', index=False)
