# Bronze - ZIP Codes Ingestion
# Purpose: Ingest raw zip codes data into Bronze layer
# Source: web page scraped from https://www.zip-codes.com/county/tx-harris.asp
# Output: bronze.bronze_zip_code (Delta table)

## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
from bs4 import BeautifulSoup
import requests

In [0]:
%sql
-- Make harris_county_catalog the current catalog for this session, so that
-- schema-qualified names without a catalog (e.g. bronze.<table>) resolve inside it.
USE CATALOG harris_county_catalog

In [0]:
# Page that lists the ZIP codes of Harris County, TX.
url = 'https://www.zip-codes.com/county/tx-harris.asp'

# HTTP headers sent with the page request: a desktop-Chrome User-Agent string.
headers = {
    'User-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/115.0.0.0 Safari/537.36'
    ),
}

## READ SOURCE

In [0]:
# Download the county ZIP-code page.
# - timeout: without one, requests waits indefinitely on an unresponsive
#   server, which would hang the whole job.
# - raise_for_status(): stop on 4xx/5xx responses instead of silently parsing
#   an error page and writing garbage (or nothing) to the bronze table.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# Parse the returned HTML with Python's built-in parser backend.
soup = BeautifulSoup(page.text, 'html.parser')

In [0]:
# Sanity check on the parsed page: find() with no arguments returns the first
# tag in the document, so None means the soup contains no tags at all.
is_empty = soup.find() is None

# Report the outcome; this cell only prints and does not stop the notebook.
print("Soup is empty" if is_empty else "Soup loaded successfully")

Soup loaded successfully


In [0]:
%skip
# Debug aid: print the raw HTML of the ZIP-code table, located by its exact
# class attribute string, to inspect the page structure.
# NOTE(review): %skip presumably keeps this cell from executing during normal runs - confirm.
print(soup.find('table', class_ = 'table table-striped table-hover table-bordered small border rounded-3 overflow-hidden sortableTbl'))

<table border="0" cellpadding="0" cellspacing="0" class="table table-striped table-hover table-bordered small border rounded-3 overflow-hidden sortableTbl" id="tblZIP" width="100%"><thead><tr><th nowrap="">ZIP Code</th><th><span class="d-none d-md-block">Classification</span><span class="d-block d-md-none d-lg-none">Class</span></th><th>City</th><th axis="num"><span class="d-none d-md-block">Population</span><span class="d-block d-md-none d-lg-none">Pop</span></th><th><span class="d-none d-md-block">% of Population</span><span class="d-block d-md-none d-lg-none">% of Pop</span></th></tr></thead><tbody><tr><td><a href="/zip-code/77001/zip-code-77001.asp" title="ZIP Code 77001, Houston, TX">77001</a><i class="fa-solid fa-box" title="PO BOX ZIP Code"></i></td><td>P.O. Box</td><td><a href="/city/tx-houston.asp" title="Houston, TX">Houston</a></td><td>0</td><td>0.00%</td></tr><tr><td><a href="/zip-code/77002/zip-code-77002.asp" title="ZIP Code 77002, Houston, TX">77002</a></td><td>Standard<

In [0]:
# Locate the ZIP-code table by its id (the captured page HTML shows
# id="tblZIP" on this table). Matching on the full multi-class string is
# brittle: it only succeeds when the class attribute is exactly that string,
# so any reordering or addition of a CSS class on the site would break it.
zip_table = soup.find('table', id='tblZIP')

# Fail here with a clear message instead of an opaque
# "'NoneType' object has no attribute 'find'" in a later cell.
if zip_table is None:
    raise ValueError(
        "ZIP code table (id='tblZIP') was not found in the downloaded page; "
        "the site layout may have changed or the request was blocked."
    )

In [0]:
# Collect every <th> header cell from the table's first <thead> section
# (attribute access on a tag is shorthand for find() on that tag name).
zip_table_titles = zip_table.thead.find_all('th')

In [0]:
%skip
# Debug aid: show the raw <th> elements collected from the table header.
# NOTE(review): %skip presumably keeps this cell from executing during normal runs - confirm.
print(zip_table_titles)

[<th nowrap="">ZIP Code</th>, <th><span class="d-none d-md-block">Classification</span><span class="d-block d-md-none d-lg-none">Class</span></th>, <th>City</th>, <th axis="num"><span class="d-none d-md-block">Population</span><span class="d-block d-md-none d-lg-none">Pop</span></th>, <th><span class="d-none d-md-block">% of Population</span><span class="d-block d-md-none d-lg-none">% of Pop</span></th>]


## BUILD DATA FRAME

In [0]:
# Reduce each header cell to its stripped text. Cells that carry both a long
# and a short label in separate <span>s come out concatenated
# (e.g. 'ClassificationClass', 'PopulationPop').
zip_table_titles_list = [
    header_cell.get_text(strip=True)
    for header_cell in zip_table_titles
]
# print(zip_table_titles_list)

['ZIP Code', 'ClassificationClass', 'City', 'PopulationPop', '% of Population% of Pop']


In [0]:
# Despite the name, this holds the table's data ROWS: every <tr> inside the
# first <tbody> (attribute access on a tag is shorthand for find()).
columns_zip_table = zip_table.tbody.find_all('tr')

In [0]:
# print(columns_zip_table)

[<tr><td><a href="/zip-code/77001/zip-code-77001.asp" title="ZIP Code 77001, Houston, TX">77001</a><i class="fa-solid fa-box" title="PO BOX ZIP Code"></i></td><td>P.O. Box</td><td><a href="/city/tx-houston.asp" title="Houston, TX">Houston</a></td><td>0</td><td>0.00%</td></tr>, <tr><td><a href="/zip-code/77002/zip-code-77002.asp" title="ZIP Code 77002, Houston, TX">77002</a></td><td>Standard</td><td><a href="/city/tx-houston.asp" title="Houston, TX">Houston</a></td><td>19,844</td><td>0.42%</td></tr>, <tr><td><a href="/zip-code/77003/zip-code-77003.asp" title="ZIP Code 77003, Houston, TX">77003</a></td><td>Standard</td><td><a href="/city/tx-houston.asp" title="Houston, TX">Houston</a></td><td>11,874</td><td>0.25%</td></tr>, <tr><td><a href="/zip-code/77004/zip-code-77004.asp" title="ZIP Code 77004, Houston, TX">77004</a></td><td>Standard</td><td><a href="/city/tx-houston.asp" title="Houston, TX">Houston</a></td><td>37,005</td><td>0.78%</td></tr>, <tr><td><a href="/zip-code/77005/zip-code

In [0]:
# Turn each <tr> into a list of stripped cell texts, one inner list per row,
# e.g. ['77002', 'Standard', 'Houston', '19,844', '0.42%'].
# All values stay as strings; no numeric parsing happens at the bronze layer.
rows_table = [
    [cell.get_text(strip=True) for cell in table_row.find_all('td')]
    for table_row in columns_zip_table
]


['77001', 'P.O. Box', 'Houston', '0', '0.00%']
['77002', 'Standard', 'Houston', '19,844', '0.42%']
['77003', 'Standard', 'Houston', '11,874', '0.25%']
['77004', 'Standard', 'Houston', '37,005', '0.78%']
['77005', 'Standard', 'Houston', '28,241', '0.60%']
['77006', 'Standard', 'Houston', '24,129', '0.51%']
['77007', 'Standard', 'Houston', '42,908', '0.91%']
['77008', 'Standard', 'Houston', '36,631', '0.77%']
['77009', 'Standard', 'Houston', '36,425', '0.77%']
['77010', 'Standard', 'Houston', '883', '0.02%']
['77011', 'Standard', 'Houston', '16,841', '0.36%']
['77012', 'Standard', 'Houston', '18,397', '0.39%']
['77013', 'Standard', 'Houston', '16,815', '0.36%']
['77014', 'Standard', 'Houston', '35,161', '0.74%']
['77015', 'Standard', 'Houston', '57,106', '1.21%']
['77016', 'Standard', 'Houston', '29,966', '0.63%']
['77017', 'Standard', 'Houston', '31,659', '0.67%']
['77018', 'Standard', 'Houston', '27,870', '0.59%']
['77019', 'Standard', 'Houston', '24,136', '0.51%']
['77020', 'Standard'

In [0]:
# Build a Spark DataFrame from the scraped rows (column types are inferred,
# all strings here) and assign the bronze column names positionally.
# NOTE(review): 'random1' and 'random2' hold the page's "Population" and
# "% of Population" columns; the names are kept as-is so the table schema
# does not change.
db_zip_table = (
    spark.createDataFrame(rows_table)
    .toDF(
        'ZIP_Code',
        'ClassificationClass',
        'City',
        'random1',
        'random2',
    )
)

In [0]:
# db_zip_table.printSchema()

root
 |-- ZIP_Code: string (nullable = true)
 |-- ClassificationClass: string (nullable = true)
 |-- City: string (nullable = true)
 |-- random1: string (nullable = true)
 |-- random2: string (nullable = true)



## WRITE DELTA TABLE

The table is written in overwrite mode because COPY INTO cannot be used with an in-memory Python DataFrame. Schema drift is a concern; duplicate rows are not, because each run replaces the table's contents.


In [0]:
# Persist the scraped rows as the bronze table, replacing whatever the table
# held from a previous run.
(
    db_zip_table.write
    .mode("overwrite")
    .saveAsTable("bronze.bronze_zip_code")
)
