# Bronze - ZIP Codes Ingestion
# Purpose: Ingest raw zip codes data into Bronze layer
# Source: webpage
# Output: bronze.bronze_zip_code (Delta table)

## CONFIG/PARAMETERS

In [0]:
from pyspark.sql import types as T
from pyspark.sql import functions as F
from bs4 import BeautifulSoup
import requests

In [0]:
%sql
-- Make harris_county_catalog the session's current catalog, so the two-part
-- table name used by the final write (bronze.<table>) resolves inside it.
USE CATALOG harris_county_catalog

In [0]:
# Page that lists the ZIP codes of Harris County, TX; this is the only source
# the notebook reads.
url = 'https://www.zip-codes.com/county/tx-harris.asp'

# HTTP headers sent with the request. The User-Agent mimics a desktop Chrome
# browser -- presumably because the site rejects default client agents; confirm.
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}

## READ SOURCE

In [0]:
# Download the source page. The explicit timeout keeps the job from hanging
# forever if the site stops responding (requests applies no timeout by default).
page = requests.get(url, headers=headers, timeout=30)

# Abort on 4xx/5xx responses instead of parsing an error page as if it were
# the ZIP-code listing.
page.raise_for_status()

# Parse the returned HTML with Python's built-in parser.
soup = BeautifulSoup(page.text, 'html.parser')

In [0]:
# Sanity check: a parsed document that contains no tag at all has no first
# element, so find() with no arguments returns None.
is_empty = soup.find() is None

# Report the outcome; this cell only prints and never stops the run.
print("Soup is empty" if is_empty else "Soup loaded successfully")

In [0]:
%skip
# Debug aid: dump the raw HTML of the ZIP-code table to inspect its structure.
# NOTE(review): the cell starts with the %skip magic, so it is presumably not
# executed in normal runs -- confirm.
print(soup.find('table', class_ = 'table table-striped table-hover table-bordered small border rounded-3 overflow-hidden sortableTbl'))

In [0]:
# Locate the ZIP-code listing. A multi-class string passed to class_ is matched
# against the element's full class attribute value, so any change to the class
# list (or its order) on the site makes this lookup return None.
zip_table = soup.find(
    'table',
    class_='table table-striped table-hover table-bordered small border rounded-3 overflow-hidden sortableTbl',
)

# Fail here with a descriptive message rather than with an AttributeError on
# None in a later cell.
if zip_table is None:
    raise ValueError(
        "ZIP code table not found in the downloaded page; "
        "the page layout or the table's CSS classes may have changed"
    )

In [0]:
# Collect every header cell (<th>) found inside the table's <thead> section.
zip_table_titles = (
    zip_table
    .find('thead')
    .find_all('th')
)

In [0]:
%skip
# Debug aid: show the raw <th> elements collected from the table header.
# NOTE(review): the cell starts with the %skip magic, so it is presumably not
# executed in normal runs -- confirm.
print(zip_table_titles)

## BUILD DATA FRAME

In [0]:
# Plain-text label of each header cell, with surrounding whitespace stripped.
# NOTE(review): these labels are not referenced again in the visible cells; the
# DataFrame columns are named by hand further down -- confirm that is intended.
zip_table_titles_list = [th.get_text(strip=True) for th in zip_table_titles]

In [0]:
# Despite the variable name, these are the table's body ROWS: every <tr>
# element found inside <tbody>.
columns_zip_table = (
    zip_table
    .find('tbody')
    .find_all('tr')
)

In [0]:
# print(columns_zip_table)

In [0]:
# Turn every body row into a list holding the stripped text of its <td> cells.
rows_table = []
for row in columns_zip_table:
    row_data = row.find_all('td')
    individual_row_data = [data.get_text(strip=True) for data in row_data]
    # Skip rows that carry no <td> cells (e.g. spacer or header-only rows): an
    # empty list cannot be mapped onto the fixed set of columns assigned when
    # the DataFrame is built.
    if individual_row_data:
        rows_table.append(individual_row_data)


In [0]:
# Build a Spark DataFrame from the scraped rows (column types are inferred from
# the data) and assign the Bronze column names by position. The names are
# hard-coded here rather than taken from the scraped header labels.
db_zip_table = (
    spark.createDataFrame(rows_table)
    .toDF(
        'ZIP_Code',
        'ClassificationClass',
        'City',
        'random1',
        'random2',
    )
)

In [0]:
# db_zip_table.printSchema()

## WRITE DELTA TABLE

We write the table in overwrite mode because COPY INTO cannot load data from a Python DataFrame. Schema drift is a concern; duplicate rows are not, since each run replaces the table's contents.


In [0]:
# Persist the scraped rows as the Bronze table. "overwrite" replaces the
# table's previous contents on every run, so reruns do not accumulate
# duplicates. The two-part name is resolved against the session's current
# catalog (set by the USE CATALOG cell near the top of the notebook).
(
    db_zip_table.write
    .mode("overwrite")
    .saveAsTable("bronze.bronze_zip_code")
)
