In [3]:
import requests
import pandas as pd
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook via the builder's getOrCreate(),
# registered under the application name "GitHubCSVFetcher".
spark = SparkSession.builder.appName("GitHubCSVFetcher").getOrCreate()

# GitHub repository details: owner, repository, and the folder inside the
# repository whose contents are listed.
GITHUB_OWNER = "Rulzyushan"
REPO_NAME = "Data-Engineering-Batch-Processing-Project-01"
FOLDER_PATH = "DE-BP-Project-01-Data"  # path of the folder relative to the repository root
# GitHub "contents" API endpoint for that folder.
GITHUB_API_URL = f"https://api.github.com/repos/{GITHUB_OWNER}/{REPO_NAME}/contents/{FOLDER_PATH}"

def get_csv_urls_from_github(api_url, timeout=30):
    """List the CSV files exposed by a GitHub "contents" API URL.

    Sends a GET request to ``api_url`` and collects the name and download URL
    of every returned entry whose name ends with ``.csv``.

    Args:
        api_url: GitHub contents-API URL to query.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``{"file_name": ..., "csv_url": ...}`` dicts. The list is
        empty when the request fails, when the status code is not 200, or
        when no matching entries are found; in the two failure cases a
        message is printed first.
    """
    try:
        # Without an explicit timeout the call can block indefinitely on an
        # unresponsive server.
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport errors like HTTP errors: report and return nothing.
        print(f"Failed to fetch data: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    files = response.json()
    # A URL that points at a single file yields one object rather than a
    # list; normalise so the filtering below handles both shapes.
    if isinstance(files, dict):
        files = [files]

    # Keep only CSV entries that actually carry a download URL; entries
    # without one cannot be read by the caller.
    return [
        {"file_name": entry["name"], "csv_url": entry["download_url"]}
        for entry in files
        if (entry.get("name") or "").endswith(".csv") and entry.get("download_url")
    ]

# Fetch the name and download URL of every CSV file in the GitHub folder.
# The result is an empty list if the request failed.
csv_urls = get_csv_urls_from_github(GITHUB_API_URL)


In [4]:
# Show the fetched list of {"file_name", "csv_url"} entries for inspection.
display(csv_urls)

In [10]:
import pandas as pd

def read_dataframe(url, **read_csv_kwargs):
    """Load one CSV into a pandas DataFrame.

    Args:
        url: Location of the CSV, passed straight to ``pandas.read_csv``.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv``. For example, ``index_col=0`` uses the first
            column as the index instead of loading it as a data column.
            With no extra arguments the call is identical to
            ``pd.read_csv(url)``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(url, **read_csv_kwargs)

# Dictionary of pandas DataFrames, keyed by file name without the ".csv" extension
df_dict = {}

# Loop through each {"file_name", "csv_url"} entry
for csv_info in csv_urls:
    url = csv_info["csv_url"]
    file_n = csv_info["file_name"]
    # Strip only the trailing extension; str.replace(".csv", "") would also
    # remove any ".csv" that appears in the middle of the name.
    if file_n.endswith(".csv"):
        file_n = file_n[:-len(".csv")]
    # Read the CSV into a pandas DataFrame
    pandas_df = read_dataframe(url)
    # Store the DataFrame in the dictionary under the extension-less name
    df_dict[file_n] = pandas_df

In [15]:
#display(df_dict)
df_dict["SalesLT.Address"].head()

Unnamed: 0,AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
0,9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268AF621-76D7-4C78-9441-144FD139821A,2006-07-01 00:00:00
1,11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981B3303-ACA2-49C7-9A96-FB670785B269,2007-04-01 00:00:00
2,25,9178 Jumping St.,,Dallas,Texas,United States,75201,C8DF3BD9-48F0-4654-A8DD-14A67A84D3C6,2006-09-01 00:00:00
3,28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12AE5EE1-FC3E-468B-9B92-3B970B169774,2005-09-01 00:00:00
4,32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84A95F62-3AE8-4E7E-BBD5-5A6F00CD982D,2006-08-01 00:00:00


In [33]:
import os
import shutil

table_names = df_dict.keys()

for t in table_names:
    # DataFrame loaded for this table
    df = df_dict[t]

    # Output path for this table. Despite the "delta" in the name,
    # DataFrame.to_parquet writes a plain Parquet file here, not a Delta table.
    output_path = f'/lakehouse/default/Files/{t}_W01_data_delta_bronze'

    # Remove any previous output before writing. os.remove cannot delete a
    # directory, so a directory left at this path is removed with
    # shutil.rmtree instead of crashing the loop.
    if os.path.isdir(output_path) and not os.path.islink(output_path):
        shutil.rmtree(output_path)
        print(f"File '{output_path}' has been deleted.")
    elif os.path.exists(output_path):
        os.remove(output_path)
        print(f"File '{output_path}' has been deleted.")
    else:
        print(f"File '{output_path}' does not exist.")

    # Write the DataFrame to a Parquet file using the pyarrow engine
    df.to_parquet(output_path, engine='pyarrow')


File '/lakehouse/default/Files/SalesLT.Address_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.Customer_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.CustomerAddress_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.Product_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.ProductCategory_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.ProductDescription_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.ProductModel_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.ProductModelProductDescription_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.SalesOrderDetail_W01_data_delta_bronze' has been deleted.
File '/lakehouse/default/Files/SalesLT.SalesOrderHeader_W01_data_delta_bronze' has been deleted.
