In [0]:
import requests
import pandas as pd
from pyspark.sql import SparkSession

# Get (or create) the Spark session used by every cell in this notebook
spark = SparkSession.builder.appName("GitHubCSVFetcher").getOrCreate()

# GitHub repository details: owner, repository and the folder that holds the CSV files.
# GITHUB_API_URL is the GitHub "contents" endpoint for that folder.
GITHUB_OWNER = "Rulzyushan"
REPO_NAME = "Data-Engineering-Batch-Processing-Project-01"
FOLDER_PATH = "DE-BP-Project-01-Data"  # folder inside the repository to list
GITHUB_API_URL = f"https://api.github.com/repos/{GITHUB_OWNER}/{REPO_NAME}/contents/{FOLDER_PATH}"


def get_csv_urls_from_github(api_url, timeout=30):
    """List the CSV files found in a GitHub repository folder.

    Sends a GET request to ``api_url`` (a GitHub "contents" endpoint) and keeps
    every returned entry whose name ends with ``.csv`` and that has a
    download URL.

    Args:
        api_url: URL of the GitHub contents endpoint for a folder.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``
            so an unresponsive server cannot hang the notebook.

    Returns:
        A list of ``{"file_name": ..., "csv_url": ...}`` dicts, one per CSV
        file. An empty list is returned, after printing a message, when the
        status code is not 200 or the payload is not a folder listing.
    """
    response = requests.get(api_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch data: {response.status_code}")
        return []

    files = response.json()
    # A folder listing is a JSON array; anything else (e.g. the single object
    # returned when the path points at a file) cannot be iterated as entries.
    if not isinstance(files, list):
        print(f"Failed to fetch data: expected a folder listing, got {type(files).__name__}")
        return []

    return [
        {"file_name": entry["name"], "csv_url": entry["download_url"]}
        for entry in files
        # Skip entries without a download URL; there is nothing to read from them.
        if entry["name"].endswith(".csv") and entry.get("download_url")
    ]

# Collect one {"file_name", "csv_url"} record per CSV file in the configured folder.
# The list is empty when the GitHub request fails or the folder holds no CSV files.
csv_urls = get_csv_urls_from_github(GITHUB_API_URL)

#df = spark.createDataFrame(csv_urls, ["file_name", "csv_url"])

In [0]:
# Inspect the discovered CSV files (file name and download URL)
display(csv_urls)

csv_url,file_name
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.Address.csv,SalesLT.Address.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.Customer.csv,SalesLT.Customer.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.CustomerAddress.csv,SalesLT.CustomerAddress.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.Product.csv,SalesLT.Product.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.ProductCategory.csv,SalesLT.ProductCategory.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.ProductDescription.csv,SalesLT.ProductDescription.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.ProductModel.csv,SalesLT.ProductModel.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.ProductModelProductDescription.csv,SalesLT.ProductModelProductDescription.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.SalesOrderDetail.csv,SalesLT.SalesOrderDetail.csv
https://raw.githubusercontent.com/Rulzyushan/Data-Engineering-Batch-Processing-Project-01/main/DE-BP-Project-01-Data/SalesLT.SalesOrderHeader.csv,SalesLT.SalesOrderHeader.csv


In [0]:
import pandas as pd

def read_dataframe(df1):
    """Load a CSV into a Spark DataFrame by way of pandas.

    Args:
        df1: CSV source accepted by ``pandas.read_csv``; this notebook passes
            a download URL.

    Returns:
        The result of ``spark.createDataFrame`` applied to the parsed pandas
        frame.
    """
    # NOTE(review): the notebook output shows a completely empty column
    # (AddressLine2) arriving as double rather than string - confirm whether
    # an explicit schema is wanted.
    pandas_frame = pd.read_csv(df1)
    return spark.createDataFrame(pandas_frame)

# Spark DataFrames keyed by table name (the file name without its ".csv" suffix)
df_dict = {}

# Load every discovered CSV into its own Spark DataFrame
for csv_info in csv_urls:
    url = csv_info["csv_url"]
    file_n = csv_info["file_name"]
    # Strip only the trailing ".csv"; str.replace would also remove a ".csv"
    # that happens to occur in the middle of the name.
    if file_n.endswith(".csv"):
        file_n = file_n[:-len(".csv")]
    # Read the CSV into a Spark DataFrame
    spark_df = read_dataframe(url)
    # Store the DataFrame in the dictionary with the table name as the key
    df_dict[file_n] = spark_df

display(df_dict)

{'SalesLT.Address': DataFrame[AddressID: bigint, AddressLine1: string, AddressLine2: double, City: string, StateProvince: string, CountryRegion: string, PostalCode: string, rowguid: string, ModifiedDate: string],
 'SalesLT.Customer': DataFrame[CustomerID: bigint, NameStyle: bigint, Title: string, FirstName: string, MiddleName: string, LastName: string, Suffix: string, CompanyName: string, SalesPerson: string, EmailAddress: string, Phone: string, PasswordHash: string, PasswordSalt: string, rowguid: string, ModifiedDate: string],
 'SalesLT.CustomerAddress': DataFrame[CustomerID: bigint, AddressID: bigint, AddressType: string, rowguid: string, ModifiedDate: string],
 'SalesLT.Product': DataFrame[ProductID: bigint, Name: string, ProductNumber: string, Color: string, StandardCost: double, ListPrice: double, Size: string, Weight: double, ProductCategoryID: bigint, ProductModelID: bigint, SellStartDate: string, SellEndDate: string, DiscontinuedDate: double, ThumbNailPhoto: string, ThumbnailPh

In [0]:
# Preview 10 rows of the Address table as loaded, before any transformation
display(df_dict["SalesLT.Address"].limit(10))

AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268AF621-76D7-4C78-9441-144FD139821A,2006-07-01 00:00:00
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981B3303-ACA2-49C7-9A96-FB670785B269,2007-04-01 00:00:00
25,9178 Jumping St.,,Dallas,Texas,United States,75201,C8DF3BD9-48F0-4654-A8DD-14A67A84D3C6,2006-09-01 00:00:00
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12AE5EE1-FC3E-468B-9B92-3B970B169774,2005-09-01 00:00:00
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84A95F62-3AE8-4E7E-BBD5-5A6F00CD982D,2006-08-01 00:00:00
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7BCCF442-2268-46CC-8472-14C44C14E98C,2006-09-01 00:00:00
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410DA4-2778-4B1D-A599-95746625CE6D,2006-08-01 00:00:00
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572F25-9133-4A8B-A065-102FF35416EE,2006-09-01 00:00:00
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801A1DFC-5125-486B-AA84-CCBD2EC57CA4,2005-08-01 00:00:00
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88CEE379-DBB8-433B-B84E-A35E09435500,2006-08-01 00:00:00


Transform date columns to yyyy-MM-dd strings, remove duplicate rows, and drop rows in which every column is null

In [0]:

from pyspark.sql.functions import from_utc_timestamp, date_format
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import col,to_date,datediff,current_date,when

# Build a cleaned copy of every table in df_dict_T: columns whose name contains
# "Date" or "date" are reformatted to "yyyy-MM-dd" strings, duplicate rows are
# removed, and rows in which every column is null are dropped.
table_names = df_dict.keys()
df_dict_T = {}
for t in table_names:
    df_TD = df_dict[t]
    cols = df_TD.columns

    # The loop variable is col_name rather than col so that it does not
    # overwrite the pyspark.sql.functions.col import; with the old name any
    # later col(...) call failed because col had become a plain string.
    for col_name in cols:
        if "Date" in col_name or "date" in col_name:
            # Cast to timestamp, then render only the date part as text.
            df_TD = df_TD.withColumn(
                col_name,
                date_format(
                    from_utc_timestamp(df_TD[col_name].cast(TimestampType()), "UTC"),
                    "yyyy-MM-dd",
                ),
            )
    df_dict_T[t] = df_TD.dropDuplicates().na.drop('all')
    # Writing to Delta is disabled; output_path is not defined in this cell.
    #df_TD.write.format('delta').mode("overwrite").save(output_path)

In [0]:
# Check the reformatted ModifiedDate values (10 rows) in the transformed Address table
display(df_dict_T["SalesLT.Address"].select("ModifiedDate").limit(10))

ModifiedDate
2006-08-01
2006-09-01
2005-08-01
2006-08-01
2006-09-01
2007-04-01
2006-09-01
2006-07-01
2006-08-01
2005-09-01


In [0]:
# Number of Address rows removed by dropDuplicates() / na.drop('all'); 0 means none were dropped
df_dict["SalesLT.Address"].count() - df_dict_T["SalesLT.Address"].count()

Out[68]: 0