### **Creating Schema**
Creating Source and Target Schemas in Spark SQL

In [0]:
# Make sure both schemas exist; IF NOT EXISTS keeps this cell safe to re-run.
source_schema_ddl = "CREATE SCHEMA IF NOT EXISTS workspace.source"
target_schema_ddl = "CREATE SCHEMA IF NOT EXISTS workspace.target"
spark.sql(source_schema_ddl)  # schema that holds the source copy
spark.sql(target_schema_ddl)  # target schema

DataFrame[]

### **Loading and Saving SCD1 Data Table**
Read from `samples.bakehouse.sales_suppliers` and Overwrite `workspace.source.Y1` in the Workspace Source Schema

In [0]:
# Snapshot the sample suppliers table into the workspace source schema.
# mode("overwrite") replaces any existing workspace.source.Y1 table.
Y1 = spark.sql("select * from samples.bakehouse.sales_suppliers")
suppliers_writer = Y1.write.mode("overwrite")
suppliers_writer.saveAsTable("workspace.source.Y1")

### **Read and Display SCD1 Source Table**
Load Data from workspace.source.Y1 and Visualize Contents

In [0]:
# Read the workspace.source.Y1 table into `source` and render it.
source_table_name = 'workspace.source.Y1'
source = spark.read.table(source_table_name)
source.display()

### **Concatenate All Columns into 'ConCatValue'**
Transform Source Data by Merging All Columns into a Single Column

In [0]:
from pyspark.sql import functions as F

# Concatenate every column of `source` into a single 'ConCatValue' column.
#
# 'ConCatValue' itself is excluded from the inputs so that re-running this cell
# does not fold the previous result into the new value (using source.columns
# unfiltered made the cell non-idempotent). On a first run the result is the
# same as before.
#
# NOTE(review): with an empty separator, ("ab", "c") and ("a", "bc") produce the
# same string, and concat_ws skips NULL inputs — confirm this is acceptable if
# the value is later used to detect changed rows.
value_columns = [name for name in source.columns if name != 'ConCatValue']
source = source.withColumn('ConCatValue', F.concat_ws('', *value_columns))
display(source)

### Add Metadata Columns to Source Data
Include IndCurrent, CreatedDate, and ModifiedDate in the Dataset
> These columns are metadata as they describe data, not business values:
IndCurrent: Flags current active record.
CreatedDate: Timestamp when inserted.
ModifiedDate: Timestamp of last update.
  Used for tracking, versioning, and auditing.

In [0]:
# Attach the three metadata columns: IndCurrent is the literal 1, while
# CreatedDate and ModifiedDate are both set to the current timestamp.
created_at = F.current_timestamp()
modified_at = F.current_timestamp()
source = (
    source
    .withColumn("IndCurrent", F.lit(1))
    .withColumn("CreatedDate", created_at)
    .withColumn("ModifiedDate", modified_at)
)
source.display()

###  Sort the source DataFrame by the latitude column in ascending order

In [0]:
# Order the rows by 'latitude', smallest first (orderBy sorts ascending
# unless told otherwise).
latitude_column = F.col("latitude")
source = source.orderBy(latitude_column)

# Render the sorted rows
display(source)

### Revert the source DataFrame to its original state before any transformations

In [0]:
# Discard the transformed DataFrame by reading the sample table again.
original_table_name = 'samples.bakehouse.sales_suppliers'
source = spark.read.table(original_table_name)

# Render the freshly loaded data
display(source)

### Sort the source DataFrame in descending order based on the size column

In [0]:
from pyspark.sql.functions import col

# Order the rows by 'size', highest sort value first.
# NOTE(review): the sizes shown in this notebook's output (S, M, L, XL, XXL)
# look like strings, so this is presumably an alphabetical ordering rather
# than smallest-to-largest — confirm that is what is wanted.
size_descending = col("size").desc()
source = source.orderBy(size_descending)

# Render the sorted rows
display(source)

### Filter
##### Keep only rows where the supplierID is <= 4000010 (remove rows with a larger supplierID)

In [0]:
from pyspark.sql.functions import col

# Keep rows whose supplierID is at most 4000010; rows above that are dropped.
max_supplier_id = 4000010
keep_condition = col("supplierID") <= max_supplier_id
source = source.filter(keep_condition)

# Render the remaining rows
display(source)

supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved
4000006,Hazelnut Haven,hazelnuts,Europe,Istanbul,Kadıköy,XXL,28.9784,41.0082,Y
4000008,Cashew Corner,cashews,Asia,Goa,Anjuna Beach,XL,73.8067,15.3173,Y
4000003,Sugar Cane Harvest,cane sugar,South America,Sao Paulo,Vila Madalena,XL,-46.6333,-23.5489,Y
4000010,Pistachio Palace,pistachios,Asia,Tehran,Tajrish Bazaar,S,51.4215,35.7106,Y
4000005,Pecan Pleasures,pecans,North America,Atlanta,Virginia-Highland,S,-84.3888,33.749,Y
4000001,Coconut Grove,coconut,Asia,Manila,Intramuros,S,121.0221,14.6042,Y
4000004,Vanilla Valley,vanilla,North America,Mexico City,Roma Norte,M,-99.1332,19.4326,Y
4000000,Cacao Wonders,cacao,South America,Guayaquil,Las Peñas,M,-79.8974,-2.1791,Y
4000009,Maple Monarch,maple syrup,North America,Montreal,Plateau Mont-Royal,M,-73.5673,45.5017,Y
4000002,Almond Delights,almonds,Europe,Valencia,Ruzafa,L,-0.3762,39.4699,Y


### To swap the columns ingredient and continent in the DataFrame

In [0]:
# Exchange the positions of 'ingredient' and 'continent' in the column order.
# Each of the two names maps to its partner; every other column stays put.
swap_pair = {'ingredient': 'continent', 'continent': 'ingredient'}

if not set(swap_pair) <= set(source.columns):
    # Both columns must be present, otherwise there is nothing to swap.
    print("Columns 'ingredient' and 'continent' must exist in the DataFrame.")
else:
    # Walk the current order, substituting each of the two names with its partner.
    columns = [swap_pair.get(name, name) for name in source.columns]

    # Re-select using the swapped order and render the result.
    source = source.select(*columns)
    display(source)

### To revert the source DataFrame to its original state
> To revert the source DataFrame to its original state before any transformations, you can reload the table from the original source. 

In [0]:
# Start over from the sample table so earlier transformations no longer apply.
original_table_name = 'samples.bakehouse.sales_suppliers'
source = spark.read.table(original_table_name)
# Render the freshly loaded data
display(source)

### To get the 4th highest latitude city
> - To get the 4th highest latitude city, you can use the orderBy and limit functions along with the dropDuplicates function to ensure unique latitude values

In [0]:
from pyspark.sql.functions import col

# Find the row holding the 4th highest distinct latitude.
#
# De-duplicate first and sort afterwards: dropDuplicates does not promise to
# keep the order of its input, so sorting before it left the following
# limit(4) free to pick arbitrary rows instead of the four highest latitudes.
# When several rows share a latitude, dropDuplicates keeps one of them (which
# one is not specified).
source = source.dropDuplicates(["latitude"]).orderBy(col("latitude").desc())

# Take the four highest latitudes; the lowest of those four is the 4th highest.
# NOTE: with fewer than four distinct latitudes this yields the lowest one available.
fourth_highest_latitude_city = (
    source.limit(4)
    .orderBy(col("latitude").asc())
    .limit(1)
)

# Display the result
display(fourth_highest_latitude_city)

supplierID,name,ingredient,continent,city,district,size,longitude,latitude,approved
4000023,Fennel Fields,fennel seeds,Europe,Florence,Santo Spirito,L,11.2558,43.7695,Y
