In [1]:
# 
'''
  1. read the csv file 
  2. perform the transformation
  3. do the upsert 
  4. finally insert into rds table

'''

'\n  1. read the csv file \n  2. perform the transformation\n  3. do the upsert \n  4. finally insert into rds table\n\n'

In [None]:
#Here's a step-by-step guide to performing these tasks using PySpark:

### **1. Read the CSV File**
### First, read a CSV file into a PySpark DataFrame.

from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook, registered under the app name "ReadCSV".
spark = (
    SparkSession.builder
    .appName("ReadCSV")
    .getOrCreate()
)

# Load the CSV into a DataFrame: the first row supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("path/to/your/file.csv")
)


### **2. Perform the Transformation**
#Next, perform any necessary transformations on the DataFrame. For example, you might want to filter out rows 
#with null values in certain columns and add a new column.
from pyspark.sql.functions import col, lit

# Keep only the rows whose 'important_column' actually holds a value
has_important_value = col("important_column").isNotNull()
df_filtered = df.where(has_important_value)

# Attach 'new_column', carrying the same literal value on every row
constant_value = lit("some_value")
df_transformed = df_filtered.withColumn("new_column", constant_value)


#**Explanation:**
#- `filter(col("important_column").isNotNull())` filters out rows where `important_column` is null.
#- `withColumn("new_column", lit("some_value"))` adds a new column named `new_column` with a constant value.

### **3. Perform the Upsert Operation**
#To perform an upsert (insert or update) operation, you'll need to match records in your DataFrame against records in the target table in RDS. 
# Let's assume we have an `id` column that can be used to match records.

# Load the current contents of the target RDS table so the incoming rows can
# be matched against it.
# NOTE: the endpoint and credentials below are placeholders. Supply real values
# from environment variables or a secrets manager rather than committing them
# to the notebook.
df_target = spark.read.jdbc(url="jdbc:mysql://your-rds-endpoint/db_name",
                            table="target_table",
                            properties={"user": "your_username", "password": "your_password"})

# Full outer join on an explicit, alias-qualified condition so that BOTH
# source.id and target.id survive the join. Joining with on=["id"] collapses
# the two key columns into a single unqualified `id`, which Spark releases
# before 3.2 cannot resolve as source.id / target.id in the projection below.
# - rows only in the source  -> inserts (target.* is null)
# - rows in both             -> updates
# - rows only in the target  -> carried over unchanged (source.* is null)
df_upsert = df_transformed.alias("source").join(
    df_target.alias("target"),
    on=col("source.id") == col("target.id"),
    how="outer"
)

# Build the merged rows: take the source value when it is non-null, otherwise
# keep the existing target value. A null in the source therefore never
# overwrites a value already present in the target.
df_upsert_final = df_upsert.selectExpr(
    "coalesce(source.id, target.id) as id",
    "coalesce(source.column1, target.column1) as column1",
    "coalesce(source.column2, target.column2) as column2",
    # Include other columns as needed
)

#**Explanation:**
#- The `join` operation merges the source DataFrame with the target DataFrame from RDS.
#- `coalesce` resolves conflicts by taking the source value whenever it is non-null and falling back to the target value otherwise, so a null in the source never overwrites an existing target value.


### **4. Insert into RDS Table**
#Finally, insert the upserted DataFrame back into the RDS table.

# Write the final DataFrame back to the RDS table.
#
# df_upsert_final is lazily derived from a JDBC read of target_table. With
# mode="overwrite" Spark drops target_table *before* it executes the plan that
# reads from it, so rows that exist only in the target would be read back as
# empty and silently lost. Materialize the merged result first so the write no
# longer depends on the table it is about to replace.
# NOTE: a local checkpoint lives on the executors and is not fault tolerant;
# for production loads prefer writing to a staging table and swapping/merging
# inside the database.
df_upsert_materialized = df_upsert_final.localCheckpoint(eager=True)

df_upsert_materialized.write.jdbc(
    url="jdbc:mysql://your-rds-endpoint/db_name",
    table="target_table",
    # 'overwrite' is required here: the DataFrame already contains every
    # existing target row, so 'append' would insert those rows a second time.
    # By default overwrite drops and recreates the table (losing keys/indexes);
    # the JDBC writer option truncate=true keeps the existing table definition.
    mode="overwrite",
    properties={"user": "your_username", "password": "your_password"}
)

#**Explanation:**
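#- `mode="overwrite"` replaces the existing table with the new data. Because `df_upsert_final` already contains every existing target row (it comes from a full outer join with the target), `mode="append"` would insert those rows a second time or fail on a primary key; use `append` only when the DataFrame holds new rows exclusively.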
#- The `jdbc` method is used to write the DataFrame to an RDS table.


In [None]:
# dataframe vs dynamic frame 