In [None]:
# 1. DataFrame Operations
# Question: How would you convert a DataFrame column with JSON strings into multiple columns in PySpark?
# Answer: You can use the from_json() function along with a schema to parse the JSON strings and selectExpr() 
# to extract the fields into new columns.

from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType

# Schema of the JSON payload: each expected key is read as a nullable string.
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ("field1", "field2")]
)

# Parse the raw JSON text into a temporary struct column, promote the struct's
# fields to top-level columns, then discard the temporary struct.
df = (
    df.withColumn("jsonData", from_json(df["jsonColumn"], schema))
    .selectExpr("*", "jsonData.*")
    .drop("jsonData")
)


In [None]:
#2. Handling Nulls
#Question: How would you handle missing values (nulls) in a PySpark DataFrame?
#Answer: PySpark provides several methods to handle null values:
    # Use fillna() to replace nulls with a specific value.
    # Use dropna() to remove rows with null values.
    # Pass a dict to fillna() to use a different replacement value for each column.

# Replacement value to use for nulls, keyed by column name.
null_replacements = {'column1': 0, 'column2': 'unknown'}
# Rows that are still null in any of these columns are discarded.
required_columns = ['column3']

df = df.fillna(null_replacements).dropna(subset=required_columns)

In [None]:
#3. Joins in PySpark
# Question: How would you perform a join between two DataFrames on multiple columns in PySpark?
# Answer: Use the join() function and specify the join condition either as a list of column names or as a column expression.

from pyspark.sql import SparkSession

# Reuse the active session if one exists (e.g. in a managed notebook), otherwise start one.
spark = SparkSession.builder.getOrCreate()

# Small example frames sharing the composite key (col1, col2). The original
# placeholders were None, which made df1.join(...) raise AttributeError.
df1 = spark.createDataFrame(
    [(1, "a", 10), (2, "b", 20)],
    ["col1", "col2", "left_value"],
)
df2 = spark.createDataFrame(
    [(1, "a", "x"), (3, "c", "y")],
    ["col1", "col2", "right_value"],
)

# Both key columns must match for a row pair to be kept. Because the condition
# is a column expression, the result carries col1/col2 from both frames;
# passing on=['col1', 'col2'] instead would keep a single copy of each key.
df_joined = df1.join(
    df2,
    (df1['col1'] == df2['col1']) & (df1['col2'] == df2['col2']),
    'inner'
)

In [None]:
# 4. UDFs (User Defined Functions)
# Question: How would you create and use a UDF in PySpark to apply a custom function on a DataFrame column?
# Answer: First, define the UDF using the udf decorator or function, and then apply it to the DataFrame column.

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def custom_function(value):
    """Return ``value`` upper-cased, passing nulls through unchanged.

    Spark passes NULL cells to a Python UDF as ``None``; calling ``.upper()``
    on ``None`` raises ``AttributeError`` and fails the whole job, so nulls
    are returned as-is.

    Args:
        value: The string cell value, or ``None`` for a null cell.

    Returns:
        The upper-cased string, or ``None`` when ``value`` is ``None``.
    """
    if value is None:
        return None
    return value.upper()

# Wrap the plain Python function as a Spark UDF that yields string values.
udf_custom_function = udf(custom_function, returnType=StringType())

# Apply the UDF to the source column and store the result under a new name.
source_column = df['existing_column']
df = df.withColumn('new_column', udf_custom_function(source_column))


In [None]:
# 5. Window Functions
# Question: Can you explain how to use window functions in PySpark to calculate a moving average?
# Answer: Define a window specification with Window (partitioning, ordering, and a row frame), then apply an aggregate function such as avg() over that window.

from pyspark.sql.window import Window
from pyspark.sql.functions import avg

# Rows are grouped by partition_col and sorted by order_col within each group.
ordered_partition = Window.partitionBy("partition_col").orderBy("order_col")

# Frame = the two preceding rows plus the current row (a 3-row trailing window).
window_spec = ordered_partition.rowsBetween(-2, Window.currentRow)

moving_average = avg("value_col").over(window_spec)
df = df.withColumn("moving_avg", moving_average)


In [None]:
## 6. Optimization Techniques
'''
Question: What are some best practices for optimizing PySpark code?
 - Use select() to project only necessary columns.
 - Cache DataFrames when they are reused multiple times using cache() or persist().
 - Avoid using collect() on large datasets.
 - Use broadcast() for small DataFrames when joining with a large DataFrame.
 - Consider using partitioning and bucketing for large datasets.
'''