# **PYSPARK INTERVIEW QUESTIONS - ANSH LAMBA**

In [5]:
from pyspark.sql.functions import * 
from pyspark.sql.types import *
from pyspark.sql import SparkSession


spark = SparkSession.builder \
    .appName("Test PySpark Installation") \
    .getOrCreate()

**Q1 While ingesting customer data from an external source, you notice duplicate entries. How would you remove duplicates and retain only the latest entry based on a timestamp column?**

In [15]:
data = [("101", "2023-12-01", 100), ("101", "2023-12-02", 150), 
        ("102", "2023-12-01", 200), ("102", "2023-12-02", 250)]
columns = ["product_id", "date", "sales"]

df = spark.createDataFrame(data, columns)
df.show()

+----------+----------+-----+
|product_id|      date|sales|
+----------+----------+-----+
|       101|2023-12-01|  100|
|       101|2023-12-02|  150|
|       102|2023-12-01|  200|
|       102|2023-12-02|  250|
+----------+----------+-----+



**Solution**

In [16]:
df=df.withColumn('date',col('date').cast(DateType()))
#df=df.withColumn('date',to_date(col('date')))
df=df.orderBy('product_id','date',ascending=[1,0]).dropDuplicates(subset=['product_id'])
df.show()

+----------+----------+-----+
|product_id|      date|sales|
+----------+----------+-----+
|       101|2023-12-02|  150|
|       102|2023-12-02|  250|
+----------+----------+-----+



It keeps only the first occurrence of each unique value in the specified subset column(s).

It removes all other rows that have the same value(s) in those columns.

product_id,date,sales
101,2023-12-02,150
102,2023-12-02,250


**2. While processing data from multiple files with inconsistent schemas, you need to merge them into a single DataFrame. How would you handle this inconsistency in PySpark?**

**Solution**

In [0]:
df=spark.read.format('parquet')\
                .option('mergeSchema',True)\
                .load('Path to file')

#only works for Parquet
#ORC (some extent)

For CSV and JSON Files, handle manually

In [None]:
df1 = spark.read.option("header", True).csv("file1.csv")
df2 = spark.read.option("header", True).csv("file2.csv")

df1 = spark.read.option("multiline", True).json("file1.json")
df2 = spark.read.option("multiline", True).json("file2.json")

all_columns = list(set(df1.columns).union(set(df2.columns)))

from pyspark.sql.functions import lit

def align_columns(df, all_cols):
    for col_name in all_cols:
        if col_name not in df.columns:
            df = df.withColumn(col_name, lit(None))  # Add missing col with null
    return df.select(sorted(all_cols))  # Keep consistent column order

df1_aligned = align_columns(df1, all_columns)
df2_aligned = align_columns(df2, all_columns)


merged_df = df1_aligned.unionByName(df2_aligned)


**3. What are the key differences between Spark and Hadoop MatpReduce in terms of performance and scalability?**

Hadoop MapReduce is a framework to process data paralelly

Hadoop does the computation by Map and Reduce
Map is used to distribute data among nodes and this data is combined at Reduce stage, the issue is the map is writing the intermediate data to disk, then the reduces will read data from disk, so it impact the required resources and time and performance.

as spark being work with in-memory it is much better in terms of performance.

"In summary, Spark is preferred for speed and flexibility, especially when in-memory processing is feasible, while MapReduce is more suited to traditional, disk-based batch jobs in resource-constrained environments."

and hadoop is mostly created for Batch processing, whereas the spark is created for both batch and streaming processing.

**4. You are working with a real-time data pipeline, and you notice missing values in your streaming data Column - Category. How would you handle null or missing values in such a scenario?**

**df_stream = spark.readStream.schema("id INT, value STRING").csv("path/to/stream")**

In [None]:
df=df.fillna({'Product_id':'N/A'})

**5. You need to calculate the total number of actions performed by users in a system. How would you calculate the top 5 most active users based on this information?**

In [32]:
data = [("user1", 5), ("user2", 8), ("user3", 2), ("user4", 10), ("user2", 3)]
columns = ["user_id", "actions"]

df = spark.createDataFrame(data, columns)
df.show()

+-------+-------+
|user_id|actions|
+-------+-------+
|  user1|      5|
|  user2|      8|
|  user3|      2|
|  user4|     10|
|  user2|      3|
+-------+-------+



In [33]:
df=df.groupBy('user_id').agg(sum('actions').alias('total_actions')).orderBy('total_actions',ascending=False).limit(5)
df.show()

+-------+-------------+
|user_id|total_actions|
+-------+-------------+
|  user2|           11|
|  user4|           10|
|  user1|            5|
|  user3|            2|
+-------+-------------+



**6. While processing sales transaction data, you need to identify the most recent transaction for each customer. How would you approach this task?**

In [45]:
data = [("cust1", "2023-12-01", 100), ("cust2", "2023-12-02", 150),
        ("cust1", "2023-12-03", 200), ("cust2", "2023-12-04", 250)]
columns = ["customer_id", "transaction_date", "sales"]
df = spark.createDataFrame(data, columns)
df.show()

+-----------+----------------+-----+
|customer_id|transaction_date|sales|
+-----------+----------------+-----+
|      cust1|      2023-12-01|  100|
|      cust2|      2023-12-02|  150|
|      cust1|      2023-12-03|  200|
|      cust2|      2023-12-04|  250|
+-----------+----------------+-----+



In [48]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col

df=df.withColumn('transaction_date',col('transaction_date').cast(DateType()))

df=df.withColumn('flag',dense_rank().over(Window.partitionBy('customer_id').orderBy(col('transaction_date').desc()))).filter(col('flag') == 1)

df.show()



+-----------+----------------+-----+----+
|customer_id|transaction_date|sales|flag|
+-----------+----------------+-----+----+
|      cust1|      2023-12-03|  200|   1|
|      cust2|      2023-12-04|  250|   1|
+-----------+----------------+-----+----+



**7. You need to identify customers who haven’t made any purchases in the last 30 days. How would you filter such customers?**

In [55]:
data = [("cust1", "2025-12-01"), ("cust2", "2024-11-20"), ("cust3", "2024-11-25")]
columns = ["customer_id", "last_purchase_date"]

df = spark.createDataFrame(data, columns)

df.show()

+-----------+------------------+
|customer_id|last_purchase_date|
+-----------+------------------+
|      cust1|        2025-12-01|
|      cust2|        2024-11-20|
|      cust3|        2024-11-25|
+-----------+------------------+



In [57]:
df=df.withColumn('last_purchase_date',to_date('last_purchase_date'))
df.show()
df=df.withColumn('date_diff',date_diff(current_date(),'last_purchase_date')).filter(col('date_diff')>30)
df.show()

+-----------+------------------+---------+
|customer_id|last_purchase_date|date_diff|
+-----------+------------------+---------+
|      cust1|        2025-12-01|     -233|
|      cust2|        2024-11-20|      143|
|      cust3|        2024-11-25|      138|
+-----------+------------------+---------+

+-----------+------------------+---------+
|customer_id|last_purchase_date|date_diff|
+-----------+------------------+---------+
|      cust2|        2024-11-20|      143|
|      cust3|        2024-11-25|      138|
+-----------+------------------+---------+



**8. While analyzing customer reviews, you need to identify the most frequently used words in the feedback. How would you implement this?**

In [64]:
data = [("customer1", "The product is great"), ("customer2", "Great product, fast delivery"), ("customer3", "Not bad, could be better")]
columns = ["customer_id", "feedback"]

df = spark.createDataFrame(data, columns)

df.show(truncate=False)

+-----------+----------------------------+
|customer_id|feedback                    |
+-----------+----------------------------+
|customer1  |The product is great        |
|customer2  |Great product, fast delivery|
|customer3  |Not bad, could be better    |
+-----------+----------------------------+



In [66]:
df=df.withColumn('feedback',lower('feedback')).withColumn('feedback',explode(split('feedback',' ')))
df_grp=df.groupBy('feedback').agg(count('feedback').alias('wordcount'))
df_grp.show(truncate=False)

+--------+---------+
|feedback|wordcount|
+--------+---------+
|great   |2        |
|is      |1        |
|the     |1        |
|product |1        |
|fast    |1        |
|delivery|1        |
|product,|1        |
|could   |1        |
|not     |1        |
|be      |1        |
|bad,    |1        |
|better  |1        |
+--------+---------+



**9. You need to calculate the cumulative sum of sales over time for each product. How would you approach this?**

In [69]:
data = [("product1", "2023-12-01", 100), ("product2", "2023-12-02", 200),
        ("product1", "2023-12-03", 150), ("product2", "2023-12-04", 250)]
columns = ["product_id", "date", "sales"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+----------+-----+
|product_id|      date|sales|
+----------+----------+-----+
|  product1|2023-12-01|  100|
|  product2|2023-12-02|  200|
|  product1|2023-12-03|  150|
|  product2|2023-12-04|  250|
+----------+----------+-----+



In [70]:
df=df.withColumn('date',to_date('date'))
df=df.withColumn('CumSum',sum('sales').over(Window.partitionBy('product_id').orderBy('date')))

df.show()

+----------+----------+-----+------+
|product_id|      date|sales|CumSum|
+----------+----------+-----+------+
|  product1|2023-12-01|  100|   100|
|  product1|2023-12-03|  150|   250|
|  product2|2023-12-02|  200|   200|
|  product2|2023-12-04|  250|   450|
+----------+----------+-----+------+



**10. While preparing a data pipeline, you notice some duplicate rows in a dataset. How would you remove the duplicates without affecting the original order?**

In [None]:
data = [("John", 25), ("Jane", 30), ("John", 25), ("Alice", 22)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)
df.show()

+-----+---+
| name|age|
+-----+---+
| John| 25|
| Jane| 30|
| John| 25|
|Alice| 22|
+-----+---+

+-----+---+
| name|age|
+-----+---+
| John| 25|
| Jane| 30|
|Alice| 22|
+-----+---+



In [None]:
df=df.withColumn('rowflag',row_number().over(Window.partitionBy('name').orderBy('age'))).filter(col('rowflag')==1)
df.show()

#df.distinct.show()

+-----+---+-------+
| name|age|rowflag|
+-----+---+-------+
|Alice| 22|      1|
| Jane| 30|      1|
| John| 25|      1|
+-----+---+-------+



**11. You are working with user activity data and need to calculate the average session duration per user. How would you implement this?**

In [80]:
data = [("user1", "2023-12-01", 50), ("user1", "2023-12-02", 60), 
        ("user2", "2023-12-01", 45), ("user2", "2023-12-03", 75)]
columns = ["user_id", "session_date", "duration"]
df = spark.createDataFrame(data, columns)

df.show()

+-------+------------+--------+
|user_id|session_date|duration|
+-------+------------+--------+
|  user1|  2023-12-01|      50|
|  user1|  2023-12-02|      60|
|  user2|  2023-12-01|      45|
|  user2|  2023-12-03|      75|
+-------+------------+--------+



In [81]:
df=df.groupBy('user_id').agg(avg('duration').alias('avg_dur'))
df.show()

+-------+-------+
|user_id|avg_dur|
+-------+-------+
|  user1|   55.0|
|  user2|   60.0|
+-------+-------+



**12. While analyzing sales data, you need to find the product with the highest sales for each month. How would you accomplish this?**

In [91]:
data = [("product1", "2023-12-01", 100), ("product2", "2023-12-01", 150), 
        ("product1", "2023-12-02", 200), ("product2", "2023-12-02", 250)]
columns = ["product_id", "date", "sales"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+----------+-----+
|product_id|      date|sales|
+----------+----------+-----+
|  product1|2023-12-01|  100|
|  product2|2023-12-01|  150|
|  product1|2023-12-02|  200|
|  product2|2023-12-02|  250|
+----------+----------+-----+



In [92]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col
df=df.withColumn('date',to_date('date'))

df=df.withColumn('date',month('date')).groupBy('date','product_id').agg(sum('sales').alias('sales'))
df=df.withColumn('ranking',dense_rank().over(Window.partitionBy('date').orderBy(col('sales').desc()))).filter(col('ranking')==1)
df.show()

+----+----------+-----+-------+
|date|product_id|sales|ranking|
+----+----------+-----+-------+
|  12|  product2|  400|      1|
+----+----------+-----+-------+



What is the SparkContext in PySpark?\
Explain Spark Architecture?\
What is difference between RDD,Dataframe and Dataset?\
What is QUERY Optimization?\
Tell about SPARK SESSION?\
Difference between Wide Transformatons and Narrow Transformations?\
What is the use of COALESCE() and Repartition()?\
When to use CACHE() and PERSIST()? What's the difference?\
What is the importance of PARTITIONS in pyspark?\
What is MPP?\
What are broadcast variables, and why are they used?\
What is difference between df.show() and df.collect()\
What is LAZY EVALUATION in Pyspark?\
What are the Advantages of Delta Lake over traditional file formats?\
What happens when a Pyspark job runs of out memory?(DOOM - Drivr out of memory or Executor Out of Memory)\
What is Skewed?\
What is AQE(Adaptive Query Executions) in PySpark? Why is it useful?\
What is salting?\
How would you handle skewed data in pyspark?\
What is broadcast join, and when you use it?\
What is spill in spark, why does it happen?\
What are Delta lake's time travel features, and how do they work?\









**13. You are working with a large Delta table that is frequently updated by multiple users. The data is stored in partitions, and sometimes updates can cause inconsistent reads due to concurrent transactions. How would you ensure ACID compliance and avoid data corruption in PySpark?**

In [0]:
#New Data

df=spark.read.format('parquet').load('path')

frome delta.table import DeltaTable

delta_tbl=DeltaTable.forPath('path')

delta_tbl.alias('trg').merge(df.alias('src'),"src.id==trg.id")\
                    .whenNotMatchedInsertAll() \
                    .whenMatchedUpdateAll() \
                    .execute()

**14. You need to process a large dataset stored in PARQUET format and ensure that all columns have the right schema (Almost). How would you do this?**

In [0]:
df=spark.read.format('parquet')\
            .option('inferSchema',True)\
            .load('path')

**15. You are reading a CSV file and need to handle corrupt records gracefully by skipping them. How would you configure this in PySpark?**

In [0]:
df=spark.read.format('csv')\
                .option("mode",'DROPMALFORMED')\
                .load('staging location')

**22. You have a dataset containing the names of employees and their departments. You need to find the department with the most employees.**

In [3]:
data = [("Alice", "HR"), ("Bob", "Finance"), ("Charlie", "HR"), ("David", "Engineering"), ("Eve", "Finance")]
columns = ["employee_name", "department"]

df = spark.createDataFrame(data, columns)
df.show()

+-------------+-----------+
|employee_name| department|
+-------------+-----------+
|        Alice|         HR|
|          Bob|    Finance|
|      Charlie|         HR|
|        David|Engineering|
|          Eve|    Finance|
+-------------+-----------+



In [5]:
df = df.groupBy('department').agg(count('employee_name').alias('total_employees')).sort('total_employees',ascending=False)
df.show()

+-----------+---------------+
| department|total_employees|
+-----------+---------------+
|         HR|              2|
|    Finance|              2|
|Engineering|              1|
+-----------+---------------+



**23. While processing sales data, you need to classify each transaction as either 'High' or 'Low' based on its amount. How would you achieve this using a when condition**

In [7]:
data = [("product1", 100), ("product2", 300), ("product3", 50)]
columns = ["product_id", "sales"]

df = spark.createDataFrame(data, columns)
df.show()

+----------+-----+
|product_id|sales|
+----------+-----+
|  product1|  100|
|  product2|  300|
|  product3|   50|
+----------+-----+



In [9]:
df=df.withColumn('price_cat',when(col('sales')>50,"high").otherwise("low"))
df.show()

+----------+-----+---------+
|product_id|sales|price_cat|
+----------+-----+---------+
|  product1|  100|     high|
|  product2|  300|     high|
|  product3|   50|      low|
+----------+-----+---------+



**24. While analyzing a large dataset, you need to create a new column that holds a timestamp of when the record was processed. How would you implement this and what can be the best USE CASE?**

In [3]:
data = [("product1", 100), ("product2", 200), ("product3", 300)]
columns = ["product_id", "sales"]

df = spark.createDataFrame(data, columns)
df.show()

+----------+-----+
|product_id|sales|
+----------+-----+
|  product1|  100|
|  product2|  200|
|  product3|  300|
+----------+-----+



In [5]:
df=df.withColumn("processed_time",current_timestamp())
df.show(truncate=False)

+----------+-----+--------------------------+
|product_id|sales|processed_time            |
+----------+-----+--------------------------+
|product1  |100  |2025-04-16 22:26:57.570872|
|product2  |200  |2025-04-16 22:26:57.570872|
|product3  |300  |2025-04-16 22:26:57.570872|
+----------+-----+--------------------------+



**25. You need to register this PySpark DataFrame as a temporary SQL object and run a query on it. How would you achieve this?**

In [4]:
data = [("product1", 100), ("product2", 200), ("product3", 300)]
columns = ["product_id", "sales"]

df = spark.createDataFrame(data, columns)
df.show()

+----------+-----+
|product_id|sales|
+----------+-----+
|  product1|  100|
|  product2|  200|
|  product3|  300|
+----------+-----+



In [7]:
df.createOrReplaceTempView('tempsqldf')

In [8]:
spark.sql("select * from tempsqldf").show()

+----------+-----+
|product_id|sales|
+----------+-----+
|  product1|  100|
|  product2|  200|
|  product3|  300|
+----------+-----+



**26. You need to register this PySpark DataFrame as a temporary SQL object and run a query on it (FROM DIFFERENT NOTEBOOKS AS WELL)?**

In [0]:
df.createOrReplaceGlobalTempView("globalview")
select * from global_temp.globalview;
#use global_temp keyword

**27. You need to query data from a PySpark DataFrame using SQL, but the data includes a nested structure. How would you flatten the data for easier querying?**

In [2]:
data = [("product1", {"price": 100, "quantity": 2}), 
        ("product2", {"price": 200, "quantity": 3})]
columns = ["product_id", "product_info"]

df = spark.createDataFrame(data, columns)
df.show(truncate=False)

+----------+-----------------------------+
|product_id|product_info                 |
+----------+-----------------------------+
|product1  |{price -> 100, quantity -> 2}|
|product2  |{price -> 200, quantity -> 3}|
+----------+-----------------------------+



In [3]:
df.select("product_id","product_info.price","product_info.quantity").createOrReplaceTempView("flatview")

In [4]:
spark.sql("select * from flatview").show()

+----------+-----+--------+
|product_id|price|quantity|
+----------+-----+--------+
|  product1|  100|       2|
|  product2|  200|       3|
+----------+-----+--------+



**28. You are ingesting data from an external API in JSON format where the schema is inconsistent. How would you handle this situation to ensure a robust pipeline?**

In [0]:
df=spark.read.format("json").option("mergeSchema",True)

**29. While reading data from Parquet, you need to optimize performance by partitioning the data based on a column. How would you implement this?**

In [0]:
df.write.format("parquet").mode("append").partitionBy("category").save("location")

**30. You are working with a large dataset in Parquet format and need to ensure that the data is written in an optimized manner with proper compression. How would you accomplish this?**

In [0]:
df.write.format('parquet').option("compression","snappy")

**31. Your company uses a large-scale data pipeline that reads from Delta tables and processes data using complex aggregations. However, performance is becoming an issue due to the growing dataset size. How would you optimize the performance of the pipeline?**

In [0]:
OPTIMIZE tabledelta ZORDER BY ('order_date')

**43. You are processing sales data. Group by product categories and create a list of all product names in each category.**

In [7]:
data = [("Electronics", "Laptop"), ("Electronics", "Smartphone"), ("Furniture", "Chair"), ("Furniture", "Table")]
columns = ["category", "product"]
df = spark.createDataFrame(data, columns)
df.show()

+-----------+----------+
|   category|   product|
+-----------+----------+
|Electronics|    Laptop|
|Electronics|Smartphone|
|  Furniture|     Chair|
|  Furniture|     Table|
+-----------+----------+



In [8]:
df=df.groupBy('category').agg(collect_list('product').alias('products'))
df.show()

+-----------+--------------------+
|   category|            products|
+-----------+--------------------+
|Electronics|[Laptop, Smartphone]|
|  Furniture|      [Chair, Table]|
+-----------+--------------------+



**44. You are analyzing orders. Group by customer IDs and list all unique product IDs each customer purchased.**

In [9]:
data = [(101, "P001"), (101, "P002"), (102, "P001"), (101, "P001")]
columns = ["customer_id", "product_id"]
df = spark.createDataFrame(data, columns)
df.show()

+-----------+----------+
|customer_id|product_id|
+-----------+----------+
|        101|      P001|
|        101|      P002|
|        102|      P001|
|        101|      P001|
+-----------+----------+



In [10]:
df=df.groupBy('customer_id').agg(collect_set('product_id').alias('unique_products'))
df.show()

+-----------+---------------+
|customer_id|unique_products|
+-----------+---------------+
|        101|   [P002, P001]|
|        102|         [P001]|
+-----------+---------------+



**45. For customer records, combine first and last names only if the email address exists.**

In [11]:
data = [("John", "Doe", "john.doe@example.com"), ("Jane", "Smith", None)]
columns = ["first_name", "last_name", "email"]
df = spark.createDataFrame(data, columns)
df.show()

+----------+---------+--------------------+
|first_name|last_name|               email|
+----------+---------+--------------------+
|      John|      Doe|john.doe@example.com|
|      Jane|    Smith|                NULL|
+----------+---------+--------------------+



In [13]:
df=df.withColumn('fullname',when(col('email').isNotNull(),concat_ws(".",'first_name','last_name')).otherwise(None))
df.show()

+----------+---------+--------------------+--------+
|first_name|last_name|               email|fullname|
+----------+---------+--------------------+--------+
|      John|      Doe|john.doe@example.com|John.Doe|
|      Jane|    Smith|                NULL|    NULL|
+----------+---------+--------------------+--------+



**46. You have a DataFrame containing customer IDs and a list of their purchased product IDs. Calculate the number of products each customer has purchased.**

In [14]:
data = [
    (1, ["prod1", "prod2", "prod3"]),
    (2, ["prod4"]),
    (3, ["prod5", "prod6"]),
]
myschema = "customer_id INT ,product_ids array<STRING>"

df = spark.createDataFrame(data, myschema)
df.show()

+-----------+--------------------+
|customer_id|         product_ids|
+-----------+--------------------+
|          1|[prod1, prod2, pr...|
|          2|             [prod4]|
|          3|      [prod5, prod6]|
+-----------+--------------------+



In [16]:
df=df.withColumn('no_of_prod',size(col('Product_ids')))

df.show()

+-----------+--------------------+----------+
|customer_id|         product_ids|no_of_prod|
+-----------+--------------------+----------+
|          1|[prod1, prod2, pr...|         3|
|          2|             [prod4]|         1|
|          3|      [prod5, prod6]|         2|
+-----------+--------------------+----------+



**47. You have employee IDs of varying lengths. Ensure all IDs are 6 characters long by padding with leading zeroes.**

In [17]:
data = [
    ("1",),
    ("123",),
    ("4567",),
]
schema = ["employee_id"]

df = spark.createDataFrame(data, schema)
df.show()

+-----------+
|employee_id|
+-----------+
|          1|
|        123|
|       4567|
+-----------+



In [19]:
df=df.withColumn("employee_id",lpad(col('employee_id'),6,'0'))
df.show()

+-----------+
|employee_id|
+-----------+
|     000001|
|     000123|
|     004567|
+-----------+



**48. You need to validate phone numbers by checking if they start with "91"**

In [20]:
data = [
    ("911234567890",),
    ("811234567890",),
    ("912345678901",),
]
schema = ["phone_number"]

df = spark.createDataFrame(data, schema)
df.show()

+------------+
|phone_number|
+------------+
|911234567890|
|811234567890|
|912345678901|
+------------+



In [21]:
df=df.filter(substring(col('phone_number'),1,2)=="91")
df.show()

+------------+
|phone_number|
+------------+
|911234567890|
|912345678901|
+------------+



**49. You have a dataset with courses taken by students. Calculate the average number of courses per student.**

In [23]:
data = [
    (1, ["Math", "Science"]),
    (2, ["History"]),
    (3, ["Art", "PE", "Biology"]),
]
schema = ["student_id", "courses"]

df = spark.createDataFrame(data, schema)
df.show()

+----------+------------------+
|student_id|           courses|
+----------+------------------+
|         1|   [Math, Science]|
|         2|         [History]|
|         3|[Art, PE, Biology]|
+----------+------------------+



In [24]:
df=df.withColumn("course_size",size('courses')).groupBy().agg(avg('course_size'))
df.show()

+----------------+
|avg(course_size)|
+----------------+
|             2.0|
+----------------+



**50. You have a dataset with primary and secondary contact numbers. Use the primary number if available; otherwise, use the secondary number.**

In [30]:
data = [
    (None, "1234567890"),
    ("9876543210", None),
    ("7894561230", "4567891230"),
]
schema = ["primary_contact", "secondary_contact"]

df = spark.createDataFrame(data, schema)
df.show()

+---------------+-----------------+
|primary_contact|secondary_contact|
+---------------+-----------------+
|           NULL|       1234567890|
|     9876543210|             NULL|
|     7894561230|       4567891230|
+---------------+-----------------+



In [32]:
df=df.withColumn("contact",coalesce(col('primary_contact'),col('secondary_contact')))
df.show()

+---------------+-----------------+----------+
|primary_contact|secondary_contact|   contact|
+---------------+-----------------+----------+
|           NULL|       1234567890|1234567890|
|     9876543210|             NULL|9876543210|
|     7894561230|       4567891230|7894561230|
+---------------+-----------------+----------+



**51. You are categorizing product codes based on their lengths. If the length is 5, label it as "Standard"; otherwise, label it as "Custom".**

In [35]:
data = [
    ("prod1",),
    ("prd234",),
    ("pr9876",),
]
schema = ["product_code"]

df = spark.createDataFrame(data, schema)
df.show()

+------------+
|product_code|
+------------+
|       prod1|
|      prd234|
|      pr9876|
+------------+



In [37]:
df=df.withColumn("code_flag",when(length(col('product_code'))==5,"standard").otherwise("custom"))
df.show()

+------------+---------+
|product_code|code_flag|
+------------+---------+
|       prod1| standard|
|      prd234|   custom|
|      pr9876|   custom|
+------------+---------+

