In [0]:
# Print a greeting.
print("Hello")

# PySpark Interview Questions from Ansh

### Q1. While ingesting customer data from an external source, you notice duplicate entries. How would you remove the duplicates and retain only the latest entry based on a timestamp column?

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Sample sales rows laid out as (product_id, date, sales).
# Each product_id appears twice, on two different dates, so there is
# something to de-duplicate in the cells that follow.
columns = ["product_id", "date", "sales"]
data = [
    ("101", "2023-12-01", 100),
    ("101", "2023-12-02", 150),
    ("102", "2023-12-01", 200),
    ("102", "2023-12-02", 250),
]

df = spark.createDataFrame(data, schema=columns)
df.display()

In [0]:
# The 'date' values were supplied as strings; convert the column to DateType in place.
df = df.withColumn('date', df['date'].cast(DateType()))

In [0]:
# Alternative conversion using to_date(); the result is only displayed,
# it is not assigned back to df.
(
    df.withColumn('date', to_date(col('date')))
    .display()
)

In [0]:
# Show df as it currently stands (the to_date() preview above was not assigned back).
df.display()

In [0]:
# Naive de-duplication: one row per product_id, displayed only (df is not reassigned).
# NOTE: nothing here says which of the duplicate rows survives, so this does
# not by itself keep the latest entry.
df.drop_duplicates(['product_id']).display()

In [0]:
# Show df again: the dropDuplicates result in the previous cell was not
# assigned, so df still holds every row.
df.display()

In [0]:
# Keep exactly one row per product_id: the row with the latest date.
#
# Sorting and then calling dropDuplicates() is not a reliable way to do this:
# dropDuplicates() makes no promise to keep the first row of a previously
# sorted DataFrame, so the surviving row can vary between runs or cluster
# layouts. Ranking rows inside a per-product window makes the choice explicit.
#
# If two rows of a product share the same latest date, row_number() still
# keeps only one of them (which one is unspecified).
latest_first = Window.partitionBy('product_id').orderBy(col('date').desc())

df = (
    df.withColumn('_row_num', row_number().over(latest_first))
    .filter(col('_row_num') == 1)
    .drop('_row_num')  # helper column is not part of the result
)

In [0]:
# Show the de-duplicated DataFrame assigned to df in the previous cell.
df.display()

# 2. While processing data from multiple files with inconsistent schemas, you need to merge them into a single DataFrame. How would you handle this inconsistency in PySpark?

In [0]:
# Read every parquet file under the output directory into one DataFrame.
# mergeSchema asks the parquet reader to reconcile the schemas of the
# individual files instead of taking the schema from a single file.
df = (
    spark.read
    .format('parquet')
    .option('mergeSchema', True)
    .load('/Volumes/datasets/practice/internal_files/output/')
)

In [0]:
# Show the DataFrame loaded from the parquet directory with mergeSchema enabled.
df.display()

# MapReduce
Writes intermediate results to disk between each stage (e.g., between Mapper and Reducer).

The Reducer reads data from disk, which makes the process time-consuming.

It is primarily designed for batch processing.

Due to frequent disk I/O, MapReduce is slower than in-memory frameworks like Spark.

# Apache Spark
Performs most computations in memory, writing to disk mainly for shuffle files, for final write operations, or when the data does not fit in memory (spill).

Faster compared to MapReduce due to reduced disk I/O.

Less time-consuming for iterative and complex computations.

Supports both batch and streaming processing.

# 4. You are working with a real-time data pipeline, and you notice missing values in the `Category` column of your streaming data. How would you handle null or missing values in such a scenario?

```python
df_stream = spark.readStream.schema("id INT, value STRING").csv("path/to/stream")
```

In [0]:
# Load the BigMart Sales CSV with a header row and an inferred schema.
# NOTE: despite the variable name, this is a batch read (spark.read), not
# a streaming read (spark.readStream).
df_stream = (
    spark.read
    .format('csv')
    .option('InferSchema', True)
    .option('Header', 'True')
    .load('/Volumes/datasets/practice/internal_files/BigMart Sales.csv')
)

In [0]:
# Show the CSV data as loaded, before any null handling.
df_stream.display()

In [0]:
# Replace nulls in Item_Weight with the constant 100; other columns are left untouched.
df_st = df_stream.na.fill({'Item_Weight': 100})

In [0]:
# Show the result with nulls in Item_Weight replaced.
df_st.display()

## 5. You need to calculate the total number of actions performed by each user in a system. How would you find the top 5 most active users based on this information?

In [0]:
# Sample activity rows as (user_id, actions); user2 appears twice so the
# per-user total differs from any single row.
columns = ["user_id", "actions"]
data = [
    ("user1", 5),
    ("user2", 8),
    ("user3", 2),
    ("user4", 10),
    ("user2", 3),
]

df = spark.createDataFrame(data, schema=columns)
df.display()

In [0]:
# Total actions per user, then the five users with the highest totals.
#
# `sum` here is pyspark.sql.functions.sum (brought into scope by the star
# import at the top of the notebook), not Python's builtin sum.
#
# user_id is used as a secondary sort key: ordering on Total_Actions alone
# leaves ties in an unspecified order, so the rows that survive limit(5)
# could change from run to run.
(
    df.groupBy('user_id')
    .agg(sum('actions').alias('Total_Actions'))
    .orderBy(col('Total_Actions').desc(), col('user_id').asc())
    .limit(5)
    .display()
)

# 6. While processing sales transaction data, you need to identify the most recent transaction for each customer. How would you approach this task?

In [0]:
# Sample transactions as (customer_id, transaction_date, sales); each
# customer has two transactions on different dates.
columns = ["customer_id", "transaction_date", "sales"]
data = [
    ("cust1", "2023-12-01", 100),
    ("cust2", "2023-12-02", 150),
    ("cust1", "2023-12-03", 200),
    ("cust2", "2023-12-04", 250),
]
df = spark.createDataFrame(data, schema=columns)
df.display()

In [0]:
# Latest transaction date per customer (date only, not the full row).
# `max` is pyspark.sql.functions.max via the star import, not the builtin.
# transaction_date is a string column here, so the comparison is
# lexicographic; that matches chronological order for yyyy-MM-dd values.
(
    df.groupBy('customer_id')
    .agg(max(col('transaction_date')).alias('Last_Transaction'))
    .display()
)

In [0]:
from pyspark.sql.window import Window

In [0]:
# Rank each customer's transactions from newest to oldest and keep rank 1,
# which yields the full row of the most recent transaction.
# dense_rank gives tied rows the same rank, so if a customer has several
# transactions on their latest date, all of them are kept.
# The helper 'dense_rank' column stays in the resulting DataFrame.
df = (
    df.withColumn(
        'dense_rank',
        dense_rank().over(
            Window.partitionBy('customer_id').orderBy(desc('transaction_date'))
        ),
    )
    .where(col('dense_rank') == 1)
)

In [0]:
# Show the rows that ranked first per customer (includes the 'dense_rank' helper column).
df.display()