# Basic Data Processing with PySpark

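This notebook demonstrates basic data processing operations using PySpark: building a small sample DataFrame, removing duplicate rows, filling null values, and computing a simple grouped count.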

In [None]:
# Import required libraries
import sys
sys.path.append('../..')

from utilities.spark_utils.session import create_spark_session
from utilities.io.readers import DataReader
from utilities.transformations.data_cleaning import remove_duplicates, handle_null_values
from pyspark.sql.functions import col, when, count

In [None]:
# Create Spark session
# create_spark_session is a project helper (utilities.spark_utils.session).
# The object it returns is used below to build the sample DataFrame and is
# stopped in the last cell of the notebook.
spark = create_spark_session(app_name="Data Processing Example")
# NOTE(review): `reader` is not used by any later cell visible in this
# notebook -- presumably kept to show how a DataReader is constructed;
# confirm, or drop it if it is not needed.
reader = DataReader(spark)

In [None]:
# Create sample data
# Five rows that deliberately contain the problems the later cells clean up:
# one exact repeat of the id 1 row, one missing name and one missing email.
data = [
    (1, "John Doe", "john@email.com", "active"),
    (2, "Jane Smith", "jane@email.com", "inactive"),
    (3, None, "bob@email.com", "active"),  # missing name
    (1, "John Doe", "john@email.com", "active"),  # duplicate
    (4, "Alice Brown", None, "active")  # missing email
]

# One column name per tuple position in `data`.
columns = ["id", "name", "email", "status"]
# Only column names are passed, no explicit types.
# NOTE(review): assuming `spark` is a regular SparkSession, column types are
# inferred from the data -- confirm against create_spark_session.
df = spark.createDataFrame(data, columns)

# Print the row count (five tuples were supplied above), then display the rows.
print(f"Original data: {df.count()} rows")
df.show()

In [None]:
# Remove duplicates
# remove_duplicates is a project helper (utilities.transformations.data_cleaning).
# NOTE(review): it is called with no extra arguments, so it presumably compares
# whole rows, which would collapse the repeated id 1 row -- confirm against
# the helper's implementation.
df_dedup = remove_duplicates(df)
# Print the row count after deduplication, then display the remaining rows.
print(f"After removing duplicates: {df_dedup.count()} rows")
df_dedup.show()

In [None]:
# Handle null values
# handle_null_values is a project helper (utilities.transformations.data_cleaning),
# invoked here with the "fill" strategy and the string "UNKNOWN" as replacement.
# NOTE(review): presumably this replaces the null name and null email in the
# sample rows; whether a string fill value also applies to non-string columns
# such as `id` depends on the helper -- confirm.
df_cleaned = handle_null_values(df_dedup, strategy="fill", fill_value="UNKNOWN")
print("After handling nulls:")
df_cleaned.show()

In [None]:
# Basic analytics
# Count how many cleaned rows fall under each distinct `status` value.
# Spark does not guarantee the row order of grouped output, so the result is
# sorted by status to keep the displayed table stable from run to run.
status_counts = df_cleaned.groupBy("status").count().orderBy("status")
print("Status distribution:")
status_counts.show()

In [None]:
# Clean up
# Stop the session created at the top of the notebook. Run this cell last:
# presumably the DataFrames built from `spark` cannot be evaluated once the
# session has been stopped.
spark.stop()