In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Start (or attach to) the Spark session that every later cell relies on
spark = SparkSession.builder.appName("PySpark_Mastery").getOrCreate()

# Employee records. Tuple positions follow the schema declared below:
# (id, first_name, last_name, age, department, salary, hire_date, skills, email, city)
employee_data = [
    (1, "John", "Doe", 28, "Engineering", 75000, "2020-01-15", ["Python", "Spark", "SQL"], "john.doe@company.com", "New York"),
    (2, "Jane", "Smith", 32, "Marketing", 65000, "2019-03-20", ["Excel", "PowerBI", "SQL"], "jane.smith@company.com", "Los Angeles"),
    (3, "Mike", "Johnson", 45, "Engineering", 95000, "2018-07-10", ["Java", "Scala", "Spark"], "mike.johnson@company.com", "New York"),
    (4, "Sarah", "Wilson", 29, "Sales", 55000, "2021-02-28", ["Salesforce", "Excel"], "sarah.wilson@company.com", "Chicago"),
    (5, "David", "Brown", 35, "Engineering", 85000, "2019-11-05", ["Python", "AWS", "Docker"], "david.brown@company.com", "Seattle"),
    (6, "Lisa", "Davis", 27, "Marketing", 60000, "2020-09-12", ["Google Ads", "Analytics"], None, "Los Angeles"),
    (7, "Tom", "Miller", 41, "Sales", 70000, "2017-05-18", ["CRM", "Excel"], "tom.miller@company.com", "Chicago"),
    (8, "Anna", "Garcia", 33, "Engineering", 80000, "2020-12-01", ["React", "Node.js", "MongoDB"], "anna.garcia@company.com", "Austin")
]

# Every column is nullable; hire_date is loaded as a plain string and parsed later
employee_schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("first_name", StringType(), True)
    .add("last_name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("department", StringType(), True)
    .add("salary", IntegerType(), True)
    .add("hire_date", StringType(), True)
    .add("skills", ArrayType(StringType()), True)
    .add("email", StringType(), True)
    .add("city", StringType(), True)
)

df_employees = spark.createDataFrame(data=employee_data, schema=employee_schema)

In [0]:
# Sales orders. Tuple positions follow the schema declared below:
# (order_id, customer_id, order_date, category, product, price, quantity, channel)
sales_data = [
    (101, 1, "2023-01-15", "Electronics", "Laptop", 1200, 1, "Online"),
    (102, 2, "2023-01-16", "Clothing", "T-Shirt", 25, 3, "Store"),
    (103, 1, "2023-01-17", "Electronics", "Mouse", 30, 2, "Online"),
    (104, 3, "2023-01-18", "Books", "Python Guide", 45, 1, "Online"),
    (105, 4, "2023-01-19", "Electronics", "Keyboard", 80, 1, "Store"),
    (106, 2, "2023-01-20", "Clothing", "Jeans", 60, 2, "Online"),
    (107, 5, "2023-01-21", "Books", "Data Science", 55, 1, "Online"),
    (108, 1, "2023-01-22", "Electronics", "Monitor", 300, 1, "Store")
]

# Every column is nullable; order_date is kept as a string
sales_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("customer_id", IntegerType(), True)
    .add("order_date", StringType(), True)
    .add("category", StringType(), True)
    .add("product", StringType(), True)
    .add("price", IntegerType(), True)
    .add("quantity", IntegerType(), True)
    .add("channel", StringType(), True)
)

df_sales = spark.createDataFrame(data=sales_data, schema=sales_schema)

In [0]:
# Department reference rows: (dept_name, location, manager, budget)
dept_data = [
    ("Engineering", "Tech Tower", "Alice Johnson", 50),
    ("Marketing", "Business Center", "Bob Smith", 25),
    ("Sales", "Sales Plaza", "Carol White", 30),
    ("HR", "Admin Building", "David Lee", 15)
]

# Every column is nullable
dept_schema = (
    StructType()
    .add("dept_name", StringType(), True)
    .add("location", StringType(), True)
    .add("manager", StringType(), True)
    .add("budget", IntegerType(), True)
)

df_departments = spark.createDataFrame(data=dept_data, schema=dept_schema)

In [0]:

# Derive presentation-friendly columns from the raw employee frame.
df_employees_enhanced = df_employees.withColumn(
    "full_name",
    concat(col("first_name"), lit(" "), col("last_name"))
)

# Years of experience = completed years between the hire date and today.
# Subtracting calendar years (year(today) - year(hire_date)) would credit a
# full year to a December hire as soon as January arrives, so the elapsed
# months are counted and floored to whole years instead. The cast keeps the
# column an integer.
df_employees_enhanced = df_employees_enhanced.withColumn(
    "years_experience",
    floor(
        months_between(current_date(), to_date(col("hire_date"), "yyyy-MM-dd")) / 12
    ).cast("int")
)

df_employees_enhanced.display()


# Salary bands: >= 80000 is "High", >= 60000 is "Medium", everything else
# (including a null salary) falls through to "Entry Level".
df_employees_enhanced = df_employees_enhanced.withColumn(
    "salary_category",
    when(col("salary") >= 80000, "High")
    .when(col("salary") >= 60000, "Medium")
    .otherwise("Entry Level")
)

df_employees_enhanced.select("full_name", "years_experience", "salary_category").show()

In [0]:
# Drop rows that are identical across every column, then compare row counts
completely_unique = df_employees.drop_duplicates()

original_row_count = df_employees.count()
print(f"Original count: {original_row_count}")

deduplicated_row_count = completely_unique.count()
print(f"After removing duplicates: {deduplicated_row_count}")

In [0]:
# Employees whose email is missing
employees_no_email = df_employees.where(col("email").isNull())
missing_email_employees = employees_no_email.count()
print(f"Employees without email: {missing_email_employees}")

# Complete records: email, salary and department are all populated
critical_fields_present = (
    col("email").isNotNull()
    & col("salary").isNotNull()
    & col("department").isNotNull()
)
complete_records = df_employees.where(critical_fields_present)
complete_records.display()

# Data quality report: share of rows with no email
total_records = df_employees.count()
null_emails = df_employees.where(col("email").isNull()).count()
missing_email_pct = null_emails / total_records * 100
print(f"Data Quality Report:")
print(f"Total Records: {total_records}")
print(f"Missing Emails: {null_emails} ({missing_email_pct:.1f}%)")


In [0]:
# Salary summary per department: mean, extremes, headcount and spread.
# max/min below are the pyspark.sql.functions aggregates brought in by the
# star import, not the Python built-ins.
salary_aggregates = [
    avg("salary").alias("avg_salary"),
    max("salary").alias("max_salary"),
    min("salary").alias("min_salary"),
    count("*").alias("employee_count"),
    stddev("salary").alias("salary_stddev"),
]
dept_stats = df_employees.groupBy("department").agg(*salary_aggregates)
dept_stats.show()

In [0]:
# Standard deviation of salary within each department: a measure of how
# widely pay varies inside a department.
dept_salary_stddev = (
    df_employees
    .groupBy("department")
    .agg(stddev(col("salary")).alias("salary_stddev"))
)
display(dept_salary_stddev)

 fillna() and dropna() — The Null Handlers

In [0]:
# Show the untouched employee frame as the baseline for the null-handling examples
display(df_employees)

In [0]:
# Replace missing emails and cities with placeholder defaults
df_filled_data = df_employees.na.fill({"email": "sugan@gmail.com", "city" : "Rochester"})
df_filled_data.display()

# Drop every row that has a null in any column
# (this rebinds df_filled_data, replacing the filled frame built above)
df_filled_data = df_employees.na.drop()
df_filled_data.display()

# Drop only the rows where email or department is null
df_specific = df_employees.na.drop(subset=["email", "department"])
df_specific.display()

In [0]:
# Pull the first two entries of the skills array into their own columns
parse_skill = (
    df_employees
    .withColumn("skill1", col("skills")[0])
    .withColumn("skill2", col("skills")[1])
)
parse_skill.display()

In [0]:
# Same idea with a third slot; some employees list only two skills, so the
# third lookup reads past the end of their array
parse_skill = (
    df_employees
    .withColumn("skill1", col("skills")[0])
    .withColumn("skill2", col("skills")[1])
    .withColumn("skill3", col("skills")[2])
)
parse_skill.display()

In [0]:
from pyspark.sql.functions import col, get

# Split the first three skills into separate columns using get()
parse_skill = df_employees
for position, column_name in enumerate(("skill1", "skill2", "skill3")):
    parse_skill = parse_skill.withColumn(column_name, get(col("skills"), position))

display(parse_skill)

In [0]:
# Per department: the employees' first names plus the distinct skills they hold.
# explode() is a generator and cannot be nested inside an aggregate, so the
# per-employee skill arrays are gathered with collect_list, merged into one
# array with flatten, and de-duplicated with array_distinct.
skills_by_dept = df_employees.groupBy("department").agg(
    collect_list("first_name").alias("employee_names"),
    array_distinct(flatten(collect_list("skills"))).alias("unique_skills")
)
# truncate=False so the skill arrays are printed in full
skills_by_dept.show(truncate=False)

In [0]:
# One output row per (employee, skill) so individual skills can be counted
identity_columns = ["first_name", "last_name", "department"]
skills_exploded = df_employees.select(
    *identity_columns,
    explode("skills").alias("individual_skill"),
)
skills_exploded.show()

In [0]:
# Rank skills by how many employees list them; departments_using carries one
# department entry per employee holding the skill
skill_popularity = (
    skills_exploded
    .groupBy("individual_skill")
    .agg(
        count("*").alias("skill_count"),
        collect_list("department").alias("departments_using"),
    )
    .orderBy(desc("skill_count"))
)
skill_popularity.show(truncate=False)

# Employees who list Python among their skills
python_experts = skills_exploded.where(col("individual_skill") == "Python")
python_experts.show()

In [0]:
# Parse the hire_date string once and reuse the expression for the tenure in days
hire_date_parsed = to_date(col("hire_date"), "yyyy-MM-dd")

df_with_dates = (
    df_employees
    .withColumn("hire_date_formatted", hire_date_parsed)
    .withColumn("current_date", current_date())
    .withColumn("days_employed", datediff(current_date(), hire_date_parsed))
)

In [0]:
# Break the parsed hire date into year / month / quarter columns
df_date_analysis = (
    df_with_dates
    .withColumn("hire_year", year("hire_date_formatted"))
    .withColumn("hire_month", month("hire_date_formatted"))
    .withColumn("hire_quarter", quarter("hire_date_formatted"))
)

# Hires per (year, quarter), listed chronologically
hiring_trends = (
    df_date_analysis
    .groupBy("hire_year", "hire_quarter")
    .agg(count("*").alias("hires_count"))
    .orderBy("hire_year", "hire_quarter")
)
hiring_trends.show()

# Human-readable hire date for reports, e.g. "Jan 15, 2020"
df_formatted_dates = df_with_dates.withColumn(
    "hire_date_display",
    date_format("hire_date_formatted", "MMM dd, yyyy"),
)
df_formatted_dates.select("first_name", "hire_date_display", "days_employed").show()

In [0]:
# Regex-derived columns: the part of the email before "@", and a flag for
# employees whose skill list mentions Python, Java, SQL or Spark
skills_csv = concat_ws(",", col("skills"))
mentions_tech = regexp_extract(skills_csv, "(Python|Java|SQL|Spark)", 1) != ""

df_regex = (
    df_employees
    .withColumn("email_username", regexp_extract(col("email"), "([^@]+)@", 1))
    .withColumn("has_tech_skills", when(mentions_tech, True).otherwise(False))
)
df_regex.select("first_name", "email_username", "has_tech_skills").show()

In [0]:
# Clean and standardize data
# NOTE(review): "phone_cleaned" is derived from first_name (the employee schema
# has no phone column), so the name is misleading. It is left unchanged in case
# later cells reference it; confirm and rename if nothing depends on it.
df_cleaned = df_employees.withColumn(
    "phone_cleaned",
    regexp_replace(col("first_name"), "[^a-zA-Z]", "")  # Strip everything except ASCII letters
).withColumn(
    "city_standardized",
    regexp_replace(col("city"), "\\s+", " ")  # Collapse each whitespace run into a single space
)
# Show the cleaned columns next to their source columns. This must read from
# df_cleaned, the frame this cell builds.
df_cleaned.select("first_name", "phone_cleaned", "city", "city_standardized").show()

In [0]:
# Detect skew by checking partition sizes
def check_skew(df, threshold=3):
    """Report whether the DataFrame's partitions are unevenly sized.

    Counts the rows in each partition and compares the largest partition
    with the smallest.

    Args:
        df: Object exposing ``df.rdd.mapPartitions(...).collect()``
            (a Spark DataFrame in this notebook).
        threshold: Largest/smallest row-count ratio above which the data is
            considered skewed. Defaults to 3.

    Returns:
        bool: True when the ratio exceeds ``threshold``. An empty partition
        next to a non-empty one counts as skew (infinite ratio). A frame with
        no partitions, or with only empty partitions, is not skewed.
    """
    # This notebook star-imports pyspark.sql.functions, which rebinds max/min
    # to Column aggregates; go through builtins to get the Python versions.
    import builtins

    def _count_rows(rows):
        # mapPartitions hands over an iterator, which has no len();
        # count by consuming it.
        total = 0
        for _ in rows:
            total += 1
        yield total

    sizes = df.rdd.mapPartitions(_count_rows).collect()
    if not sizes:
        return False

    max_size = builtins.max(sizes)
    min_size = builtins.min(sizes)
    if max_size == 0:
        # Every partition is empty: nothing to be skewed.
        return False

    skew_ratio = max_size / min_size if min_size > 0 else float('inf')
    return skew_ratio > threshold

# Mitigate skew with salting
from pyspark.sql.functions import rand, floor, concat, lit

def salted_join(df1, df2, join_key, num_salts=10):
    """Inner-join a large, skewed DataFrame to a smaller one using key salting.

    Each row of the large side gets one random salt in ``[0, num_salts)``, so
    a hot key is spread over up to ``num_salts`` distinct salted keys. Each
    row of the small side is replicated once per salt value, so every salted
    key on the large side still finds its match. Salting both sides with
    independent random values would keep a matching pair only when the two
    salts happen to coincide, silently dropping most of the join result.

    Args:
        df1: The larger (skewed) DataFrame.
        df2: The smaller DataFrame; it is replicated ``num_salts`` times.
        join_key: Name of the join column, present in both DataFrames.
        num_salts: Number of salt buckets. Defaults to 10.

    Returns:
        The joined DataFrame. It carries the columns of both inputs plus the
        helper columns ``salt`` and ``join_key_salt`` from each side.

    Raises:
        ValueError: If ``num_salts`` is smaller than 1.
    """
    # Local import keeps the function independent of the notebook's star import.
    from pyspark.sql import functions as F

    if num_salts < 1:
        raise ValueError("num_salts must be a positive integer")

    # Smaller table: one copy of every row per salt value.
    all_salts = F.array(*[F.lit(salt_value) for salt_value in range(num_salts)])
    salt_df = df2.withColumn("salt", F.explode(all_salts)) \
                 .withColumn("join_key_salt", F.concat(join_key, F.lit("_"), F.col("salt")))

    # Larger table: a single random salt per row.
    large_salt_df = df1.withColumn("salt", F.floor(F.rand() * num_salts).cast("int")) \
                      .withColumn("join_key_salt", F.concat(join_key, F.lit("_"), F.col("salt")))

    return large_salt_df.join(salt_df, large_salt_df.join_key_salt == salt_df.join_key_salt)