PySpark Practice - Complete Examples

In [27]:
# Import libraries
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [28]:
# Build (or reuse) the session for this notebook in local mode
spark = (
    SparkSession.builder
    .appName("PySpark Practice")
    .master("local[*]")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

# Keep the cell output readable: only log errors
spark.sparkContext.setLogLevel("ERROR")

print(f"‚úÖ Spark Version: {spark.version}")
print(f"‚úÖ Spark Master: {spark.sparkContext.master}")

‚úÖ Spark Version: 4.0.1
‚úÖ Spark Master: local[*]


In [29]:
# ============================================
# 2. Create Sample DataFrames
# ============================================

# Method 1: From Python list
# Column names, in the same order as the fields of each tuple in `data`
columns = ["name", "age", "department", "salary"]

# One tuple per employee
data = [
    ("Alice", 25, "Engineering", 75000),
    ("Bob", 30, "Marketing", 65000),
    ("Charlie", 35, "Engineering", 85000),
    ("Diana", 28, "HR", 60000),
    ("Eve", 32, "Engineering", 90000),
    ("Frank", 29, "Marketing", 70000),
]

df = spark.createDataFrame(data, schema=columns)

print("\nüìä Sample DataFrame:")
df.show()


üìä Sample DataFrame:


                                                                                

+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 25|Engineering| 75000|
|    Bob| 30|  Marketing| 65000|
|Charlie| 35|Engineering| 85000|
|  Diana| 28|         HR| 60000|
|    Eve| 32|Engineering| 90000|
|  Frank| 29|  Marketing| 70000|
+-------+---+-----------+------+



In [30]:
# ============================================
# 3. Basic DataFrame Operations
# ============================================

# Projection: keep only the name and salary columns
print("\nüîπ Select specific columns:")
df.select(col("name"), col("salary")).show()

# Row filter on the salary column
print("\nüîπ Filter: Salary > 70000")
df.where(col("salary") > 70000).show()

# Derived column computed from salary
print("\nüîπ Add bonus column (10% of salary):")
df.withColumn("bonus", col("salary") * 0.1).show()

# Rename a column and show the first five rows
print("\nüîπ Rename column:")
df.withColumnRenamed("name", "employee_name").show(n=5)

# Remove a column
print("\nüîπ Drop age column:")
df.drop("age").show()


üîπ Select specific columns:
+-------+------+
|   name|salary|
+-------+------+
|  Alice| 75000|
|    Bob| 65000|
|Charlie| 85000|
|  Diana| 60000|
|    Eve| 90000|
|  Frank| 70000|
+-------+------+


üîπ Filter: Salary > 70000
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 25|Engineering| 75000|
|Charlie| 35|Engineering| 85000|
|    Eve| 32|Engineering| 90000|
+-------+---+-----------+------+


üîπ Add bonus column (10% of salary):
+-------+---+-----------+------+------+
|   name|age| department|salary| bonus|
+-------+---+-----------+------+------+
|  Alice| 25|Engineering| 75000|7500.0|
|    Bob| 30|  Marketing| 65000|6500.0|
|Charlie| 35|Engineering| 85000|8500.0|
|  Diana| 28|         HR| 60000|6000.0|
|    Eve| 32|Engineering| 90000|9000.0|
|  Frank| 29|  Marketing| 70000|7000.0|
+-------+---+-----------+------+------+


üîπ Rename column:
+-------------+---+-----------+------+
|employee_name|age| department|salar

In [31]:
# ============================================
# 4. Aggregations
# ============================================
# Aggregate functions are referenced through the `F` namespace so that
# `min` / `max` can never be mistaken for the Python builtins of the same
# name; the cell no longer depends on the star import shadowing them.

print("\nüìà Aggregations:")

# Count
print(f"Total employees: {df.count()}")

# Group by and aggregate: average salary and headcount per department
print("\nüîπ Average salary by department:")
df.groupBy("department").agg(
    F.avg("salary").alias("avg_salary"),
    F.count("*").alias("employee_count"),
).show()

# Multiple aggregations: average, minimum, maximum and headcount per department
print("\nüîπ Department statistics:")
df.groupBy("department").agg(
    F.avg("salary").alias("avg_salary"),
    F.min("salary").alias("min_salary"),
    F.max("salary").alias("max_salary"),
    F.count("*").alias("count"),
).show()


üìà Aggregations:
Total employees: 6

üîπ Average salary by department:
+-----------+-----------------+--------------+
| department|       avg_salary|employee_count|
+-----------+-----------------+--------------+
|Engineering|83333.33333333333|             3|
|  Marketing|          67500.0|             2|
|         HR|          60000.0|             1|
+-----------+-----------------+--------------+


üîπ Department statistics:
+-----------+-----------------+----------+----------+-----+
| department|       avg_salary|min_salary|max_salary|count|
+-----------+-----------------+----------+----------+-----+
|Engineering|83333.33333333333|     75000|     90000|    3|
|  Marketing|          67500.0|     65000|     70000|    2|
|         HR|          60000.0|     60000|     60000|    1|
+-----------+-----------------+----------+----------+-----+



In [32]:
# ============================================
# 5. Sorting
# ============================================

# Highest salary first
print("\nüîπ Sort by salary (descending):")
df.sort(desc("salary")).show()


üîπ Sort by salary (descending):
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|    Eve| 32|Engineering| 90000|
|Charlie| 35|Engineering| 85000|
|  Alice| 25|Engineering| 75000|
|  Frank| 29|  Marketing| 70000|
|    Bob| 30|  Marketing| 65000|
|  Diana| 28|         HR| 60000|
+-------+---+-----------+------+



In [33]:
# ============================================
# 6. Spark SQL
# ============================================

# Make the DataFrame queryable from SQL under the name "employees"
df.createOrReplaceTempView("employees")

print("\nüîπ SQL Query:")
# Average salary and headcount per department, restricted to employees
# aged 28 or older, ordered by the average salary (highest first)
employee_summary_sql = """
    SELECT 
        department,
        AVG(salary) as avg_salary,
        COUNT(*) as emp_count
    FROM employees
    WHERE age >= 28
    GROUP BY department
    ORDER BY avg_salary DESC
"""
result = spark.sql(employee_summary_sql)
result.show()


üîπ SQL Query:
+-----------+----------+---------+
| department|avg_salary|emp_count|
+-----------+----------+---------+
|Engineering|   87500.0|        2|
|  Marketing|   67500.0|        2|
|         HR|   60000.0|        1|
+-----------+----------+---------+



In [34]:
# ============================================
# 7. Working with Dates
# ============================================

# (date string, amount) pairs
date_data = [
    ("2024-01-15", 1000),
    ("2024-02-20", 1500),
    ("2024-03-10", 2000),
    ("2024-04-05", 1200),
]

df_dates = spark.createDataFrame(date_data, schema=["date", "amount"])

print("\nüìÖ Date operations:")
# Convert the string column to a date first, then derive the calendar parts
# from the converted column in a single chain.
df_dates = (
    df_dates
    .withColumn("date", to_date(col("date")))
    .withColumn("year", year(col("date")))
    .withColumn("month", month(col("date")))
    .withColumn("day", dayofmonth(col("date")))
)
df_dates.show()


üìÖ Date operations:
+----------+------+----+-----+---+
|      date|amount|year|month|day|
+----------+------+----+-----+---+
|2024-01-15|  1000|2024|    1| 15|
|2024-02-20|  1500|2024|    2| 20|
|2024-03-10|  2000|2024|    3| 10|
|2024-04-05|  1200|2024|    4|  5|
+----------+------+----+-----+---+



In [35]:
# ============================================
# 8. String Operations
# ============================================

print("\nüîπ String operations:")
# Upper-cased name, name length, and the first two characters of the name
(
    df.withColumn("name_upper", upper(col("name")))
    .withColumn("name_length", length(col("name")))
    .withColumn("name_substr", substring(col("name"), 1, 2))
    .show()
)


üîπ String operations:
+-------+---+-----------+------+----------+-----------+-----------+
|   name|age| department|salary|name_upper|name_length|name_substr|
+-------+---+-----------+------+----------+-----------+-----------+
|  Alice| 25|Engineering| 75000|     ALICE|          5|         Al|
|    Bob| 30|  Marketing| 65000|       BOB|          3|         Bo|
|Charlie| 35|Engineering| 85000|   CHARLIE|          7|         Ch|
|  Diana| 28|         HR| 60000|     DIANA|          5|         Di|
|    Eve| 32|Engineering| 90000|       EVE|          3|         Ev|
|  Frank| 29|  Marketing| 70000|     FRANK|          5|         Fr|
+-------+---+-----------+------+----------+-----------+-----------+



In [36]:
# ============================================
# 9. Handling Nulls
# ============================================

# Each of the last three rows is missing exactly one field
null_data = [
    ("John", 25, 50000),
    ("Jane", None, 60000),
    ("Bob", 30, None),
    (None, 28, 55000),
]

df_nulls = spark.createDataFrame(null_data, schema=["name", "age", "salary"])

print("\n‚ùì DataFrame with nulls:")
df_nulls.show()


‚ùì DataFrame with nulls:
+----+----+------+
|name| age|salary|
+----+----+------+
|John|  25| 50000|
|Jane|NULL| 60000|
| Bob|  30|  NULL|
|NULL|  28| 55000|
+----+----+------+



In [37]:
# Remove every row that contains at least one null
print("\nüîπ Drop rows with any null:")
df_nulls.na.drop().show()

# Replace nulls with a per-column default value
print("\nüîπ Fill nulls:")
df_nulls.na.fill({"age": 0, "salary": 0, "name": "Unknown"}).show()


üîπ Drop rows with any null:
+----+---+------+
|name|age|salary|
+----+---+------+
|John| 25| 50000|
+----+---+------+


üîπ Fill nulls:
+-------+---+------+
|   name|age|salary|
+-------+---+------+
|   John| 25| 50000|
|   Jane|  0| 60000|
|    Bob| 30|     0|
|Unknown| 28| 55000|
+-------+---+------+



In [38]:
# ============================================
# 10. Joins
# ============================================

# Lookup table: department -> building
dept_data = [
    ("Engineering", "Building A"),
    ("Marketing", "Building B"),
    ("HR", "Building C"),
]

df_dept = spark.createDataFrame(dept_data, schema=["department", "location"])

# Inner join on the shared "department" column
print("\nüîó Join example:")
df.join(df_dept, "department", "inner").show()


üîó Join example:
+-----------+-------+---+------+----------+
| department|   name|age|salary|  location|
+-----------+-------+---+------+----------+
|Engineering|  Alice| 25| 75000|Building A|
|Engineering|Charlie| 35| 85000|Building A|
|Engineering|    Eve| 32| 90000|Building A|
|         HR|  Diana| 28| 60000|Building C|
|  Marketing|    Bob| 30| 65000|Building B|
|  Marketing|  Frank| 29| 70000|Building B|
+-----------+-------+---+------+----------+



In [39]:
# ============================================
# 11. Window Functions
# ============================================

print("\nü™ü Window functions:")

# One window per department, rows ordered from highest to lowest salary
window_spec = Window.partitionBy("department").orderBy(desc("salary"))

# Apply the three ranking functions over the same window
(
    df.withColumn("rank", rank().over(window_spec))
    .withColumn("dense_rank", dense_rank().over(window_spec))
    .withColumn("row_number", row_number().over(window_spec))
    .show()
)



ü™ü Window functions:
+-------+---+-----------+------+----+----------+----------+
|   name|age| department|salary|rank|dense_rank|row_number|
+-------+---+-----------+------+----+----------+----------+
|    Eve| 32|Engineering| 90000|   1|         1|         1|
|Charlie| 35|Engineering| 85000|   2|         2|         2|
|  Alice| 25|Engineering| 75000|   3|         3|         3|
|  Diana| 28|         HR| 60000|   1|         1|         1|
|  Frank| 29|  Marketing| 70000|   1|         1|         1|
|    Bob| 30|  Marketing| 65000|   2|         2|         2|
+-------+---+-----------+------+----+----------+----------+



In [40]:
# ============================================
# 12. User Defined Functions (UDF)
# ============================================

# Define UDF
def categorize_salary(salary):
    """Bucket a salary into a "High", "Medium" or "Low" band.

    Args:
        salary: Numeric salary, or None for a missing value.

    Returns:
        "High" for salaries of 80000 and above, "Medium" for salaries from
        65000 up to (but excluding) 80000, and "Low" for anything smaller.
        None is returned when ``salary`` is None.
    """
    # Comparing None with an int raises TypeError, so a missing salary is
    # passed through as a missing category instead of failing the whole job.
    if salary is None:
        return None
    if salary >= 80000:
        return "High"
    if salary >= 65000:
        return "Medium"
    return "Low"

# Wrap the plain Python function as a Spark UDF with a string return type
categorize_udf = udf(categorize_salary, returnType=StringType())

print("\nüîß UDF example:")
df.withColumn("salary_category", categorize_udf(col("salary"))).show()



üîß UDF example:


                                                                                

+-------+---+-----------+------+---------------+
|   name|age| department|salary|salary_category|
+-------+---+-----------+------+---------------+
|  Alice| 25|Engineering| 75000|         Medium|
|    Bob| 30|  Marketing| 65000|         Medium|
|Charlie| 35|Engineering| 85000|           High|
|  Diana| 28|         HR| 60000|            Low|
|    Eve| 32|Engineering| 90000|           High|
|  Frank| 29|  Marketing| 70000|         Medium|
+-------+---+-----------+------+---------------+



In [41]:
# ============================================
# 13. Reading and Writing Data
# ============================================

# Create output directory.
# Defaults to ./output under the current working directory and can be
# overridden with the PYSPARK_OUTPUT_DIR environment variable, so the notebook
# no longer depends on one user's home directory. abspath keeps the value
# absolute because it is embedded in file:// URIs below.
output_dir = os.path.abspath(os.environ.get("PYSPARK_OUTPUT_DIR", "output"))
os.makedirs(output_dir, exist_ok=True)

# Write to CSV with a header row, replacing any previous output
print("\nüíæ Writing to CSV...")
df.coalesce(1).write.mode("overwrite").csv(
    f"file://{output_dir}/employees_csv",
    header=True,
)
print(f"‚úÖ CSV written to {output_dir}/employees_csv/")



üíæ Writing to CSV...


[Stage 57:>                                                         (0 + 1) / 1]

‚úÖ CSV written to /home/somnath/coding/pyspark/output/employees_csv/


                                                                                

In [42]:
# Load the CSV output back, treating the first row as the header and letting
# Spark infer the column types
print("\nüìñ Reading from CSV...")
df_read = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(f"file://{output_dir}/employees_csv")
)
df_read.show()



üìñ Reading from CSV...
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 25|Engineering| 75000|
|    Bob| 30|  Marketing| 65000|
|Charlie| 35|Engineering| 85000|
|  Diana| 28|         HR| 60000|
|    Eve| 32|Engineering| 90000|
|  Frank| 29|  Marketing| 70000|
+-------+---+-----------+------+



In [43]:
# Persist the DataFrame as Parquet, replacing any previous output
print("\nüíæ Writing to Parquet...")
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .save(f"file://{output_dir}/employees_parquet")
)
print(f"‚úÖ Parquet written to {output_dir}/employees_parquet/")


üíæ Writing to Parquet...
‚úÖ Parquet written to /home/somnath/coding/pyspark/output/employees_parquet/


In [44]:

# Load the Parquet output back into a DataFrame
print("\nüìñ Reading from Parquet...")
df_parquet = spark.read.format("parquet").load(
    f"file://{output_dir}/employees_parquet"
)
df_parquet.show()




üìñ Reading from Parquet...
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|Charlie| 35|Engineering| 85000|
|  Alice| 25|Engineering| 75000|
|  Frank| 29|  Marketing| 70000|
|    Eve| 32|Engineering| 90000|
|    Bob| 30|  Marketing| 65000|
|  Diana| 28|         HR| 60000|
+-------+---+-----------+------+



In [45]:
# Persist the DataFrame as JSON, replacing any previous output
print("\nüíæ Writing to JSON...")
(
    df.coalesce(1)
    .write
    .format("json")
    .mode("overwrite")
    .save(f"file://{output_dir}/employees_json")
)
print(f"‚úÖ JSON written to {output_dir}/employees_json/")


üíæ Writing to JSON...


[Stage 66:>                                                         (0 + 1) / 1]

‚úÖ JSON written to /home/somnath/coding/pyspark/output/employees_json/


                                                                                

In [46]:
# ============================================
# 14. Performance Optimization
# ============================================

# Mark the DataFrame for caching
df.cache()
print("\n‚ö° DataFrame cached")

# Print the physical plan of a simple salary filter
print("\nüìã Execution Plan:")
df.where(df["salary"] > 70000).explain()

# Redistribute the rows over three partitions and report the partition count
df_repart = df.repartition(numPartitions=3)
print(f"Partitions: {df_repart.rdd.getNumPartitions()}")


‚ö° DataFrame cached

üìã Execution Plan:
== Physical Plan ==
*(1) Filter (isnotnull(salary#1424L) AND (salary#1424L > 70000))
+- InMemoryTableScan [name#1421, age#1422L, department#1423, salary#1424L], [isnotnull(salary#1424L), (salary#1424L > 70000)]
      +- InMemoryRelation [name#1421, age#1422L, department#1423, salary#1424L], StorageLevel(disk, memory, deserialized, 1 replicas)
            +- *(1) Scan ExistingRDD[name#1421,age#1422L,department#1423,salary#1424L]


Partitions: 3


In [47]:
# ============================================
# 15. Statistics
# ============================================
# Aggregate functions are referenced through the `F` namespace so that
# `min` / `max` can never be mistaken for the Python builtins of the same
# name; the cell no longer depends on the star import shadowing them.

print("\nüìä Statistics:")
df.describe().show()

# Specific stats: mean, standard deviation, minimum and maximum of salary
df.select(
    F.mean("salary").alias("avg_salary"),
    F.stddev("salary").alias("std_salary"),
    F.min("salary").alias("min_salary"),
    F.max("salary").alias("max_salary"),
).show()



üìä Statistics:
+-------+-----+------------------+-----------+------------------+
|summary| name|               age| department|            salary|
+-------+-----+------------------+-----------+------------------+
|  count|    6|                 6|          6|                 6|
|   mean| NULL|29.833333333333332|       NULL| 74166.66666666667|
| stddev| NULL| 3.430257521916783|       NULL|11583.033569262703|
|    min|Alice|                25|Engineering|             60000|
|    max|Frank|                35|  Marketing|             90000|
+-------+-----+------------------+-----------+------------------+

+-----------------+------------------+----------+----------+
|       avg_salary|        std_salary|min_salary|max_salary|
+-----------------+------------------+----------+----------+
|74166.66666666667|11583.033569262703|     60000|     90000|
+-----------------+------------------+----------+----------+



In [48]:
# ============================================
# 16. Convert to/from Pandas
# ============================================

print("\nüêº Convert to Pandas:")
# Collect the Spark DataFrame into a pandas DataFrame
pandas_df = df.toPandas()
print(f"Type: {type(pandas_df)}")
# Preview the first five rows
print(pandas_df.head(5))


üêº Convert to Pandas:
Type: <class 'pandas.core.frame.DataFrame'>
      name  age   department  salary
0    Alice   25  Engineering   75000
1      Bob   30    Marketing   65000
2  Charlie   35  Engineering   85000
3    Diana   28           HR   60000
4      Eve   32  Engineering   90000


In [49]:
# Build a Spark DataFrame from the pandas DataFrame
print("\nüîÑ Convert back to Spark:")
spark_df = spark.createDataFrame(data=pandas_df)
spark_df.show()


üîÑ Convert back to Spark:
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|  Alice| 25|Engineering| 75000|
|    Bob| 30|  Marketing| 65000|
|Charlie| 35|Engineering| 85000|
|  Diana| 28|         HR| 60000|
|    Eve| 32|Engineering| 90000|
|  Frank| 29|  Marketing| 70000|
+-------+---+-----------+------+



In [50]:
# ============================================
# 17. Additional Useful Operations
# ============================================

# Unique department names
print("\nüîπ Distinct values in department:")
df.select(col("department")).distinct().show()

# Random sample with a fixed seed
print("\nüîπ Sample data (50%):")
df.sample(fraction=0.5, seed=42).show()

# Column names and types
print("\nüîπ Column data types:")
df.printSchema()


üîπ Distinct values in department:
+-----------+
| department|
+-----------+
|Engineering|
|  Marketing|
|         HR|
+-----------+


üîπ Sample data (50%):
+-------+---+-----------+------+
|   name|age| department|salary|
+-------+---+-----------+------+
|Charlie| 35|Engineering| 85000|
+-------+---+-----------+------+


üîπ Column data types:
root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [51]:
# ============================================
# 18. Clean Up
# ============================================

# Closing messages: completion notice, then where the output files live
for closing_line in (
    "\n‚úÖ PySpark practice complete!",
    f"\nüìÅ Output files saved in: {output_dir}",
):
    print(closing_line)


‚úÖ PySpark practice complete!

üìÅ Output files saved in: /home/somnath/coding/pyspark/output


In [52]:
# Stop the Spark session now that the notebook is finished
spark.stop()