In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import random
from datetime import datetime, timedelta

In [5]:
# Initialize Spark session
spark = SparkSession.builder \
    .appName("Module2Exercise") \
    .config("spark.driver.memory", "512m") \
    .config("spark.executor.memory", "512m") \
    .config("spark.sql.shuffle.partitions", "2") \
    .getOrCreate()

In [None]:
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False),
    StructField("age", IntegerType(), False),
    StructField("city", StringType(), False)
])

data = spark.read.schema(schema).option("header", "true").csv("..//..//data/data_2jt_orang_combined.csv")

data.show(5)

+-------+--------------------+---+-----------------+
|     id|                name|age|             city|
+-------+--------------------+---+-----------------+
|1700000|         Jamie White| 60|     West Suzanne|
| 500000|         Pamela Hall| 53|New Josephborough|
|1300000|Mr. Christopher Wood| 43|   Lake Kevinfurt|
| 900000|        Joseph Baker| 50|     Gregoryhaven|
| 200000|   Jacqueline Howard| 39|        New Susan|
+-------+--------------------+---+-----------------+
only showing top 5 rows


In [26]:
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- age_bucket: string (nullable = false)



In [16]:
name_age = data.select("name", "age")
name_age.show(5)

+--------------------+---+
|                name|age|
+--------------------+---+
|         Jamie White| 60|
|         Pamela Hall| 53|
|Mr. Christopher Wood| 43|
|        Joseph Baker| 50|
|   Jacqueline Howard| 39|
+--------------------+---+
only showing top 5 rows


In [17]:
adult = data.filter(col("age") >= 21)
adult.show(5)

+-------+--------------------+---+-----------------+
|     id|                name|age|             city|
+-------+--------------------+---+-----------------+
|1700000|         Jamie White| 60|     West Suzanne|
| 500000|         Pamela Hall| 53|New Josephborough|
|1300000|Mr. Christopher Wood| 43|   Lake Kevinfurt|
| 900000|        Joseph Baker| 50|     Gregoryhaven|
| 200000|   Jacqueline Howard| 39|        New Susan|
+-------+--------------------+---+-----------------+
only showing top 5 rows


In [19]:
data = data.withColumn("age_bucket", when(col("age") >= 21, "Adult").otherwise("Minor"))
data.show(5)

+-------+--------------------+---+-----------------+----------+
|     id|                name|age|             city|age_bucket|
+-------+--------------------+---+-----------------+----------+
|1700000|         Jamie White| 60|     West Suzanne|     Adult|
| 500000|         Pamela Hall| 53|New Josephborough|     Adult|
|1300000|Mr. Christopher Wood| 43|   Lake Kevinfurt|     Adult|
| 900000|        Joseph Baker| 50|     Gregoryhaven|     Adult|
| 200000|   Jacqueline Howard| 39|        New Susan|     Adult|
+-------+--------------------+---+-----------------+----------+
only showing top 5 rows


In [20]:
data.drop("city").show(5)

+-------+--------------------+---+----------+
|     id|                name|age|age_bucket|
+-------+--------------------+---+----------+
|1700000|         Jamie White| 60|     Adult|
| 500000|         Pamela Hall| 53|     Adult|
|1300000|Mr. Christopher Wood| 43|     Adult|
| 900000|        Joseph Baker| 50|     Adult|
| 200000|   Jacqueline Howard| 39|     Adult|
+-------+--------------------+---+----------+
only showing top 5 rows


                                                                                

In [24]:
data.groupBy("city").agg(
    avg("age").alias("average_age"),
    count("*").alias("count_people")
).orderBy("count_people", ascending=False).show(15)

+--------------+------------------+------------+
|          city|       average_age|count_people|
+--------------+------------------+------------+
|  Lake Michael|             41.56|         100|
| North Michael| 41.19387755102041|          98|
|   New Michael| 41.02105263157895|          95|
|  Port Michael| 42.53763440860215|          93|
|    Smithmouth| 42.23809523809524|          84|
|  West Michael| 39.87951807228916|          83|
|  East Michael| 39.71604938271605|          81|
| South Michael|           41.0625|          80|
|  Michaelmouth|40.136986301369866|          73|
| Lake Jennifer| 39.50704225352113|          71|
|    West David|41.357142857142854|          70|
|    Lake David|              42.0|          70|
|     New David| 42.57971014492754|          69|
|   North James| 43.85294117647059|          68|
|North Jennifer| 42.13235294117647|          68|
+--------------+------------------+------------+
only showing top 15 rows


In [27]:
data.rdd.getNumPartitions()

1

In [31]:
# count how many citys are there
data.select("city").distinct().count()

                                                                                

40259