In [11]:
# Example: build a small DataFrame and display it.
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan3')
    .getOrCreate()
)

# One tuple per employee: (name, department, salary).
data = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]

# Column labels, matched positionally to the tuple fields above.
# The spelling 'Departement' is kept as-is: other cells refer to the
# column by this exact name.
columns = ['EmployeeName', 'Departement', 'Salary']

# Only names are supplied as the schema; Spark infers the column types
# from the Python values.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------------+-----------+------+
|EmployeeName|Departement|Salary|
+------------+-----------+------+
|       James|      Sales|  3000|
|     Michael|      Sales|  4600|
|      Robert|      Sales|  4100|
|       Maria|    Finance|  3000|
+------------+-----------+------+



In [16]:
# Example DataFrame transformations: projection, row filter, grouped average.
#
# NOTE(review): the recorded output of the filter step below contains
# SalaryBonus and TotalCompensation columns that this cell never creates.
# The execution counts (this cell is In [16], the cell adding those columns
# is In [13]) show it was last run after `df` had been rebound. Restart the
# kernel and run all cells in order to refresh the stored output.

# Projection: keep only the name and salary columns.
df.select(['EmployeeName', 'Salary']).show()

# Row filter: employees whose salary is strictly greater than 3000.
df.where(df.Salary > 3000).show()

# Grouped aggregate: mean salary per department.
(
    df.groupBy('Departement')
    .avg('Salary')
    .show()
)

+------------+------+
|EmployeeName|Salary|
+------------+------+
|       James|  3000|
|     Michael|  4600|
|      Robert|  4100|
|       Maria|  3000|
+------------+------+

+------------+-----------+------+-----------+-----------------+
|EmployeeName|Departement|Salary|SalaryBonus|TotalCompensation|
+------------+-----------+------+-----------+-----------------+
|     Michael|      Sales|  4600|      460.0|           5060.0|
|      Robert|      Sales|  4100|      410.0|           4510.0|
+------------+-----------+------+-----------+-----------------+

+-----------+-----------+
|Departement|avg(Salary)|
+-----------+-----------+
|      Sales|     3900.0|
|    Finance|     3000.0|
+-----------+-----------+



In [13]:
# Example: derive new columns from an existing numeric column.
from pyspark.sql import functions as F

# SalaryBonus is 10% of Salary; TotalCompensation is Salary plus that bonus.
# The second withColumn can reference SalaryBonus because it is applied to
# the frame returned by the first one.
#
# NOTE(review): this rebinds `df`, so every cell executed after this one --
# including earlier cells that are re-run -- sees the two extra columns.
df = (
    df
    .withColumn("SalaryBonus", F.col("Salary") * F.lit(0.1))
    .withColumn("TotalCompensation", F.col("Salary") + F.col("SalaryBonus"))
)

df.show()


+------------+-----------+------+-----------+-----------------+
|EmployeeName|Departement|Salary|SalaryBonus|TotalCompensation|
+------------+-----------+------+-----------+-----------------+
|       James|      Sales|  3000|      300.0|           3300.0|
|     Michael|      Sales|  4600|      460.0|           5060.0|
|      Robert|      Sales|  4100|      410.0|           4510.0|
|       Maria|    Finance|  3000|      300.0|           3300.0|
+------------+-----------+------+-----------+-----------------+



In [15]:
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Example: rank employees inside each department with a window function.
# Rows are partitioned by department and ordered by salary (ascending, the
# default), so rank 1 goes to the lowest salary in each department.
windowSpec = (
    Window
    .partitionBy('Departement')
    .orderBy('Salary')
)

# The Rank column is only added to the frame being displayed; `df` itself
# is not rebound here.
df.withColumn(
    'Rank',
    F.rank().over(windowSpec),
).show()

+------------+-----------+------+-----------+-----------------+----+
|EmployeeName|Departement|Salary|SalaryBonus|TotalCompensation|Rank|
+------------+-----------+------+-----------+-----------------+----+
|       Maria|    Finance|  3000|      300.0|           3300.0|   1|
|       James|      Sales|  3000|      300.0|           3300.0|   1|
|      Robert|      Sales|  4100|      410.0|           4510.0|   2|
|     Michael|      Sales|  4600|      460.0|           5060.0|   3|
+------------+-----------+------+-----------+-----------------+----+

