In [11]:
# Example: build a small DataFrame from in-memory rows and display it.
from pyspark.sql import SparkSession
from pyspark.sql import functions

# Obtain the Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan6')
    .getOrCreate()
)

# Column names, in the same order as the fields of each row tuple below.
columns = ['EmployeeName', 'Department', 'Salary']

# One tuple per employee: (name, department, salary).
data = [
    ('Restu', 'Hengker', 3000),
    ('Rio', 'Engginer', 4600),
    ('Zidan', 'Develop', 4100),
    ('Azil', 'Mancing', 3000),
]

# The second positional argument of createDataFrame is the schema.
df = spark.createDataFrame(data, columns)
df.show()

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       Restu|   Hengker|  3000|
|         Rio|  Engginer|  4600|
|       Zidan|   Develop|  4100|
|        Azil|   Mancing|  3000|
+------------+----------+------+



In [12]:
# Example DataFrame transformations; each statement prints its own table.

# Projection: keep only the employee name and salary columns.
df.select(df['EmployeeName'], df['Salary']).show()

# Row filter: employees whose salary is strictly greater than 3000.
df.where(df['Salary'] > 3000).show()

# Grouped aggregate: average salary per department.
df.groupBy(df['Department']).avg('Salary').show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|       Restu|  3000|
|         Rio|  4600|
|       Zidan|  4100|
|        Azil|  3000|
+------------+------+

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|         Rio|  Engginer|  4600|
|       Zidan|   Develop|  4100|
+------------+----------+------+

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|   Hengker|     3000.0|
|  Engginer|     4600.0|
|   Develop|     4100.0|
|   Mancing|     3000.0|
+----------+-----------+



In [13]:
# Example: derive new numeric columns from an existing one.
# SalaryBonus is a fixed fraction of Salary; TotalCompensation is Salary plus
# that bonus. `df` is rebound to the widened DataFrame.
BONUS_RATE = 0.1

salary = df['Salary']
bonus = salary * BONUS_RATE

df = (
    df.withColumn('SalaryBonus', bonus)
      .withColumn('TotalCompensation', salary + bonus)
)
df.show()


+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|       Restu|   Hengker|  3000|      300.0|           3300.0|
|         Rio|  Engginer|  4600|      460.0|           5060.0|
|       Zidan|   Develop|  4100|      410.0|           4510.0|
|        Azil|   Mancing|  3000|      300.0|           3300.0|
+------------+----------+------+-----------+-----------------+



In [14]:
# Example: window functions - rank employees by salary within each department.
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Rows are partitioned by department and ordered by ascending salary, so
# rank 1 goes to the lowest salary in each department.
windowSpec = (
    Window
    .partitionBy('Department')
    .orderBy(F.col('Salary').asc())
)

rank_in_department = F.rank().over(windowSpec)
df.withColumn('Rank', rank_in_department).show()

+------------+----------+------+-----------+-----------------+----+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|Rank|
+------------+----------+------+-----------+-----------------+----+
|       Zidan|   Develop|  4100|      410.0|           4510.0|   1|
|         Rio|  Engginer|  4600|      460.0|           5060.0|   1|
|       Restu|   Hengker|  3000|      300.0|           3300.0|   1|
|        Azil|   Mancing|  3000|      300.0|           3300.0|   1|
+------------+----------+------+-----------+-----------------+----+



In [15]:
from math import comb


def binomial_pmf(n, k, p):
    """Return P(X = k) for X ~ Binomial(n, p).

    Parameters
    ----------
    n : int
        Number of independent trials; must be non-negative.
    k : int
        Number of successes whose probability is requested.
    p : float
        Probability of success on a single trial; must lie in [0, 1].

    Returns
    -------
    float
        C(n, k) * p**k * (1 - p)**(n - k), or 0.0 when k is outside 0..n.

    Raises
    ------
    ValueError
        If n is negative or p is outside [0, 1].
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if not 0 <= p <= 1:
        raise ValueError("p must be in the interval [0, 1]")
    # Outside the support the probability is zero; returning early also avoids
    # a negative exponent on (1 - p) and comb() rejecting a negative k.
    if k < 0 or k > n:
        return 0.0
    q = 1 - p
    return comb(n, k) * (p**k) * (q**(n - k))


# Probability of exactly 86 successes in 100 trials with success rate 0.9.
n = 100
k = 86
p = 0.9
q = 1 - p  # failure probability, kept as a notebook-level name

P_X_86 = binomial_pmf(n, k, p)
P_X_86


0.05130382733444938