<a href="https://colab.research.google.com/github/Septianadaw/Praktikum-Big-Data/blob/main/2410506013_Pertemuan3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Task 1: build a small DataFrame and try the basic display operations
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook
spark = (
    SparkSession.builder
    .appName("HandsOnPertemuan3_Tugas1")
    .getOrCreate()
)

# Sample records, one tuple per employee: (name, department, salary)
data = [
    ('Septiana', 'Direktur', 15000),
    ('Diyah', 'Manajer', 10000),
    ('Ayu', 'Wakil Direktur', 12000),
    ('Wulandari', 'Sekretaris', 9000),
]

# Column names, in the same order as the tuple fields above
columns = ['EmployeeName', 'Department', 'Salary']

# Build the DataFrame from the records and column names
df = spark.createDataFrame(data, columns)

# Print the whole DataFrame
df.show()

# Basic exploration: print a single column only
df.select(df["EmployeeName"]).show()

+------------+--------------+------+
|EmployeeName|    Department|Salary|
+------------+--------------+------+
|    Septiana|      Direktur| 15000|
|       Diyah|       Manajer| 10000|
|         Ayu|Wakil Direktur| 12000|
|   Wulandari|    Sekretaris|  9000|
+------------+--------------+------+

+------------+
|EmployeeName|
+------------+
|    Septiana|
|       Diyah|
|         Ayu|
|   Wulandari|
+------------+



In [None]:
# Task 2: basic transformation operations

# Project two specific columns
df.select(df["EmployeeName"], df["Salary"]).show()

# Keep only the employees whose Salary is greater than 3000
df.where(df["Salary"] > 3000).show()

# GroupBy + aggregation: group by department once, then aggregate three ways
salary_by_department = df.groupBy("Department")

# Average salary per department
salary_by_department.avg("Salary").show()

# Maximum salary per department
salary_by_department.max("Salary").show()

# Total salary per department
salary_by_department.sum("Salary").show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|    Septiana| 15000|
|       Diyah| 10000|
|         Ayu| 12000|
|   Wulandari|  9000|
+------------+------+

+------------+--------------+------+
|EmployeeName|    Department|Salary|
+------------+--------------+------+
|    Septiana|      Direktur| 15000|
|       Diyah|       Manajer| 10000|
|         Ayu|Wakil Direktur| 12000|
|   Wulandari|    Sekretaris|  9000|
+------------+--------------+------+

+--------------+-----------+
|    Department|avg(Salary)|
+--------------+-----------+
|      Direktur|    15000.0|
|       Manajer|    10000.0|
|Wakil Direktur|    12000.0|
|    Sekretaris|     9000.0|
+--------------+-----------+

+--------------+-----------+
|    Department|max(Salary)|
+--------------+-----------+
|      Direktur|      15000|
|       Manajer|      10000|
|Wakil Direktur|      12000|
|    Sekretaris|       9000|
+--------------+-----------+

+--------------+-----------+
|    Department|sum(Salary)|
+-

In [None]:
# Task 3: derive new columns from existing ones

# Bonus is a fixed fraction (10%) of the salary
BONUS_RATE = 0.1

# Add SalaryBonus = Salary * 10%
with_bonus = df.withColumn("SalaryBonus", df["Salary"] * BONUS_RATE)

# Add TotalCompensation = Salary + SalaryBonus; the result is rebound to `df`
# so the cells that follow see both new columns
df = with_bonus.withColumn(
    "TotalCompensation",
    with_bonus["Salary"] + with_bonus["SalaryBonus"],
)

# Print the resulting DataFrame
df.show()

+------------+--------------+------+-----------+-----------------+
|EmployeeName|    Department|Salary|SalaryBonus|TotalCompensation|
+------------+--------------+------+-----------+-----------------+
|    Septiana|      Direktur| 15000|     1500.0|          16500.0|
|       Diyah|       Manajer| 10000|     1000.0|          11000.0|
|         Ayu|Wakil Direktur| 12000|     1200.0|          13200.0|
|   Wulandari|    Sekretaris|  9000|      900.0|           9900.0|
+------------+--------------+------+-----------+-----------------+



In [None]:
# Task 4: window functions
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Partition the rows by Department and order each partition by Salary.
# The ordering is ascending, so the lowest salary in a department gets rank 1.
windowSpec = Window.partitionBy("Department").orderBy("Salary")

# Frame for the running total: from the first row of the partition
# up to and including the current row
running_frame = windowSpec.rowsBetween(
    Window.unboundedPreceding,
    Window.currentRow,
)

# Add a Rank column: salary rank of each employee within their department
df_rank = df.withColumn("Rank", F.rank().over(windowSpec))

# Add a RunningTotal column: cumulative Salary within each department
df_running = df_rank.withColumn(
    "RunningTotal",
    F.sum("Salary").over(running_frame),
)

# Print the final result
df_running.show()

+------------+--------------+------+-----------+-----------------+----+------------+
|EmployeeName|    Department|Salary|SalaryBonus|TotalCompensation|Rank|RunningTotal|
+------------+--------------+------+-----------+-----------------+----+------------+
|    Septiana|      Direktur| 15000|     1500.0|          16500.0|   1|       15000|
|       Diyah|       Manajer| 10000|     1000.0|          11000.0|   1|       10000|
|   Wulandari|    Sekretaris|  9000|      900.0|           9900.0|   1|        9000|
|         Ayu|Wakil Direktur| 12000|     1200.0|          13200.0|   1|       12000|
+------------+--------------+------+-----------+-----------------+----+------------+



In [2]:
# Task 5 (part 1): download the dataset from Kaggle
import kagglehub

# Kaggle dataset handle passed to kagglehub
dataset_handle = "pavansubhasht/ibm-hr-analytics-attrition-dataset"

# Download the latest version; the call returns the location of the dataset files
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'ibm-hr-analytics-attrition-dataset' dataset.
Path to dataset files: /kaggle/input/ibm-hr-analytics-attrition-dataset


In [4]:
# Task 5 (part 2): explore the IBM HR Analytics attrition dataset with PySpark
# Imports
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Get the Spark session.
# NOTE: getOrCreate() returns the already-active session when one exists in this
# kernel, in which case the appName given here does not rename it.
spark = SparkSession.builder.appName("Tugas5_BigData").getOrCreate()

# Load the Kaggle dataset (CSV) with a header row and inferred column types
path = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
df = spark.read.csv(path, header=True, inferSchema=True)

# Initial exploration: size, schema and the first few rows
print("Jumlah baris:", df.count())
print("Jumlah kolom:", len(df.columns))
df.printSchema()
df.show(5)

# Basic operations and data manipulation
# Average monthly income over all employees
df.select(F.avg("MonthlyIncome").alias("RataRataGaji")).show()

# Highest and lowest monthly income
df.select(
    F.max("MonthlyIncome").alias("GajiTertinggi"),
    F.min("MonthlyIncome").alias("GajiTerendah")
).show()

# Total monthly income per Department.
# The alias is applied to the aggregate column inside agg(); calling .alias() on
# the resulting DataFrame would only name the DataFrame and leave the column
# header as "sum(MonthlyIncome)".
df.groupBy("Department").agg(F.sum("MonthlyIncome").alias("TotalGaji")).show()

# Average monthly income per JobRole (column aliased the same way as above)
df.groupBy("JobRole").agg(F.avg("MonthlyIncome").alias("RataGaji")).show()

# Window function: rank employees by income within each Department,
# highest income first (rank 1 = top earner of the department)
windowSpec = Window.partitionBy("Department").orderBy(F.desc("MonthlyIncome"))
df_rank = df.withColumn("Rank", F.rank().over(windowSpec))

df_rank.select("EmployeeNumber", "Department", "MonthlyIncome", "Rank").show(10)

Jumlah baris: 1470
Jumlah kolom: 35
root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nul