In [1]:
# Example: build a small DataFrame and run basic operations on it.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('HandsOnPertemuan3')
    .getOrCreate()
)

# One tuple per employee, in the same order as `columns` below.
data = [
    ('James', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
]
columns = ['EmployeeName', 'Department', 'Salary']

# The list of column names is passed as the schema; value types are inferred.
df = spark.createDataFrame(data, columns)
df.show()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/09 16:34:44 WARN Utils: Your hostname, rayfal, resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/09/09 16:34:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/09 16:34:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|       James|     Sales|  3000|
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
|       Maria|   Finance|  3000|
+------------+----------+------+



In [10]:
# `F` is imported here so this cell also works on a fresh kernel
# (Restart & Run All); it is otherwise only imported in a later cell.
from pyspark.sql import functions as F

# Show only the EmployeeName and Salary columns.
df.select('EmployeeName', 'Salary').show()
# Show the rows whose Salary is strictly greater than 3000.
df.filter(df['Salary'] > 3000).show()
# Group by Department and compute the average Salary of each department.
df.groupBy('Department').avg('Salary').show()
# Group by Department and compute several aggregates in a single pass.
df.groupBy("Department").agg(
    # Average salary per department.
    F.mean("Salary").alias("AvgSalary"),
    # Highest salary per department.
    F.max("Salary").alias("MaxSalary"),
    # Lowest salary per department.
    F.min("Salary").alias("MinSalary"),
    # Sum of all salaries per department.
    F.sum("Salary").alias("TotalSalary"),
    # Number of non-null Salary values per department
    # (equals the employee count when no Salary is missing).
    F.count("Salary").alias("CountSalary")).show()

+------------+------+
|EmployeeName|Salary|
+------------+------+
|       James|  3000|
|     Michael|  4600|
|      Robert|  4100|
|       Maria|  3000|
+------------+------+

+------------+----------+------+
|EmployeeName|Department|Salary|
+------------+----------+------+
|     Michael|     Sales|  4600|
|      Robert|     Sales|  4100|
+------------+----------+------+



                                                                                

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3900.0|
|   Finance|     3000.0|
+----------+-----------+

+----------+---------+---------+---------+-----------+-----------+
|Department|AvgSalary|MaxSalary|MinSalary|TotalSalary|CountSalary|
+----------+---------+---------+---------+-----------+-----------+
|     Sales|   3900.0|     4600|     3000|      11700|          3|
|   Finance|   3000.0|     3000|     3000|       3000|          1|
+----------+---------+---------+---------+-----------+-----------+



In [12]:
# SalaryBonus: 10% of the base salary.
with_bonus = df.withColumn("SalaryBonus", df["Salary"] * 0.1)

# TotalCompensation: base salary plus the bonus computed above.
total_compensation = with_bonus["Salary"] + with_bonus["SalaryBonus"]
df = with_bonus.withColumn("TotalCompensation", total_compensation)
df.show()

+------------+----------+------+-----------+-----------------+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|
+------------+----------+------+-----------+-----------------+
|       James|     Sales|  3000|      300.0|           3300.0|
|     Michael|     Sales|  4600|      460.0|           5060.0|
|      Robert|     Sales|  4100|      410.0|           4510.0|
|       Maria|   Finance|  3000|      300.0|           3300.0|
+------------+----------+------+-----------+-----------------+



In [13]:
# Window functions: rank employees inside their own department.
from pyspark.sql.window import Window
from pyspark.sql import functions as F

# Rows are partitioned by Department and ordered by Salary (ascending by
# default), so rank 1 goes to the lowest salary in each department.
windowSpec = Window.partitionBy('Department').orderBy('Salary')
rank_in_department = F.rank().over(windowSpec)
df.withColumn('Rank', rank_in_department).show()



+------------+----------+------+-----------+-----------------+----+
|EmployeeName|Department|Salary|SalaryBonus|TotalCompensation|Rank|
+------------+----------+------+-----------+-----------------+----+
|       Maria|   Finance|  3000|      300.0|           3300.0|   1|
|       James|     Sales|  3000|      300.0|           3300.0|   1|
|      Robert|     Sales|  4100|      410.0|           4510.0|   2|
|     Michael|     Sales|  4600|      460.0|           5060.0|   3|
+------------+----------+------+-----------+-----------------+----+



                                                                                

In [2]:
import os
from pyspark.sql import SparkSession

# Get a Spark session for this step (getOrCreate reuses an active one if present).
spark = SparkSession.builder.appName("DataLoading").getOrCreate()

# The CSV was uploaded through the Jupyter Notebook home page; only its
# path (relative to the notebook's working directory) is written here.
file_path = "best-selling-manga.csv"

# Load the data: the first row is the header and column types are inferred.
# Any failure while reading or displaying is reported instead of raised.
try:
    csv_reader = spark.read.options(header=True, inferSchema=True)
    df = csv_reader.csv(file_path)

    print("Data berhasil dimuat:")
    df.show(5)

    print("DataFrame Schema:")
    df.printSchema()

except Exception as err:
    print(f"Terjadi kesalahan saat memuat data: {err}")

# Shut the Spark session down once loading has been attempted.
spark.stop()

Data berhasil dimuat:
+--------------------+--------------------+----------+-----------+------------------------+------------+-------------------------------+--------------------------------------+
|        Manga series|           Author(s)| Publisher|Demographic|No. of collected volumes|  Serialized|Approximate sales in million(s)|Average sales per volume in million(s)|
+--------------------+--------------------+----------+-----------+------------------------+------------+-------------------------------+--------------------------------------+
|           One Piece|        Eiichiro Oda|  Shueisha|     Shōnen|                     104|1997–present|                          516.6|                                  4.97|
|            Golgo 13|Takao Saito, Sait...|Shogakukan|     Seinen|                     207|1968–present|                          300.0|                                  1.45|
|Case Closed / Det...|        Gosho Aoyama|Shogakukan|     Shōnen|                     102|1994–pr