In [None]:
!pip install -q kaggle

In [None]:
!cp kaggle.json ~/.kaggle

In [None]:
# Restrict kaggle.json to owner read/write only (mode 600).
# NOTE(review): presumably this file holds the Kaggle API token — confirm.
!chmod 600 ~/.kaggle/kaggle.json

In [14]:
# Download the healthcare employee-attrition dataset archive with the Kaggle CLI.
# The recorded output shows the CLI skips the download when a more recent local
# copy exists (pass --force to re-download).
!kaggle datasets download -d 'jpmiller/employee-attrition-for-healthcare'

Dataset URL: https://www.kaggle.com/datasets/jpmiller/employee-attrition-for-healthcare
License(s): CC0-1.0
employee-attrition-for-healthcare.zip: Skipping, found more recently modified local copy (use --force to force download)


In [15]:
import zipfile

In [16]:
# Extract every member of the downloaded archive into the current working
# directory. The context manager closes the archive even if extraction raises,
# so the file handle is never leaked.
with zipfile.ZipFile('employee-attrition-for-healthcare.zip', 'r') as dataset_zip:
    dataset_zip.extractall()

In [17]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the rest of the notebook
# (getOrCreate returns an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("WatsonHealthcareAnalysis")
    .getOrCreate()
)

In [18]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/watson_healthcare_modified.csv")
)

# Inspect the inferred schema and preview the first five rows
df.printSchema()
df.show(n=5)

root
 |-- EmployeeID: integer (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string (nul

In [21]:
# Dataset dimensions: number of rows and number of columns
print("Jumlah baris:", df.count())
print("Jumlah kolom:", len(df.columns))

# Descriptive statistics
df.describe().show()

# Preview a few selected columns (ten rows)
df.select(["EmployeeID", "Age", "Department"]).show(n=10)

# Employees older than 40 (ten rows)
df.where(df.Age > 40).show(n=10)

# Aggregation: number of employees per Department
df.groupby("Department").count().show()

Jumlah baris: 1676
Jumlah kolom: 35
+-------+------------------+------------------+---------+--------------+------------------+----------+-----------------+------------------+----------------+-------------+-----------------------+------+-----------------+------------------+------------------+---------+------------------+-------------+-----------------+-----------------+------------------+------+--------+------------------+------------------+------------------------+-------------+------------------+-----------------+---------------------+------------------+-----------------+------------------+-----------------------+--------------------+
|summary|        EmployeeID|               Age|Attrition|BusinessTravel|         DailyRate|Department| DistanceFromHome|         Education|  EducationField|EmployeeCount|EnvironmentSatisfaction|Gender|       HourlyRate|    JobInvolvement|          JobLevel|  JobRole|   JobSatisfaction|MaritalStatus|    MonthlyIncome|      MonthlyRate|NumCompaniesWorked|

In [23]:
from pyspark.sql.functions import when

# Bucket employees by age into "Child" (< 18), "Adult" (18-59) and "Senior" (>= 60).
# Branches are evaluated in order, so the first two only need an upper bound.
# The last branch is an explicit condition rather than .otherwise(...): Age is
# nullable, and a null Age fails every comparison, so it now yields a null
# AgeCategory instead of being mislabelled "Senior".
df = df.withColumn(
    "AgeCategory",
    when(df.Age < 18, "Child")
    .when(df.Age < 60, "Adult")
    .when(df.Age >= 60, "Senior"),
)

# Spot-check the new column next to the source Age values
df.select("EmployeeID", "Age", "AgeCategory").show(10)

+----------+---+-----------+
|EmployeeID|Age|AgeCategory|
+----------+---+-----------+
|   1313919| 41|      Adult|
|   1200302| 49|      Adult|
|   1060315| 37|      Adult|
|   1272912| 33|      Adult|
|   1414939| 27|      Adult|
|   1633361| 32|      Adult|
|   1329390| 59|      Adult|
|   1699288| 30|      Adult|
|   1469740| 38|      Adult|
|   1101291| 36|      Adult|
+----------+---+-----------+
only showing top 10 rows



In [24]:
# Save the DataFrame as CSV with a header row, replacing any existing output
# at the target path.
(
    df.write
    .mode("overwrite")
    .option("header", True)
    .csv("/content/output_healthcare")
)