In [6]:
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql import functions
from pyspark.sql.functions import udf,expr
from pyspark.sql.types import StringType
import util.config as conf
from util.logger import Log4j

In [7]:
# Build (or reuse) the Spark session from the project-level configuration.
spark_conf = conf.get_spark_conf()
spark = (
    SparkSession.builder
    .config(conf=spark_conf)
    .getOrCreate()
)
log = Log4j(spark)

# Read the survey CSV, treating the first row as a header and letting Spark
# infer the column types.
survey_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
)
survey_df = survey_reader.csv("/data/udf/survey.csv")


24/10/30 14:45:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


util_dir: /spark/07-udf/util


In [8]:
# Print a preview of the raw survey rows to inspect the columns that were loaded.
survey_df.show()

+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+----------------+----------+----------+------------------+-------------------------+-----------------------+------------+------------+-----------------------+---------------------+------------------+---------------+--------------------+
|          Timestamp|Age|Gender|       Country|state|self_employed|family_history|treatment|work_interfere|  no_employees|remote_work|tech_company|  benefits|care_options|wellness_program| seek_help| anonymity|             leave|mental_health_consequence|phys_health_consequence|   coworkers|  supervisor|mental_health_interview|phys_health_interview|mental_vs_physical|obs_consequence|            comments|
+-------------------+---+------+--------------+-----+-------------+--------------+---------+--------------+--------------+-----------+------------+----------+------------+-------------

In [9]:
def parse_gender(gender):
    """Normalize a free-text gender answer to a fixed label.

    Args:
        gender: The raw answer as a string, or None when the value is missing
            (Spark hands SQL NULL to a Python UDF as None).

    Returns:
        "Male" if the answer is exactly "m", "male" or "man" (case-insensitive),
        "Female" if it is exactly "f", "female" or "woman" (case-insensitive),
        otherwise "Unknown" — including when the answer is None.
    """
    # Guard against missing values: calling .lower() on None would raise
    # AttributeError and fail the whole Spark job.
    if gender is None:
        return "Unknown"

    male_pattern = r"^m$|^male$|^man$"
    female_pattern = r"^f$|^female$|^woman$"

    # Lowercase once so both pattern checks are case-insensitive.
    normalized = gender.lower()
    if re.search(male_pattern, normalized):
        return "Male"
    if re.search(female_pattern, normalized):
        return "Female"
    return "Unknown"

In [14]:
# Wrap the plain Python function as a Spark UDF that returns strings.
parse_gender_udf = udf(parse_gender, StringType())
log.info("Catalog Entry: ")

# Replace Gender with its normalized label, then keep only the columns
# used by the cells that follow.
employee_df = (
    survey_df
    .withColumn("Gender", parse_gender_udf("Gender"))
    .select("Age", "Gender", "Country", "state", "no_employees")
)

In [16]:
# Show one example row per distinct company-size bucket.
# NOTE(review): which row is kept for each bucket is presumably not guaranteed
# to be stable between runs — confirm before relying on the exact rows shown.
employee_df.dropDuplicates(["no_employees"]).show()

+---+------+--------------+-----+--------------+
|Age|Gender|       Country|state|  no_employees|
+---+------+--------------+-----+--------------+
| 35|Female| United States|   MI|           1-5|
| 31|  Male| United States|   TX|       100-500|
| 31|  Male|United Kingdom|   NA|        26-100|
| 36|  Male| United States|   CT|      500-1000|
| 37|Female| United States|   IL|          6-25|
| 44|  Male| United States|   IN|More than 1000|
+---+------+--------------+-----+--------------+



In [24]:
from pyspark.sql.types import BooleanType, StringType
from pyspark.sql.functions import col
def check_employees(no_employees):
    """Tell whether a company-size bucket is one of the two largest.

    Args:
        no_employees: The company-size bucket label from the survey.

    Returns:
        True if the label is "500-1000" or "More than 1000", otherwise False.
    """
    large_company_sizes = ("500-1000", "More than 1000")
    return no_employees in large_company_sizes

# Declare the UDF with a BooleanType return so it can be used as a filter predicate.
check_employees_udf = udf(f=check_employees, returnType=BooleanType())

In [25]:
# Keep only the rows whose company-size bucket passes check_employees.
is_large_company = check_employees_udf(col('no_employees'))
employee_df.filter(is_large_company).show()

+---+-------+--------------+-----+--------------+
|Age| Gender|       Country|state|  no_employees|
+---+-------+--------------+-----+--------------+
| 44|   Male| United States|   IN|More than 1000|
| 36|   Male| United States|   CT|      500-1000|
| 41|   Male| United States|   IA|More than 1000|
| 35|   Male| United States|   TN|More than 1000|
| 30|   Male|United Kingdom|   NA|      500-1000|
| 35|   Male| United States|   TX|More than 1000|
| 35|   Male| United States|   MI|More than 1000|
| 44|   Male| United States|   IA|More than 1000|
| 40|   Male| United States|   CA|More than 1000|
| 23|Unknown| United States|   MA|More than 1000|
| 33|   Male| United States|   CA|More than 1000|
| 25|   Male| United States|   WA|More than 1000|
| 33|   Male| United States|   CA|More than 1000|
| 34| Female| United States|   OR|      500-1000|
| 32|   Male| United States|   IL|      500-1000|
| 31|   Male| United States|   NY|      500-1000|
| 29|   Male| United States|   NY|More than 1000|


In [26]:
# Stop the Spark session now that the notebook has finished its work.
spark.stop()