In [2]:
from pyspark.sql import SparkSession as sc
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

sc = sc.builder.getOrCreate()

In [3]:
from pyspark.sql.functions import explode, split, to_date, date_format, col
from pyspark.sql import functions as F

# path = r'file:///E:\IDE_workspace\VSCode\recruitment_project\Data\computer-information-technology\silver_layer.csv'
hdfs_path = 'hdfs://192.168.48.136:9000/recruitment/silver/'
df = sc.read.options(header=True, delimiter=',').csv(hdfs_path)

df = df.drop('year', 'month', 'day', 'source', 'jobCategory')

df = df.withColumn('Skills', F.from_json('Skills', ArrayType(StringType())))
df = df.withColumn('Requirement', F.from_json('Requirement', ArrayType(StringType())))
df = df.withColumn('Category', F.from_json('Category', ArrayType(StringType())))

df = df.withColumn('Created_time', to_date(col('Created_time'), 'YYYY-MM-dd HH:mm:ss'))
df = df.withColumn('Last_updated', to_date(col('Last_updated'), 'YYYY-MM-dd HH:mm:ss'))
df.schema

StructType([StructField('Title', StringType(), True), StructField('Last_updated', DateType(), True), StructField('Created_time', DateType(), True), StructField('Skills', ArrayType(StringType(), True), True), StructField('Requirement', ArrayType(StringType(), True), True), StructField('Hirer', StringType(), True), StructField('Company_link', StringType(), True), StructField('Company_location', StringType(), True), StructField('Category', ArrayType(StringType(), True), True), StructField('jobDecription', StringType(), True), StructField('min_salary', StringType(), True), StructField('max_salary', StringType(), True), StructField('salary_range', StringType(), True), StructField('location', StringType(), True), StructField('work_type', StringType(), True), StructField('work_time_type', StringType(), True), StructField('company_name', StringType(), True), StructField('source', StringType(), True), StructField('jobCategory', StringType(), True), StructField('year', IntegerType(), True), Stru

# Question:
What will we do with those data:

- Identify the skills most frequently required by companies.
- Determine the average salary for each skill.
- Locate the areas with the highest job concentration.
- What companies have the most job

In [None]:
# Explode 'Skills' column and select all columns from original DataFrame + exploded 'Skills'
exploded_skill = df.withColumn("Exploded_Skill", explode(df['Skills']))

exploded_skill = exploded_skill.drop('Skills')

exploded_skill.show()

In [13]:
# Identify the skills with the highest count
exploded_skill.groupBy('Exploded_Skill') \
    .agg(
        F.round(F.avg((F.col('min_salary') + F.col('max_salary')/ 2)), 2).alias('avg_salary'), 
        F.count('Title').alias('count')) \
    .orderBy('avg_salary', ascending=False).show(truncate=False)

+-----------------------------+----------+-----+
|Exploded_Skill               |avg_salary|count|
+-----------------------------+----------+-----+
|System Analysis              |1.76E8    |2    |
|Hand-ons Task                |1.05E8    |1    |
|Machine Translation          |1.0E8     |1    |
|Linguistics                  |1.0E8     |1    |
|Sentiment Analysis           |1.0E8     |1    |
|Large Language Models (Llm)  |1.0E8     |1    |
|Cto                          |1.0E8     |1    |
|Cognitive Science            |1.0E8     |1    |
|Python (programming Language)|1.0E8     |1    |
|Spring Boot                  |9.95E7    |1    |
|Blokchain                    |7.65E7    |1    |
|Head Of Delivery             |6.5E7     |1    |
|Business Management          |6.5E7     |1    |
|American                     |6.5E7     |1    |
|Information Security         |5.6E7     |2    |
|Microsoft System             |5.2E7     |1    |
|Solution Architecture        |5.0E7     |2    |
|Smart Contract     

In [15]:
exploded_skill.filter(exploded_skill.Exploded_Skill == 'Cto').show()

+--------------------+-------------------+-------------------+--------------------+-----------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+--------------+
|               Title|       Last_updated|       Created_time|         Requirement|      Hirer|        Company_link|    Company_location|            Category|min_salary|max_salary|salary_range|            location|      work_type|   work_time_type|        company_name|Exploded_Skill|
+--------------------+-------------------+-------------------+--------------------+-----------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+--------------+
|Giám Đốc Công Ngh...|2024-08-06 13:38:46|2024-09-01 13:38:46|[5 - 10 năm kinh ...|Phương Đoàn|https://glints.co...|Tòa CIC, 219 Trun...|[Compute

In [17]:
# Explode 'Skills' column and select all columns from original DataFrame + exploded 'Skills'
exploded_requirement = df.withColumn("Requirement", explode(df['Requirement']))

exploded_requirement.show()

+--------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+
|               Title|       Last_updated|       Created_time|              Skills|         Requirement|             Hirer|        Company_link|    Company_location|            Category|min_salary|max_salary|salary_range|            location|      work_type|      work_time_type|        company_name|
+--------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+
|THỰC TẬP SINH .NE...|2024-08-02 13:47:40|2024-09-01 13:47:40|[E-commerce, Info...|   Tối thiểu C

In [25]:
exploded_requirement.groupBy('Requirement') \
    .agg(
        F.round(F.avg((F.col('min_salary') + F.col('max_salary')/ 2)), 2).alias('avg_salary'),
        F.count('Title').alias('count')) \
    .orderBy('avg_salary', ascending=False).show(truncate=False)

+-----------------------------+-------------+-----+
|Requirement                  |avg_salary   |count|
+-----------------------------+-------------+-----+
|30-55 tuổi                   |9.95E7       |1    |
|25-34 tuổi                   |6.025E7      |2    |
|28-49 tuổi                   |5.75E7       |1    |
|25-40 tuổi                   |2.816666667E7|3    |
|5 - 10 năm kinh nghiệm       |2.790748889E7|27   |
|20-34 tuổi                   |2.75E7       |1    |
|Tối thiểu Tiểu Học           |2.75E7       |1    |
|25-45 tuổi                   |2.35E7       |1    |
|3 - 5 năm kinh nghiệm        |2.322920991E7|96   |
|Tối thiểu Cao Đẳng           |2.23170078E7 |66   |
|1 - 3 năm kinh nghiệm        |2.058617987E7|159  |
|Tối thiểu Cử Nhân            |2.005261566E7|234  |
|24-35 tuổi                   |2.0E7        |1    |
|Giới tính Nam                |1.832692308E7|26   |
|23-30 tuổi                   |1.75E7       |1    |
|Tối thiểu Bằng Liên Kết      |1.675005556E7|18   |
|23-35 tuổi 

In [39]:
from pyspark.sql.functions import array_contains, col

df.where(array_contains(col('Requirement'), '30-55 tuổi')).show()

+--------------------+-------------------+-------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+
|               Title|       Last_updated|       Created_time|              Skills|         Requirement|   Hirer|        Company_link|    Company_location|            Category|min_salary|max_salary|salary_range|            location|      work_type|   work_time_type|        company_name|
+--------------------+-------------------+-------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+
|Full Stack Developer|2024-08-22 13:08:38|2024-09-01 13:08:38|[SQL, Spring Boot...|[5 - 10 năm kinh ...|HR JuYou|https://glints.co...|Lầ

In [28]:
df.filter(df.Title == 'Full Stack Developer').show()

+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+
|               Title|       Last_updated|       Created_time|              Skills|         Requirement|               Hirer|        Company_link|    Company_location|            Category|min_salary|max_salary|salary_range|            location|      work_type|   work_time_type|        company_name|
+--------------------+-------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+-----------------+--------------------+
|Full Stack Developer|2024-08-20 13:15:16|2024-09-01 13:15:16|[Algorithms, Inte...|[Dưới một năm kin

In [40]:
df.schema

StructType([StructField('Title', StringType(), True), StructField('Last_updated', StringType(), True), StructField('Created_time', StringType(), True), StructField('Skills', ArrayType(StringType(), True), True), StructField('Requirement', ArrayType(StringType(), True), True), StructField('Hirer', StringType(), True), StructField('Company_link', StringType(), True), StructField('Company_location', StringType(), True), StructField('Category', ArrayType(StringType(), True), True), StructField('min_salary', StringType(), True), StructField('max_salary', StringType(), True), StructField('salary_range', StringType(), True), StructField('location', StringType(), True), StructField('work_type', StringType(), True), StructField('work_time_type', StringType(), True), StructField('company_name', StringType(), True)])

In [None]:
df.