In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, explode, avg, to_timestamp, udf
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql import functions as F

# Build the Hive-enabled Spark session used by the rest of the notebook.
# Note that `sc` is bound to a SparkSession here, not a SparkContext.
sc = (
    SparkSession.builder
    .appName("recruitmentProjectGoldLayer")
    # Hive metastore to connect to, given as a thrift URI.
    .config('spark.hadoop.hive.metastore.uris', "thrift://wakuwa:9083")
    .enableHiveSupport()
    # Set Hive's dynamic-partition mode to "nonstrict".
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    # getOrCreate() hands back an existing session when one is available.
    .getOrCreate()
)

In [3]:
# Run a SHOW DATABASES statement through the session and print the result table.
sc.sql('SHOW DATABASES').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [2]:
# Input CSV location (local file URI) and the HDFS URI kept alongside it.
base = 'file:///E:\\IDE_workspace\\VSCode\\recruitment_project\\Data\\'
path = base + 'computer-information-technology\\silver_layer.csv'
hdfs_path = 'hdfs://192.168.48.136:9000/recruitment/silver/'

# Read the CSV with a header row, comma delimiter and multi-line records,
# then discard the columns that are not carried forward.
df = (
    sc.read
    .option('header', True)
    .option('delimiter', ',')
    .option('multiline', True)
    .csv(path)
    .drop('year', 'month', 'day', 'source', 'jobCategory', 'job_description')
)

# Salary bounds are read as strings; cast them to integers.
for salary_column in ('max_salary', 'min_salary'):
    df = df.withColumn(salary_column, col(salary_column).cast("integer"))

# These columns hold JSON text; parse each one into an array of strings.
for json_column in ('Skills', 'Requirement', 'Category'):
    df = df.withColumn(json_column, F.from_json(json_column, ArrayType(StringType())))

# Parse the timestamp columns using the 'yyyy-MM-dd HH:mm:ss' pattern.
for time_column in ('Created_time', 'Last_updated'):
    df = df.withColumn(time_column, to_timestamp(col(time_column), 'yyyy-MM-dd HH:mm:ss'))

# Last expression of the cell: display the resulting schema.
df.schema

StructType([StructField('Title', StringType(), True), StructField('Last_updated', TimestampType(), True), StructField('Created_time', TimestampType(), True), StructField('Skills', ArrayType(StringType(), True), True), StructField('Requirement', ArrayType(StringType(), True), True), StructField('Hirer', StringType(), True), StructField('Company_link', StringType(), True), StructField('Company_location', StringType(), True), StructField('Category', ArrayType(StringType(), True), True), StructField('min_salary', IntegerType(), True), StructField('max_salary', IntegerType(), True), StructField('salary_range', StringType(), True), StructField('location', StringType(), True), StructField('work_type', StringType(), True), StructField('work_time_type', StringType(), True), StructField('company_name', StringType(), True)])

In [3]:
# Print the first 5 rows of df as a text table.
df.show(5)

+--------------------+-------------------+-------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+
|               Title|       Last_updated|       Created_time|              Skills|         Requirement|    Hirer|        Company_link|    Company_location|            Category|min_salary|max_salary|salary_range|            location|      work_type|      work_time_type|        company_name|
+--------------------+-------------------+-------------------+--------------------+--------------------+---------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+
|       Data Engineer|2024-08-22 23:04:10|2024-09-09 23:04:10|[Data Model, SQL,...|[3 - 5 năm kinh n...|  Ha Tran|https://gl

# Question:
What will we do with this data?

- Identify the skills most frequently required by companies.
- Determine the average salary for each skill.
- Locate the areas with the highest job concentration.
- Identify the companies with the most job postings.

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, explode, avg, to_timestamp, udf
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.sql import functions as F

from datetime import datetime

# HDFS URI under recruitment/silver/ (defined here, not used within this cell).
hdfs_path = 'hdfs://192.168.48.136:9000/recruitment/silver/'

# Take ONE clock reading and unpack it. Calling datetime.now() separately for
# each field could straddle midnight and yield a year/month/day combination
# that does not correspond to any single date.
year, month, day = datetime.now().timetuple()[:3]

if __name__ == '__main__':
    # `sc` is a SparkSession with Hive support, pointed at the metastore below.
    # getOrCreate() returns an already-running session when one exists.
    # NOTE(review): in that case the metastore URI set here may not be
    # applied -- confirm against the session created earlier in the notebook.
    sc = SparkSession.builder \
        .appName("recruitmentProjectGoldLayer") \
        .config('spark.hadoop.hive.metastore.uris', "thrift://192.168.48.136:9083") \
        .enableHiveSupport() \
        .getOrCreate()

    # Load every column of the recruitment.recruitment_all_report table.
    df = sc.sql('SELECT * FROM recruitment.recruitment_all_report')

In [4]:
# Print the leading rows of df as a text table (show() prints up to 20 rows by default).
df.show()

+--------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+----+-----+---+
|               title|       last_updated|       created_time|              skills|         requirement|             hirer|        company_link|    company_location|            category|min_salary|max_salary|salary_range|            location|      work_type|      work_time_type|        company_name|year|month|day|
+--------------------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+----------+----------+------------+--------------------+---------------+--------------------+--------------------+----+-----+---+
|       Data Engineer|2024-08-22 23:04:10|2024-09-09