In [1]:
import urllib.request
import os

# Local cache location of the PostgreSQL JDBC driver jar handed to Spark.
jdbc_jar_path = "/tmp/postgresql-42.7.3.jar"

if not os.path.exists(jdbc_jar_path):
    # Download to a temporary sibling file and rename it into place only after
    # the transfer finishes. Writing straight to jdbc_jar_path would leave a
    # truncated jar behind on an interrupted download, and the exists() guard
    # above would then accept that corrupt file on every later run.
    partial_jar_path = jdbc_jar_path + ".part"
    try:
        urllib.request.urlretrieve(
            "https://jdbc.postgresql.org/download/postgresql-42.7.3.jar",
            partial_jar_path
        )
        os.replace(partial_jar_path, jdbc_jar_path)
    finally:
        # After a successful rename the temp file is gone; this only cleans up
        # the leftovers of a failed download.
        if os.path.exists(partial_jar_path):
            os.remove(partial_jar_path)

# Display the resolved jar path as the cell output.
jdbc_jar_path


'/tmp/postgresql-42.7.3.jar'

In [2]:
import os
import subprocess
from pyspark.sql import SparkSession

# Configure JAVA_HOME for Spark.
# /usr/libexec/java_home is a macOS-only helper; when it is absent (e.g. on
# Linux) fall back to whatever JAVA_HOME the environment already provides
# instead of crashing the cell.
java_home_tool = "/usr/libexec/java_home"
if os.path.exists(java_home_tool):
    java_home = subprocess.check_output(
        [java_home_tool, "-v", "11"]
    ).decode("utf-8").strip()
    os.environ["JAVA_HOME"] = java_home
else:
    java_home = os.environ.get("JAVA_HOME")

if java_home:
    java_bin = java_home + "/bin"
    current_path = os.environ.get("PATH", "")
    # Prepend only once so that re-running this cell does not keep growing PATH.
    if java_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = java_bin + ":" + current_path

# Create the Spark session with the PostgreSQL JDBC driver on its classpath.
spark = SparkSession.builder \
    .appName("raw-to-silver-etl") \
    .config("spark.jars", jdbc_jar_path) \
    .getOrCreate()

# Only show ERROR-level Spark log messages from here on.
spark.sparkContext.setLogLevel("ERROR")

# Display the session summary as the cell output.
spark


26/01/19 03:12:20 WARN Utils: Your hostname, MacBook-Air-de-Tales.local resolves to a loopback address: 127.0.0.1; using 192.168.1.18 instead (on interface en0)
26/01/19 03:12:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
26/01/19 03:12:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/19 03:12:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from pyspark.sql import SparkSession


In [4]:
# Reuse the running Spark session, or create it if none exists yet.
# The JDBC driver jar is configured here as well, matching the first builder:
# if this cell is the one that actually creates the session, the later JDBC
# write would otherwise fail because the PostgreSQL driver is not on the
# classpath.
spark = SparkSession.builder \
    .appName("raw-to-silver-etl") \
    .config("spark.jars", jdbc_jar_path) \
    .getOrCreate()


In [5]:
# Set the SparkContext log level to ERROR so lower-severity messages are hidden.
spark.sparkContext.setLogLevel("ERROR")


In [6]:
# Location of the raw-layer CSV. The path is relative, so it presumably
# resolves against the notebook's working directory — verify when running
# from a different location.
raw_path = "../Data Layer/raw/data_raw.csv"


In [7]:
# Load the raw CSV using its first line as the header. Schema inference is
# disabled, so every column is read as a string and typed explicitly later.
df_raw = spark.read.csv(raw_path, header=True, inferSchema=False)


In [8]:
# Preview the first 5 rows of the raw DataFrame.
df_raw.show(5)


+------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+----------+---------------+---------------+-------------+-------------------------+---------------+----------+------------+--------------------+-----------------+--------------------+--------+------------------+------------+-------------+----------------+-----------+-------+
|job_id|           job_title|     job_description|        requirements|            benefits|company_name|     company_profile|  industry|employment_type|       location| salary_range|required_experience_years|education_level|department|posting_date|application_deadline|    contact_email|     company_website|has_logo|num_open_positions|job_function|telecommuting|    fraud_reason|text_length|is_fake|
+------+--------------------+--------------------+--------------------+--------------------+------------+--------------------+----------+---------------+---------------+-------------+-------------

In [9]:
# Print the raw schema; all columns are strings because the CSV is read with
# inferSchema disabled.
df_raw.printSchema()


root
 |-- job_id: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- required_experience_years: string (nullable = true)
 |-- education_level: string (nullable = true)
 |-- department: string (nullable = true)
 |-- posting_date: string (nullable = true)
 |-- application_deadline: string (nullable = true)
 |-- contact_email: string (nullable = true)
 |-- company_website: string (nullable = true)
 |-- has_logo: string (nullable = true)
 |-- num_open_positions: string (nullable = true)
 |-- job_function: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- fraud

In [10]:
# Number of rows in the raw DataFrame.
df_raw.count()


3000

In [11]:
from pyspark.sql.functions import col, when, to_date, current_timestamp

# Build the silver layer: keep the columns of interest from the raw data and
# give them proper types (raw columns are all strings).
df_silver = (
    df_raw
    .select(
        "job_id",
        "job_title",
        "company_name",
        "location",
        "employment_type",
        "posting_date",
        "application_deadline",
        "salary_range",
        "text_length",
        "is_fake"
    )
    .withColumn("job_id", col("job_id").cast("int"))
    .withColumn(
        "is_fake",
        # Map 1 -> True and 0 -> False explicitly. With no otherwise() branch,
        # a missing or unparseable label stays null instead of being silently
        # recorded as False ("not fake").
        when(col("is_fake").cast("int") == 1, True)
        .when(col("is_fake").cast("int") == 0, False)
    )
    # Dates are parsed with the yyyy-MM-dd pattern.
    .withColumn("posting_date", to_date("posting_date", "yyyy-MM-dd"))
    .withColumn("application_deadline", to_date("application_deadline", "yyyy-MM-dd"))
    .withColumn("text_length", col("text_length").cast("int"))
    # Audit column recording when this ETL run processed the row.
    .withColumn("etl_processed_at", current_timestamp())
)


In [12]:
# Show the distinct values of the converted is_fake column (up to 20 rows,
# without truncating cell contents).
df_silver.select("is_fake").distinct().show(20, truncate=False)


+-------+
|is_fake|
+-------+
|true   |
|false  |
+-------+



In [13]:
# Count rows per original is_fake label in the raw data.
df_raw.groupBy("is_fake").count().show()


+-------+-----+
|is_fake|count|
+-------+-----+
|      0| 1528|
|      1| 1472|
+-------+-----+



In [14]:
# Print the raw schema, then display the list of raw column names as the
# cell output.
df_raw.printSchema()
df_raw.columns


root
 |-- job_id: string (nullable = true)
 |-- job_title: string (nullable = true)
 |-- job_description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- location: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- required_experience_years: string (nullable = true)
 |-- education_level: string (nullable = true)
 |-- department: string (nullable = true)
 |-- posting_date: string (nullable = true)
 |-- application_deadline: string (nullable = true)
 |-- contact_email: string (nullable = true)
 |-- company_website: string (nullable = true)
 |-- has_logo: string (nullable = true)
 |-- num_open_positions: string (nullable = true)
 |-- job_function: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- fraud

['job_id',
 'job_title',
 'job_description',
 'requirements',
 'benefits',
 'company_name',
 'company_profile',
 'industry',
 'employment_type',
 'location',
 'salary_range',
 'required_experience_years',
 'education_level',
 'department',
 'posting_date',
 'application_deadline',
 'contact_email',
 'company_website',
 'has_logo',
 'num_open_positions',
 'job_function',
 'telecommuting',
 'fraud_reason',
 'text_length',
 'is_fake']

In [15]:
import os
from getpass import getpass

# JDBC connection settings for the PostgreSQL target.
# Credentials are read from the environment instead of being hardcoded in the
# notebook, where they would be committed and shared together with the code.
# PGUSER / PGPASSWORD follow the standard PostgreSQL client variable names;
# if no password is set, the user is prompted for it interactively.
jdbc_url = os.environ.get(
    "SILVER_JDBC_URL",
    "jdbc:postgresql://localhost:5432/analytics"
)

connection_properties = {
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver"
}


In [16]:
# Count rows per is_fake value after the conversion to boolean.
df_silver.groupBy("is_fake").count().show()


+-------+-----+
|is_fake|count|
+-------+-----+
|   true| 1472|
|  false| 1528|
+-------+-----+



In [17]:
# Append the silver DataFrame to the silver_job_postings table over JDBC.
# NOTE(review): append mode inserts every row again on each execution, so
# re-running this cell presumably duplicates data — confirm whether the table
# enforces a unique key.
df_silver.write.jdbc(
    url=jdbc_url,
    table="silver_job_postings",
    mode="append",
    properties=connection_properties,
)


In [18]:
# Row-count reconciliation across the pipeline stages.
print("RAW:", df_raw.count())
print("SILVER:", df_silver.count())
# Read the target table back over JDBC so that "SILVER FINAL" reflects what is
# actually stored in PostgreSQL, rather than repeating the in-memory
# df_silver count.
print(
    "SILVER FINAL:",
    spark.read.jdbc(
        url=jdbc_url,
        table="silver_job_postings",
        properties=connection_properties
    ).count()
)


RAW: 3000
SILVER: 3000
SILVER FINAL: 3000
