In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *


In [2]:
# Build a Delta-enabled Spark session for the dimension-building notebook.
#
# spark.jars.packages is deliberately NOT set here: configure_spark_with_delta_pip
# sets that option itself (to the delta-spark artifact matching the installed
# Python package), so a value given on the builder would be overwritten. The
# previous value ("io.delta:delta-core_2.12:3.1.0") also named the pre-3.0
# artifact; from Delta 3.0 on the Maven artifact is called delta-spark.
builder = (
    SparkSession.builder
    .appName("Dim Process")
    # Enable Delta SQL commands and route the default catalog through Delta.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.driver.memory", "6g")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Load the silver "users" Delta table.
dfu = (
    spark.read
    .format("delta")
    .options(mergeSchema="true")
    .load("D:/Project/delta_lake/silver/users")
)

# Preview the loaded rows.
dfu.show()


+--------------------+------------+------------+-------------------+------+-----+----+-----+--------------------+----+-------------+--------------+---------------+------------------+---------------+---------------+---------------+----------------+---------------+----------------+-----------------+-----------------+--------------------+--------------+-------------+------------+------------+---------------------+----------------------+-----------------------+--------------------------+-----------------------+-----------------------+-----------------------+------------------------+-----------------------+------------------------+-------------------------+-------------------------+----------------+-----------+-----------------+--------------------+-------------+-----------------+--------------------+-----------+---------------+
|             user_id|        name|review_count|      yelping_since|useful|funny|cool|elite|             friends|fans|average_stars|compliment_hot|compliment_more|c

In [0]:
# Build the dim_user table: keep the user attributes below and expose
# "name" under the column name "user_name".
dim_user = dfu.select(
    "user_id",
    dfu["name"].alias("user_name"),
    "yelping_since",
    "elite_years_count",
    "friend_count",
    "review_count",
    "average_stars",
)

# Save dim_user to the Gold layer (temporary save).
dim_user.write.parquet("dbfs:/mnt/gold/dim_user", mode="overwrite")


In [0]:
# Load the silver "business" Parquet data and preview it.
dfb = spark.read.format("parquet").load("dbfs:/mnt/silver/business")
dfb.show()


+--------------------+--------------------+--------------------+--------------------+------------+--------------------+-------+-------------+---------------+--------------------+-----------+------------+-----+-----+---------------------+----------------+------------+--------------------+---------+----------------+--------------------+----------------+---------------------------+-------------------------------+--------------------+----------------------+-----------+--------------+------------+------------------------+----------------+--------------+-------------------+----------------+--------------------+----------------------+--------------+----------+--------------------+---------------+----------------+-------------------+----------------------+------------------------------+------------------------+-----------------------------+---------------------------+----------------------------+----------------------------+-----------------------+------------+-------------------------+-------

In [0]:
# Build dim_business from the business attributes below, exposing "name"
# under the column name "business_name", then overwrite the Gold copy.
dim_business = dfb.select(
    "business_id",
    dfb["name"].alias("business_name"),
    "stars",
    "review_count",
    "is_open",
    "categories",
)
dim_business.write.parquet("dbfs:/mnt/gold/dim_business", mode="overwrite")


In [0]:
# Load the silver "review" Parquet data and preview it.
dfr = spark.read.format("parquet").load("dbfs:/mnt/silver/review")
dfr.show()

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+-----------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|text_length|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+-----------+
|grpNey31cTGKrhmQQ...|   0|2020-06-28 19:45:32|    0|6fObpwIggOQR1oDap...|  5.0|Had a wonderful, ...|     0|lN-1uUHeV_QyFbczw...|        179|
|Fay6yoOC6iitEt3QL...|   0|2020-06-07 18:13:33|    0|UgtrUhfuEgUdPay75...|  4.0|Yeah it was defin...|     0|TJ8Hawan8jDIZHS7A...|        364|
|wQq0QBaYXa1KLNw_J...|   0|2018-08-17 18:51:47|    0|MmLxg9oLQmPpcPNqI...|  4.0|One of the last s...|     1|ML10yeoSaW60TwVaI...|        327|
|xgJMQq0uVY4KB9Efn...|   0|2020-06-15 14:31:14|    0|awQEOCuJ9fL12h7iq...|  5.0|Great experience ...|     0|w3z001eXLTQrYAIFe...|        118|
|ps1Dd

In [0]:
from pyspark.sql.functions import col, dayofmonth, dayofweek, month, to_date, year

# Build dim_date: one row per distinct calendar date found in the reviews,
# with the date parts derived from it.
dim_date = (
    dfr.select(to_date("date").alias("date"))
    .distinct()
    .select(
        "date",
        year("date").alias("year"),
        month("date").alias("month"),
        dayofmonth("date").alias("day"),
        dayofweek("date").alias("day_of_week"),
    )
)

dim_date.write.parquet("dbfs:/mnt/gold/dim_date", mode="overwrite")

In [0]:
# Build dim_location: the address and coordinate columns of each business,
# keyed by business_id, then overwrite the Gold copy.
dim_location = dfb.select(
    ["business_id", "address", "city", "state", "postal_code", "latitude", "longitude"]
)
dim_location.write.format("parquet").mode("overwrite").save("dbfs:/mnt/gold/dim_location")


In [0]:
from pyspark.sql.functions import col, split, trim

# Build dim_category: keep the raw "categories" string per business and add
# "category_list", the same string split into an array of category names.
#
# The second argument of split() is a regular expression. Splitting on
# r"\s*,\s*" (after trimming the whole string) drops any whitespace around the
# commas, so elements come out as "Food" rather than " Food"; splitting on a
# bare "," would keep that whitespace inside the elements.
dim_category = (
    dfb.select("business_id", "categories")
    .withColumn("category_list", split(trim(col("categories")), r"\s*,\s*"))
)
dim_category.write.mode("overwrite").parquet("dbfs:/mnt/gold/dim_category")


In [0]:
# Build dim_review_status: the useful/funny/cool counters of each review,
# keyed by review_id, then overwrite the Gold copy.
dim_review_status = dfr.select(["review_id", "useful", "funny", "cool"])
dim_review_status.write.parquet("dbfs:/mnt/gold/dim_review_status", mode="overwrite")


In [0]:
# Load the silver "checkin" Parquet data and preview it.
dfc = spark.read.format("parquet").load("dbfs:/mnt/silver/checkin")
dfc.show()

+--------------------+--------------------+-------------+
|         business_id|                date|checkin_count|
+--------------------+--------------------+-------------+
|---kPU91CF4Lq2-Wl...|2020-03-13 21:10:...|           11|
|--0iUa4sNDFiZFrAd...|2010-09-13 21:43:...|           10|
|--30_8IhuyMHbSOcN...|2013-06-14 23:29:...|            2|
|--7PUidqRWpRSpXeb...|2011-02-15 17:12:...|           10|
|--7jw19RH9JKXgFoh...|2014-04-21 20:42:...|           26|
|--8IbOsAAxjKRoYsB...|2015-06-06 01:03:...|           32|
|--9osgUCSDUWUkoTL...|2015-06-13 02:00:...|           24|
|--ARBQr1WMsTWiwOK...|2014-12-12 00:44:...|           34|
|--FWWsIwxRwuw9vIM...|2010-09-11 16:28:...|            7|
|--FcbSxK1AoEtEAxO...|2017-08-18 19:43:...|           82|
|--LC8cIrALInl2vyo...|2017-01-12 19:10:...|            7|
|--MbOh2O1pATkXa7x...|2013-04-21 01:52:...|          103|
|--N9yp3ZWqQIm7DqK...|2012-10-06 20:46:...|            7|
|--O3ip9NpXTKD4oBS...|2010-04-17 21:07:...|          456|
|--OS_I7dnABrX

In [0]:
from pyspark.sql.functions import (
    col,
    count,
    explode,
    lit,
    monotonically_increasing_id,
    split,
    trim,
)

# Build dim_checkin: explode the comma-separated "date" string of each business
# into one row per entry, count rows per (business_id, date), and attach a
# generated unique id.
dim_checkin = (
    dfc.select(
        col("business_id"),
        # split() takes a regular expression; r"\s*,\s*" on the trimmed string
        # strips whitespace around the commas so that the same timestamp is not
        # grouped under two different keys ("2020-..." vs " 2020-...").
        explode(split(trim(col("date")), r"\s*,\s*")).alias("date"),  # one row per check-in entry
    )
    .groupBy("business_id", "date")
    .agg(count(lit(1)).alias("checkin_count"))  # number of check-ins per business and date value
    .withColumn("checkin_id", monotonically_increasing_id())  # unique (not consecutive) id
)
dim_checkin.write.mode("overwrite").parquet("dbfs:/mnt/gold/dim_checkin")

In [0]:
# Load the silver "tip" Parquet data and preview it.
dft = spark.read.format("parquet").load("dbfs:/mnt/silver/tip")
dft.show()

+--------------------+----------------+-------------------+--------------------+--------------------+
|         business_id|compliment_count|               date|                text|             user_id|
+--------------------+----------------+-------------------+--------------------+--------------------+
|Tdri3C3E2A93twBL3...|               0|2018-07-08 01:19:27|The sushi was pre...|BnoprwbCO9b-nBDqi...|
|yaEC_LAfaoY4yZyub...|               0|2016-11-17 04:52:28|Call ahead for yo...|KlI_NEqRNmw6LVFf4...|
|uXKygoHoGly8OqINL...|               0|2015-05-09 01:41:14|Doesn't take rese...|HpXm_E_MRQdN_Rv4c...|
|az4WbjTUNkMS0XCSA...|               0|2017-08-28 15:14:24|Very old and out ...|IFM9VzljyIE-z8WNs...|
|wEksWwS7a0leoaZkV...|               0|2016-07-12 17:53:25|Love this place. ...|IFM9VzljyIE-z8WNs...|
|RL-d1aD1BMGQoouah...|               0|2018-09-02 16:18:43|30 min wait Satur...|UVO0vrdc8Jfm00p7i...|
|tNnVxOC_9p8UK3N_2...|               0|2013-11-27 01:04:03|No you are not se...|Rq

In [0]:
# Build dim_tip from the tip columns below and overwrite the Gold copy.
dim_tip = dft.select(["business_id", "date", "compliment_count", "text", "user_id"])
dim_tip.write.parquet("dbfs:/mnt/gold/dim_tip", mode="overwrite")

In [0]:
%python
from pyspark.sql.functions import when, col

dfr = spark.read.parquet("dbfs:/mnt/silver/review")
fact_review = dfr.join(dim_business, "business_id") \
    .join(dim_user, "user_id") \
    .join(dim_date, dfr["date"] == dim_date["date"]) \
    .select(
        "review_id", "business_id", "user_id", dfr["date"], 
        "text", dfr["stars"],  # Specify dfr for stars column
        when(dfr["stars"] >= 4, "positive").otherwise("negative").alias("sentiment")
    )
fact_review.write.mode("overwrite").parquet("dbfs:/mnt/gold/fact_review")