In [8]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import regexp_replace, col,  regexp_extract, when ,  to_date, date_format
from pyspark.sql.types import IntegerType , FloatType
from pyspark.sql import functions as F

In [3]:
spark = SparkSession.builder \
    .appName('SparkKafkaToPostgres') \
    .config('spark.jars.packages', "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0,"
                                    "org.postgresql:postgresql:42.5.0") \
    .getOrCreate()

In [4]:
df = spark.read.format("mongo").option("uri", "mongodb://localhost:27020") \
        .option("database", "db_goodread") \
        .option("collection", "tb_book") \
        .load()

In [5]:
df.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------+---------------+--------------------+------+------------+--------------------+--------------------+--------------------+------+-----------+-------+---------------+-------------+
|                 _id|             author|           authorUrl|             bookUrl|            bookname|            describe|      fivestars|      fourstars|               genre|number|     onestar|               pages|              prices|             publish|rating|ratingcount|reviews|     threestars|     twostars|
+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+---------------+---------------+--------------------+------+------------+--------------------+--------------------+--------------------+------+-----------+-------+---------------+-------------+
|{6702676440ee637c...|   Kathryn Stocket

In [9]:
cleaned_df = df.withColumn("book_id", F.regexp_extract("bookUrl", r'book/show/(\d+)', 1).cast(IntegerType())) \
    .withColumn("author_id", F.regexp_extract("authorUrl", r'author/show/(\d+)', 1).cast(IntegerType())) \
    .withColumn("prices", F.regexp_extract("prices", r'\$(\d+\.\d+)', 1).cast(FloatType())) \
    .withColumn("rating", col("rating").cast(FloatType())) \
    .withColumn("ratingcount", F.regexp_replace("ratingcount", ',', '').cast(IntegerType())) \
    .withColumn("reviews", F.regexp_replace("reviews", ',', '').cast(IntegerType())) \
    .withColumn("fivestars", F.regexp_replace(col("fivestars"), r"[^\d]", "").cast(IntegerType())) \
    .withColumn("fourstars", F.regexp_replace(col("fourstars"), r"[^\d]", "").cast(IntegerType())) \
    .withColumn("threestars", F.regexp_replace(col("threestars"), r"[^\d]", "").cast(IntegerType())) \
    .withColumn("twostars", F.regexp_replace(col("twostars"), r"[^\d]", "").cast(IntegerType())) \
    .withColumn("onestar", F.regexp_replace(col("onestar"), r"[^\d]", "").cast(IntegerType())) \
    .withColumn("pages_n", F.regexp_extract("pages", r'(\d+)', 1).cast(IntegerType())) \
    .withColumn("cover", F.regexp_extract("pages", r',\s*(\w+)$', 1)) \
    .withColumn("publish", F.to_date(F.regexp_extract(col("publish"), r'(\w+ \d{1,2}, \d{4})', 1), "MMMM d, yyyy")) \
    .drop("pages", "bookUrl", "authorUrl")

In [10]:
cleaned_df.show()

+--------------------+-------------------+--------------------+--------------------+---------+---------+--------------------+------+-------+------+----------+------+-----------+-------+----------+--------+--------+---------+-------+---------+
|                 _id|             author|            bookname|            describe|fivestars|fourstars|               genre|number|onestar|prices|   publish|rating|ratingcount|reviews|threestars|twostars| book_id|author_id|pages_n|    cover|
+--------------------+-------------------+--------------------+--------------------+---------+---------+--------------------+------+-------+------+----------+------+-----------+-------+----------+--------+--------+---------+-------+---------+
|{6702676440ee637c...|   Kathryn Stockett|            The Help|Three ordinary wo...|172340260| 81480628|           [Fiction]|     1| 261781| 14.99|2009-02-10|  4.47|    2840843|  91617|   2313328|  451251| 4667024|  1943477|    464|Hardcover|
|{6702676540ee637c...|      

In [11]:
author_df = cleaned_df.select("author_id", "author").distinct()
book_df = cleaned_df.select("book_id", "bookname", "author_id", "prices", "describe", "pages_n", "cover", "publish")
ratings_df = cleaned_df.select("book_id", "rating", "ratingcount", "reviews", "fivestars", "fourstars", "threestars", "twostars", "onestar")


In [7]:
df.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|                 _id|             author|             bookUrl|            bookname|            describe|fivestars|fourstars|number|onestar|prices|   publish|rating|ratingcount|reviews|threestars|twostars|pages_n|               cover|
+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|{66e2ed9239d4c13f...|   Kathryn Stockett|https://www.goodr...|            The Help|Three ordinary wo...|171776460| 81245928|     1| 260251| 14.99|10/02/2009|  4.47|  2,831,938| 91,479|   2306608|  450301|    464|           Hardcover|
|{66e2ed9339d4c13f...|        Kim Edwards|https://www.goodr.

In [12]:
def write_to_postgres(df, table_name):
    db_properties = {
        "user": "admin",
        "password": "admin",
        "driver": "org.postgresql.Driver"
    }
    db_url = "jdbc:postgresql://localhost:5432/goodread"

    # Write the DataFrame to PostgreSQL
    df.write.jdbc(url=db_url, table=table_name, mode="append", properties=db_properties)

# Write each DataFrame to PostgreSQL
write_to_postgres(author_df, "dim_author")
write_to_postgres(book_df, "dim_book")
write_to_postgres(ratings_df, "fact_book_ratings")

In [9]:
df = df.withColumn("ratingcount", regexp_replace(col("ratingcount"), ",", ""))
df = df.withColumn("reviews", regexp_replace(col("reviews"), ",", ""))
df = df.withColumn("publish", to_date(col("publish"), "dd/MM/yyyy"))

                   

In [10]:
df.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|                 _id|             author|             bookUrl|            bookname|            describe|fivestars|fourstars|number|onestar|prices|   publish|rating|ratingcount|reviews|threestars|twostars|pages_n|               cover|
+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|{66e2ed9239d4c13f...|   Kathryn Stockett|https://www.goodr...|            The Help|Three ordinary wo...|171776460| 81245928|     1| 260251| 14.99|2009-02-10|  4.47|    2831938|  91479|   2306608|  450301|    464|           Hardcover|
|{66e2ed9339d4c13f...|        Kim Edwards|https://www.goodr.

In [33]:
df = df.withColumn("pages_n", df["pages_n"].cast("int"))
df = df.withColumn("prices", df["prices"].cast("float"))
df = df.withColumn("onestar", df["onestar"].cast("long"))
df = df.withColumn("twostars", df["twostars"].cast("long"))
df = df.withColumn("threestars", df["threestars"].cast("long"))
df = df.withColumn("fourstars", df["fourstars"].cast("long"))
df = df.withColumn("fivestars", df["fivestars"].cast("long"))
df = df.withColumn("rating", df["rating"].cast("float"))
df = df.withColumn("ratingcount", df["ratingcount"].cast("long"))
df = df.withColumn("reviews", df["reviews"].cast("int"))

In [33]:
df.show()

+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|             author|             bookUrl|            bookname|            describe|fivestars|fourstars|number|onestar|prices|   publish|rating|ratingcount|reviews|threestars|twostars|pages_n|               cover|
+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|   Kathryn Stockett|https://www.goodr...|            The Help|Three ordinary wo...|171776460| 81245928|     1| 260251| 14.99|2009-02-10|  4.47|    2831938|  91479|   2306608|  450301|    464|           Hardcover|
|        Kim Edwards|https://www.goodr...|The Memory Keeper...|On a winter night...| 14651724| 22017336|    34| 206673| 13.99|2005-06-23|   3.7|

In [11]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- author: string (nullable = true)
 |-- bookUrl: string (nullable = true)
 |-- bookname: string (nullable = true)
 |-- describe: string (nullable = true)
 |-- fivestars: string (nullable = true)
 |-- fourstars: string (nullable = true)
 |-- number: integer (nullable = true)
 |-- onestar: string (nullable = true)
 |-- prices: string (nullable = true)
 |-- publish: date (nullable = true)
 |-- rating: string (nullable = true)
 |-- ratingcount: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- threestars: string (nullable = true)
 |-- twostars: string (nullable = true)
 |-- pages_n: string (nullable = true)
 |-- cover: string (nullable = true)



In [26]:
df = df.drop("_id")

In [36]:
df.show()

+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|                 _id|             author|             bookUrl|            bookname|            describe|fivestars|fourstars|number|onestar|prices|   publish|rating|ratingcount|reviews|threestars|twostars|pages_n|               cover|
+--------------------+-------------------+--------------------+--------------------+--------------------+---------+---------+------+-------+------+----------+------+-----------+-------+----------+--------+-------+--------------------+
|{66e2ed9239d4c13f...|   Kathryn Stockett|https://www.goodr...|            The Help|Three ordinary wo...|171776460| 81245928|     1| 260251| 14.99|2009-02-10|  4.47|    2831938|  91479|   2306608|  450301|    464|           Hardcover|
|{66e2ed9339d4c13f...|        Kim Edwards|https://www.goodr.

In [39]:
df.toJSON()

MapPartitionsRDD[129] at toJavaRDD at NativeMethodAccessorImpl.java:0

In [45]:
a = df.select("author")

In [13]:
df.write.format("mongo").option("uri", "mongodb://root:rootpassword@localhost:27020") \
        .option("database", "db_goodread") \
        .option("collection", "tb_book_clean") \
        .mode("append") \
        .save()

In [39]:
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- author: string (nullable = true)
 |-- bookUrl: string (nullable = true)
 |-- bookname: string (nullable = true)
 |-- describe: string (nullable = true)
 |-- fivestars: long (nullable = true)
 |-- fourstars: long (nullable = true)
 |-- number: integer (nullable = true)
 |-- onestar: long (nullable = true)
 |-- prices: float (nullable = true)
 |-- publish: date (nullable = true)
 |-- rating: float (nullable = true)
 |-- ratingcount: long (nullable = true)
 |-- reviews: integer (nullable = true)
 |-- threestars: long (nullable = true)
 |-- twostars: long (nullable = true)
 |-- pages_n: integer (nullable = true)
 |-- cover: string (nullable = true)



In [14]:
spark.stop()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

In [21]:
import findspark
findspark.init()

import os


In [2]:
spark = SparkSession.builder \
    .appName('SparkKafkaToPostgres') \
    .config('spark.jars.packages', "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3,"
                                    "org.postgresql:postgresql:42.5.0") \
    .getOrCreate()

In [7]:
kafka_df = spark.readStream \
    .format('kafka') \
    .option('kafka.bootstrap.servers', 'localhost:9092') \
    .option('subscribe', 'goodread') \
    .load()

In [6]:
#xem dữ liệu
value_df = kafka_df.selectExpr("CAST(value AS STRING) AS message")

# Hiển thị dữ liệu trên console
query = value_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

# Chờ cho đến khi stream dừng
query.awaitTermination()

In [4]:
kafka_df_once = spark.read \
    .format('kafka') \
    .option('kafka.bootstrap.servers', 'localhost:9092') \
    .option('subscribe', 'goodread') \
    .load()

# Chọn cột 'value' và chuyển đổi nó thành chuỗi
value_df_once = kafka_df_once.selectExpr("CAST(value AS STRING) AS message")

# Hiển thị 10 dòng dữ liệu
value_df_once.show(10)

+--------------------+
|             message|
+--------------------+
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
|{"bookUrl": "http...|
+--------------------+
only showing top 10 rows



In [5]:
schema = StructType([
    StructField("bookUrl", StringType()),
    StructField("number", StringType()),
    StructField("bookname", StringType()),
    StructField("author", StringType()),
    StructField("prices", StringType()),
    StructField("describe", StringType()),
    StructField("rating", StringType()),
    StructField("ratingcount", StringType()),
    StructField("reviews", StringType()),
    StructField("fivestars", StringType()),
    StructField("fourstars", StringType()),
    StructField("threestars", StringType()),
    StructField("twostars", StringType()),
    StructField("onestar", StringType()),
    StructField("pages", StringType()),
    StructField("publish", StringType()),
])

In [8]:
value_df = kafka_df.selectExpr("CAST(value AS STRING) AS json_string") \
    .select(from_json(col("json_string"), schema).alias("data")) \
    .select("data.*")


In [12]:
value_df.writeStream \
    .outputMode("append") \
    .format("console") \
    .start() 

<pyspark.sql.streaming.query.StreamingQuery at 0x1d053082b00>

In [10]:
#xem data



AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka