In [2]:
# Cell 1: start Spark, define the JDBC options, and smoke-test the Postgres connection
import os

from pyspark.sql import SparkSession

# Settings for the ingest session: the PostgreSQL JDBC driver jar, the maximum
# bytes packed into one file-read partition (268435456 = 256 MiB), the number
# of shuffle partitions, and driver/executor memory.
spark_settings = {
    "spark.jars": "/home/jovyan/jars/postgresql-42.7.4.jar",
    "spark.sql.files.maxPartitionBytes": "268435456",
    "spark.sql.shuffle.partitions": "64",
    "spark.driver.memory": "6g",
    "spark.executor.memory": "6g",
}

session_builder = SparkSession.builder.appName("yelp-ingest")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# getOrCreate() hands back the active session when one already exists.
spark = session_builder.getOrCreate()

# Connection options shared by every JDBC read and write in this notebook.
# The URL and credentials can be overridden through environment variables so
# that real secrets never have to be saved inside the notebook; the fallback
# values are the original local-development settings.
jdbc = {
    "url": os.environ.get("YELP_PG_URL", "jdbc:postgresql://db:5432/yelp"),
    "user": os.environ.get("YELP_PG_USER", "postgres"),
    "password": os.environ.get("YELP_PG_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    "batchsize": "10000",  # rows sent per JDBC insert batch on writes
}

# Connectivity smoke test: write a one-row frame to public.spark_ping,
# overwriting the table if it is already there.
ping_df = spark.createDataFrame([(1, "ok")], ["id", "msg"])
ping_writer = (
    ping_df.write.format("jdbc")
    .options(**jdbc)
    .option("dbtable", "public.spark_ping")
)
ping_writer.mode("overwrite").save()



In [3]:
# Cell 2: define the schemas, read the raw JSON, and write Parquet copies
from pyspark.sql.types import *
base = "/data/raw"    # directory the raw JSON files are read from
pq = "/data/parquet"  # root directory the Parquet copies are written under

# Explicit schema for the business JSON file, one (name, type) pair per column.
# The nested "attributes" and "hours" objects are read as string-to-string maps.
business_fields = [
    ("business_id", StringType()),
    ("name", StringType()),
    ("address", StringType()),
    ("city", StringType()),
    ("state", StringType()),
    ("postal_code", StringType()),
    ("latitude", DoubleType()),
    ("longitude", DoubleType()),
    ("stars", DoubleType()),
    ("review_count", IntegerType()),
    ("is_open", IntegerType()),
    ("attributes", MapType(StringType(), StringType())),
    ("categories", StringType()),
    ("hours", MapType(StringType(), StringType())),
]
business_schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in business_fields]
)
# Explicit schema for the tip JSON file. "date" is kept as a plain string here;
# no timestamp parsing happens at read time.
tip_schema = (
    StructType()
    .add("user_id", StringType())
    .add("business_id", StringType())
    .add("text", StringType())
    .add("date", StringType())
    .add("compliment_count", IntegerType())
)

# Read both raw JSON files using the explicit schemas (no schema inference pass).
biz = spark.read.schema(business_schema).json(f"{base}/yelp_academic_dataset_business.json")
tip = spark.read.schema(tip_schema).json(f"{base}/yelp_academic_dataset_tip.json")

# Persist each dataset as Parquet, replacing any previous copy.
# coalesce(8) caps the number of write partitions at 8.
business_parquet_dir = f"{pq}/business"
tip_parquet_dir = f"{pq}/tip"
biz.coalesce(8).write.mode("overwrite").parquet(business_parquet_dir)
tip.coalesce(8).write.mode("overwrite").parquet(tip_parquet_dir)

# Count rows from the Parquet output instead of from `biz`/`tip`: counting the
# lazy JSON DataFrames would parse the raw files a second time and would not
# confirm that the write itself produced the expected number of rows.
print("business rows:", spark.read.parquet(business_parquet_dir).count())   # expect ~172,907
print("tip rows     :", spark.read.parquet(tip_parquet_dir).count())   # expect ~533,285



business rows: 172907
tip rows     : 533285


In [4]:
# Cell 3: load the business table and a sample of tips into Postgres
# Columns loaded into Postgres. The map-typed "attributes" and "hours" columns
# are excluded simply by not being selected, so no separate drop() is needed.
# NOTE(review): "address" is also not selected — confirm that is intentional.
biz_pg = biz.select(
    "business_id", "name", "city", "state", "postal_code",
    "latitude", "longitude", "stars", "review_count", "is_open", "categories",
)

# Write from 8 partitions, overwriting public.business if it already exists.
(
    biz_pg.repartition(8)
    .write.format("jdbc")
    .options(**jdbc)
    .option("dbtable", "public.business")
    .mode("overwrite")
    .save()
)

# Sample of at most 100,000 tips for Postgres.
# NOTE(review): no ordering precedes limit(), so which rows are kept is
# presumably not stable between runs — confirm that is acceptable.
tip_columns = ["user_id", "business_id", "text", "date", "compliment_count"]
tip_small = tip.select(*tip_columns).limit(100_000)

# Write from 4 partitions, overwriting public.tip_small if it already exists.
tip_small_writer = tip_small.repartition(4).write.format("jdbc").options(**jdbc)
tip_small_writer.option("dbtable", "public.tip_small").mode("overwrite").save()


In [5]:
# Cell 4
# read-back proof
# Load each table written above back over JDBC and print its row count.
readback_tables = ["spark_ping", "business", "tip_small"]
for t in readback_tables:
    table_reader = spark.read.format("jdbc").options(**jdbc).option("dbtable", t)
    df = table_reader.load()
    print(t, df.count())


spark_ping 1
business 172907
tip_small 100000
