In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from delta import *

builder = SparkSession.builder \
    .appName("Users to Silver") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:3.1.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


# Initialize Spark session (assuming it's already created in the previous code)

# Load the data from the previous step or directly from source
df = spark.read.json("D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")

# Extract and parse JSON structured attributes
df_with_parsed = df.withColumn(
    "business_parking_dict", 
    from_json(
        col("attributes").getItem("BusinessParking"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "ambience_dict", 
    from_json(
        col("attributes").getItem("Ambience"),
        MapType(StringType(), BooleanType())
    )
).withColumn(
    "good_for_meal_dict", 
    from_json(
        col("attributes").getItem("GoodForMeal"),
        MapType(StringType(), BooleanType())
    )
)

# Extract specific values from these complex attributes
df_with_parsed = df_with_parsed \
    .withColumn("garage_parking", col("business_parking_dict").getItem("garage")) \
    .withColumn("street_parking", col("business_parking_dict").getItem("street")) \
    .withColumn("lot_parking", col("business_parking_dict").getItem("lot")) \
    .withColumn("valet_parking", col("business_parking_dict").getItem("valet")) \
    .withColumn("is_romantic", col("ambience_dict").getItem("romantic")) \
    .withColumn("is_intimate", col("ambience_dict").getItem("intimate")) \
    .withColumn("is_classy", col("ambience_dict").getItem("classy")) \
    .withColumn("is_hipster", col("ambience_dict").getItem("hipster")) \
    .withColumn("good_for_dinner", col("good_for_meal_dict").getItem("dinner")) \
    .withColumn("good_for_lunch", col("good_for_meal_dict").getItem("lunch")) \
    .withColumn("good_for_breakfast", col("good_for_meal_dict").getItem("breakfast"))

# Display results
df_with_parsed.select(
    "business_id", "name", 
    "garage_parking", "street_parking", "lot_parking", 
    "is_romantic", "is_classy",
    "good_for_dinner", "good_for_lunch"
).show(5)



+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|         business_id|                name|garage_parking|street_parking|lot_parking|is_romantic|is_classy|good_for_dinner|good_for_lunch|
+--------------------+--------------------+--------------+--------------+-----------+-----------+---------+---------------+--------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|tUFrWirKiKi_TAnsV...|              Target|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|MTSW4McQd7CbVtyjq...|  St Honore Pastries|          NULL|          NULL|       NULL|       NULL|     NULL|           NULL|          NULL|
|mWMc6_wTdE0EUBKIG...|Perki

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business

In [None]:
def load_business_data(spark, path):
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("name", StringType(), True),
        StructField("address", StringType(), True),
        StructField("city", StringType(), True),
        StructField("state", StringType(), True),
        StructField("postal_code", StringType(), True),
        StructField("latitude", DoubleType(), True),
        StructField("longitude", DoubleType(), True),
        StructField("stars", DoubleType(), True),
        StructField("review_count", IntegerType(), True),
        StructField("is_open", IntegerType(), True),
        StructField("attributes", MapType(StringType(), StringType()), True),
        StructField("categories", StringType(), True),
        StructField("hours", MapType(StringType(), StringType()), True)
    ])
    return spark.read.json(path, schema=schema)

df = load_business_data(spark, "D:/Project/delta_lake/bronze/yelp_academic_dataset_business.json")
df.show(5)

+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|         business_id|                name|             address|         city|state|postal_code|  latitude|   longitude|stars|review_count|is_open|          attributes|          categories|               hours|
+--------------------+--------------------+--------------------+-------------+-----+-----------+----------+------------+-----+------------+-------+--------------------+--------------------+--------------------+
|Pns2l4eNsfO8kk83d...|Abby Rappoport, L...|1616 Chapala St, ...|Santa Barbara|   CA|      93101|34.4266787|-119.7111968|  5.0|           7|      0|{ByAppointmentOnl...|Doctors, Traditio...|                NULL|
|mpf3x-BjTdTEA3yCZ...|       The UPS Store|87 Grasso Plaza S...|       Affton|   MO|      63123| 38.551126|  -90.335695|  3.0|          15|      1|{Business