# 1. Businesses Dataset

## Load DataFrame

In [1]:
// Load the raw New York business metadata from JSON and render it in the notebook.
val dfBusiness = spark.read
    .format("json")
    .load("project/reviews/raw/meta-New_York.json")

z.show(dfBusiness)

In [2]:
// Print the schema of the business DataFrame (no explicit schema was supplied when reading the JSON),
// then count its rows.
dfBusiness.printSchema()
dfBusiness.count()

## Compute zip code from address and remove unnecessary columns

In [4]:
/**
 * Extracts the ZIP code that follows the state abbreviation "NY" in an address string.
 *
 * @param address free-form address text; may be null
 * @return the digit run following "NY" as a Long, or 0L when the address is null,
 *         contains no "NY <digits>" sequence, or the digit run does not fit in a Long
 */
def getZip(address: String): Long = {
    // Whitespace, the literal "NY", exactly one whitespace character, then the captured digits.
    val zipPattern = """(?:\s+NY)\s(\d+)""".r

    Option(address) // null-safe entry point: a null address falls through to 0L
        .flatMap(text => zipPattern.findFirstMatchIn(text))
        // \d+ is unbounded, so an overlong digit run would make toLong throw;
        // treat it as "no zip" instead of failing the whole job.
        .flatMap(found => scala.util.Try(found.group(1).toLong).toOption)
        .getOrElse(0L)
}

// Expose the business DataFrame to Spark SQL under the view name "businesses".
dfBusiness.createOrReplaceTempView("businesses")

// Make getZip callable from SQL expressions.
spark.udf.register("getZip", getZip _)

// Select the retained business columns and derive `zip` from the address text.
val dfWithZip = spark.sql("SELECT gmap_id, avg_rating, category, name, num_of_reviews, address, getZip(address) as zip FROM businesses")

// Show the address next to the derived zip so the extraction can be checked by eye.
z.show(dfWithZip.select(col("address"), col("zip")))

## Get DataFrame of all NYC zipcodes, then clean it

In [6]:
// Read the NYC zip-code reference CSV (first line is a header; malformed rows are dropped)
// and cast ZIPCODES to Long so it has the same type as the Long `zip` produced by getZip.
// Built as a single immutable value instead of a var that is reassigned once.
val NYC_zipcodes = spark.read
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .csv("project/reviews/raw/nyc_zipcodes.csv")
    .withColumn("ZIPCODES", col("ZIPCODES").cast("Long"))

z.show(NYC_zipcodes)

  
## Filter out businesses whose zipcodes don't lie within NYC

In [8]:
// Collect the NYC zip codes to the driver as typed Longs.
// Only the ZIPCODES column is read: flattening whole rows would also pull in any other
// CSV columns and the nulls left behind by values that failed the cast to Long.
val NYC_zipcodes_list = NYC_zipcodes
    .select("ZIPCODES")
    .na.drop()
    .distinct()
    .collect()
    .map(_.getLong(0))

// Drop the raw address (zip has already been derived from it) and keep only
// businesses whose zip appears in the NYC list.
val df_NYC_only = dfWithZip
    .drop("address")
    .filter(col("zip").isInCollection(NYC_zipcodes_list))

// Summary statistics of the remaining zip values, as a quick check on the filter.
z.show(df_NYC_only.select("zip").describe())


## Ensure numeric fields don't have invalid values

In [10]:
// Display summary statistics for the two numeric business columns so that
// implausible values can be spotted by inspection.
z.show(df_NYC_only.select("avg_rating", "num_of_reviews").describe())

## Save processed businesses

In [12]:
// Persist the NYC-only businesses as JSON.
// NOTE(review): no save mode is set, so Spark's default applies — re-running this cell
// against an existing output path will presumably fail; confirm whether that is intended.
df_NYC_only.write
    .format("json")
    .save("project/reviews/processed/nyc_businesses.json")

# 2. Reviews Dataset

## Load DataFrame

In [14]:
// Load the raw New York reviews from JSON and render them in the notebook.
val dfReviews = spark.read
    .format("json")
    .load("project/reviews/raw/review-New_York.json")

z.show(dfReviews)

In [15]:
// Print the schema of the reviews DataFrame (no explicit schema was supplied when reading the JSON),
// then count its rows.
dfReviews.printSchema()
dfReviews.count()

## Filter reviews to include only reviews from businesses located in NYC

In [17]:
// Inner-join reviews to the NYC-only businesses on gmap_id; reviews whose business is
// not in df_NYC_only are dropped, and each kept review gains the business columns (including zip).
// Declared as a val: the result is never reassigned.
val nyc_reviews_join = dfReviews.join(df_NYC_only, Seq("gmap_id"), "inner")
z.show(nyc_reviews_join)

 
## Convert the time column from a Unix epoch value in milliseconds into a timestamp data type

In [19]:
// Add a `datetime` column derived from `time`.
// `time` is divided by 1000 before the conversion, i.e. it is treated as epoch milliseconds
// and turned into seconds, which is what a numeric-to-timestamp cast interprets.
// A single cast is sufficient: to_timestamp without a format is itself a cast to timestamp,
// and withColumn already names the resulting column.
val formattedDF = nyc_reviews_join.withColumn(
  "datetime",
  (col("time") / 1000).cast("timestamp")
)
z.show(formattedDF)

## Remove unnecessary columns

In [21]:
// Project down to the four columns kept for the processed reviews dataset,
// then print the resulting schema.
val nyc_reviews = formattedDF.select(
  Seq("gmap_id", "rating", "datetime", "zip").map(name => col(name)): _*
)
nyc_reviews.printSchema()

 
## Ensure fields don’t have invalid values

In [23]:
// Display summary statistics for the processed review columns so that
// implausible values can be spotted by inspection.
z.show(nyc_reviews.describe())

 
## Find the minimum and maximum dates in the review dataset for NYC

In [25]:
// Single-row aggregate holding the earliest and latest review timestamps.
val minMaxDates = formattedDF.agg(
  min(col("datetime")).alias("min_date"),
  max(col("datetime")).alias("max_date")
)

z.show(minMaxDates)

## Save processed reviews

In [27]:
// Persist the processed NYC reviews as JSON.
// NOTE(review): no save mode is set, so Spark's default applies — re-running this cell
// against an existing output path will presumably fail; confirm whether that is intended.
nyc_reviews.write
    .format("json")
    .save("project/reviews/processed/nyc_reviews.json")