# Data exploration

In [1]:
import org.apache.spark

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.174:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1754232169501)
SparkSession available as 'spark'


import org.apache.spark


### Schema definitions

In [2]:
import org.apache.spark.sql.types._
import java.sql.Timestamp

// Explicit read schema for the reviews NDJSON file.
// `time` (top-level and inside `resp`) is read as a raw long; it is converted to a
// timestamp when the file is loaded. Note that the schema printed after loading
// reports every column as nullable, so the `nullable = false` flags below document
// intent rather than being enforced by the JSON read.
val reviewSchema = new StructType()
  .add("user_id", StringType, nullable = true)
  .add("name", StringType, nullable = true)
  .add("time", LongType, nullable = false)
  .add("rating", DoubleType, nullable = true)
  .add("text", StringType, nullable = true)
  .add("pics", ArrayType(StringType), nullable = true)
  .add(
    "resp",
    new StructType()
      .add("time", LongType, nullable = false)
      .add("text", StringType, nullable = true),
    nullable = true
  )
  .add("gmap_id", StringType, nullable = false)

/**
 * Reply attached to a review (the `resp` field of `Review`).
 *
 * @param time timestamp of the reply
 * @param text reply text, if any
 */
case class Response(time: Timestamp, text: Option[String])

/**
 * Typed representation of one review row; field names and order mirror `reviewSchema`.
 *
 * @param user_id identifier of the author; can be missing
 * @param name    display name of the author; can be missing
 * @param time    timestamp of the review
 * @param rating  rating given by the author; can be missing
 * @param text    review text; can be missing
 * @param pics    picture entries attached to the review
 * @param resp    reply to the review, if one exists
 * @param gmap_id identifier of the reviewed business (join key with `Metadata.gmap_id`)
 */
case class Review(
  user_id: Option[String],
  name: Option[String],
  time: Timestamp,
  rating: Option[Double],
  text: Option[String],
  pics: Seq[String],
  resp: Option[Response],
  gmap_id: String
)

import org.apache.spark.sql.types._
import java.sql.Timestamp
reviewSchema: org.apache.spark.sql.types.StructType = StructType(StructField(user_id,StringType,true),StructField(name,StringType,true),StructField(time,LongType,false),StructField(rating,DoubleType,true),StructField(text,StringType,true),StructField(pics,ArrayType(StringType,true),true),StructField(resp,StructType(StructField(time,LongType,false),StructField(text,StringType,true)),true),StructField(gmap_id,StringType,false))
defined class Response
defined class Review


In [3]:
// Explicit read schema for the business metadata NDJSON file.
// The schema printed after loading reports every column as nullable, so the
// `nullable = false` flags below document intent rather than being enforced.
val metadataSchema = new StructType()
  .add("name", StringType, nullable = true)
  .add("address", StringType, nullable = true)
  .add("gmap_id", StringType, nullable = false)
  .add("description", StringType, nullable = true)
  .add("latitude", DoubleType, nullable = false)
  .add("longitude", DoubleType, nullable = false)
  .add("category", ArrayType(StringType), nullable = true)
  .add("avg_rating", DoubleType, nullable = false)
  .add("num_of_reviews", IntegerType, nullable = false)
  .add("price", StringType, nullable = false)
  .add("hours", ArrayType(ArrayType(StringType)), nullable = true)
  .add("MISC", MapType(StringType, ArrayType(StringType)), nullable = false)
  .add("state", StringType, nullable = true)
  .add("relative_results", ArrayType(StringType), nullable = true)
  .add("url", StringType, nullable = false)

/**
 * Typed representation of one business metadata row; field names and order mirror
 * `metadataSchema`.
 *
 * `gmap_id` is the business identifier shared with `Review.gmap_id`.
 * `category`, `hours` and `relative_results` are plain sequences: the load step
 * replaces missing arrays with empty ones before converting to this class.
 * NOTE(review): `price` is declared as a plain `String`, yet a later cell filters on
 * `price != null`, so it can apparently be null in the data.
 */
case class Metadata(
  name: Option[String],
  address: Option[String],
  gmap_id: String,
  description: Option[String],
  latitude: Double,
  longitude: Double,
  category: Seq[String],
  avg_rating: Double,
  num_of_reviews: Int,
  price: String,
  hours: Seq[Seq[String]],
  MISC: Map[String, Seq[String]],
  state: Option[String],
  relative_results: Seq[String],
  url: String
)

metadataSchema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true),StructField(address,StringType,true),StructField(gmap_id,StringType,false),StructField(description,StringType,true),StructField(latitude,DoubleType,false),StructField(longitude,DoubleType,false),StructField(category,ArrayType(StringType,true),true),StructField(avg_rating,DoubleType,false),StructField(num_of_reviews,IntegerType,false),StructField(price,StringType,false),StructField(hours,ArrayType(ArrayType(StringType,true),true),true),StructField(MISC,MapType(StringType,ArrayType(StringType,true),true),false),StructField(state,StringType,true),StructField(relative_results,ArrayType(StringType,true),true),StructField(url,StringType,false))
defined class Metadata


### Dataset load and parse

In [4]:
import java.nio.file.Paths
import org.apache.spark.sql.SparkSession

// Project root: three directory levels above the kernel's working directory.
val projectDir: String = {
  val workingDir = Paths.get(System.getProperty("user.dir"))
  workingDir.getParent.getParent.getParent.toString
}

// Input files (newline-delimited JSON) inside the project's dataset folder.
val reviewsPath = s"$projectDir/dataset/sample-reviews.ndjson"
val metadataPath = s"$projectDir/dataset/metadata.ndjson"

// Reuses the running session when one exists, otherwise creates a local one.
val spark = {
  val sessionBuilder = SparkSession.builder()
    .appName("NDJSON Reader")
    .master("local[*]") // Needed in local mode
  sessionBuilder.getOrCreate()
}

import org.apache.spark.sql.functions.{array, col, lit, struct, timestamp_millis, when}

// Typed Dataset of reviews read from the NDJSON file with the explicit schema.
//
// `time` and `resp.time` hold epoch milliseconds. They are converted with
// `timestamp_millis` instead of `from_unixtime(ms / 1000).cast("timestamp")`:
// the latter formats the instant as a "yyyy-MM-dd HH:mm:ss" string in the session
// time zone and parses it back, which drops the millisecond part and is ambiguous
// for local times that occur twice (DST fall-back hour).
val reviewsDf = spark.read
  .schema(reviewSchema)
  .json(reviewsPath)
  // A missing `pics` array becomes an empty array so that `Review.pics` is never null.
  .withColumn("pics", when(col("pics").isNull, array()).otherwise(col("pics")))
  .withColumn("time", timestamp_millis(col("time")))
  // Rebuild `resp` only when present so that an absent reply stays null (=> None).
  .withColumn(
    "resp",
    when(
      col("resp").isNotNull,
      struct(
        timestamp_millis(col("resp.time")).alias("time"),
        col("resp.text").cast(StringType).alias("text")
      )
    ).otherwise(lit(null))
  )
  .as[Review]

// Typed Dataset of business metadata. The three array columns listed below are
// normalised so that a missing array becomes an empty one before decoding to `Metadata`.
val metadataDf = Seq("category", "hours", "relative_results")
  .foldLeft(spark.read.schema(metadataSchema).json(metadataPath)) { (df, arrayColumn) =>
    df.withColumn(
      arrayColumn,
      when(col(arrayColumn).isNull, array()).otherwise(col(arrayColumn))
    )
  }
  .as[Metadata]

// Show the schemas actually produced by the load step.
reviewsDf.printSchema()
metadataDf.printSchema()

// RDDs of plain tuples (one element per case-class field, in declaration order);
// the exploration cells below access fields positionally (`_._1`, `_._8`, ...).
val reviewsRdd = reviewsDf.rdd.map(review => Review.unapply(review).get)
val metaRdd = metadataDf.rdd.map(business => Metadata.unapply(business).get)

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- resp: struct (nullable = true)
 |    |-- time: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |-- gmap_id: string (nullable = true)

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- gmap_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- avg_rating: double (nullable = true)
 |-- num_of_reviews: integer (nullable = true)
 |-- price: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: 

import java.nio.file.Paths
projectDir: String = /Users/teo/Universita/Magistrale/BIG_DATA/bd-project25
reviewsPath: String = /Users/teo/Universita/Magistrale/BIG_DATA/bd-project25/dataset/sample-reviews.ndjson
metadataPath: String = /Users/teo/Universita/Magistrale/BIG_DATA/bd-project25/dataset/metadata.ndjson
import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5c65976a
reviewsDf: org.apache.spark.sql.Dataset[Review] = [user_id: string, name: string ... 6 more fields]
metadataDf: org.apache.spark.sql.Dataset[Metadata] = [name: string, address: string ... 13 more fields]
reviewsRdd: org.apache.spark.rdd.RDD[(Option[String], Option[String], java.sql.Timestamp, Option[Double], Option[String], Seq[String], Option[Response], S...


## Exploration

---

**Metadata**: (name, address, <ins>gmap_id</ins>, description, latitude, longitude, category, avg_rating, num_of_reviews, price, hours, MISC, state, relative_results, url)

**Review**: (user_id, name, time, rating, text, pics, resp, <ins>gmap_id</ins>)

---

In [5]:
// Reviews whose author name contains "Hossein".
// `_._2` is an Option[String]: `Option.contains(x)` only tests whether the wrapped
// value EQUALS x, so a substring search has to go through `exists` + `String.contains`.
reviewsRdd
  .filter(_._2.exists(_.contains("Hossein")))
  .collect()

res0: Array[(Option[String], Option[String], java.sql.Timestamp, Option[Double], Option[String], Seq[String], Option[Response], String)] = Array()


In [6]:
// Look up the metadata row(s) of a single business by its gmap_id (3rd tuple field).
metaRdd
  .filter(business => business._3 == "0x80dcdbd91ac0ff97:0x40cb80cf24283e4d")
  .collect()

res1: Array[(Option[String], Option[String], String, Option[String], Double, Double, Seq[String], Double, Int, String, Seq[Seq[String]], Map[String,Seq[String]], Option[String], Seq[String], String)] = Array()


In [7]:
// Beware: `rating` (4th tuple field) can be empty! Count the reviews without one.
reviewsRdd.filter(review => !review._4.isDefined).count()

res2: Long = 11232


In [8]:
// Beware: `user_id` (1st tuple field) can be empty! Count the reviews without one.
reviewsRdd.filter(review => !review._1.isDefined).count()

res3: Long = 11232


How many distinct businesses? 1.228.804

In [9]:
// Number of distinct gmap_id values in the metadata.
metaRdd
  .map(business => business._3)
  .distinct()
  .count()

res4: Long = 291314


Average number of ratings per user? [Incomplete data since we are using a sample of the dataset]

In [10]:
// (total reviews written by identified users, number of distinct users).
val avgRatingsPerUser = reviewsRdd
  // user_id can be undefined: keep only the reviews that have one
  .collect { case (Some(userId), _, _, _, _, _, _, _) => userId -> 1 }
  .reduceByKey(_ + _) // [(user_id, #ratings written by user_id)*]
  .values
  .aggregate((0, 0))(
    { case ((reviews, users), writtenByUser) => (reviews + writtenByUser, users + 1) },
    { case ((reviewsA, usersA), (reviewsB, usersB)) => (reviewsA + reviewsB, usersA + usersB) }
  )
s"Average ratings per user: ${avgRatingsPerUser._1.toDouble / avgRatingsPerUser._2}"

avgRatingsPerUser: (Int, Int) = (1809132,1151185)
res5: String = Average ratings per user: 1.5715388925324774


Average number of ratings per business? [Incomplete data since we are using a sample of the dataset]

In [11]:
// (total reviews, number of distinct businesses that received at least one review).
val avgRatingsPerBusiness = reviewsRdd
  .map(review => (review._8, 1))
  .reduceByKey(_ + _) // [(gmap_id, #ratings for gmap_id)*]
  .values
  .aggregate((0, 0))(
    { case ((reviews, businesses), received) => (reviews + received, businesses + 1) },
    { case ((reviewsA, bizA), (reviewsB, bizB)) => (reviewsA + reviewsB, bizA + bizB) }
  )
s"Average ratings per business: ${avgRatingsPerBusiness._1.toDouble / avgRatingsPerBusiness._2}"

avgRatingsPerBusiness: (Int, Int) = (1820364,193159)
res6: String = Average ratings per business: 9.424173867125011


Average response rate? [Incomplete data since we are using a sample of the dataset]

In [12]:
// (reviews that received a reply, total reviews).
val avgResponseRate = reviewsRdd
  .map(review => if (review._7.isDefined) 1 else 0) // 1 when the review has a `resp`
  .aggregate((0, 0))(
    { case ((answered, total), flag) => (answered + flag, total + 1) },
    { case ((answeredA, totalA), (answeredB, totalB)) => (answeredA + answeredB, totalA + totalB) }
  )
s"Average response rate: ${(avgResponseRate._1.toDouble / avgResponseRate._2) * 100} %"

avgResponseRate: (Int, Int) = (244450,1820364)
res7: String = Average response rate: 13.428632954727734 %


### Job 2 Exploration

---

**Metadata**: (name, address, <ins>gmap_id</ins>, description, latitude, longitude, category, avg_rating, num_of_reviews, price, hours, MISC, state, relative_results, url)

**Review**: (user_id, name, time, rating, text, pics, resp, <ins>gmap_id</ins>)

---

In [13]:
/**
 * Maps an average rating to a textual recommendation label.
 *
 * Thresholds (upper bounds inclusive): up to 2.0 "Not recommended", up to 3.5
 * "Discreet", up to 4.5 "Recommended", above 4.5 "Highly recommended".
 *
 * @param rating average rating to classify
 * @return the label; "Undefined" when no comparison holds (i.e. `rating` is NaN)
 */
def ratingToSuggestion(rating: Double): String =
  if (rating <= 2.0) "Not recommended"
  else if (rating <= 3.5) "Discreet"
  else if (rating <= 4.5) "Recommended"
  else if (rating > 4.5) "Highly recommended"
  else "Undefined" // every comparison is false for NaN

ratingToSuggestion: (rating: Double)String


In [None]:
// Print the label for every quarter-point rating from 0.0 to 5.0.
// The values are derived from an Int range: `0.0 to 5.0 by 0.25` relies on Double
// ranges, which are deprecated in Scala 2.12 and no longer available in 2.13.
// Multiples of 0.25 are exactly representable, so the printed values are unchanged.
val rating = (0 to 20).map(_ * 0.25)
rating.foreach(d => println(s"$d - ${ratingToSuggestion(d)}"))

For each business, the average review rating is computed, grouping the reviews by year:

In [15]:
// Spot check on a single business: per-year SUM of the ratings it received
// (reviews without a rating are skipped).
val bussRating = reviewsRdd
  .collect {
    case (_, _, time, Some(stars), _, _, _, gmapId)
        if gmapId == "0x1458a270b3b7bec7:0xadb29e6ff9d981a2" =>
      time.toLocalDateTime.getYear -> stars
  }
  .reduceByKey(_ + _)
  .collect()


bussRating: Array[(Int, Double)] = Array((2018,15.0), (2019,23.0), (2020,8.0), (2021,5.0))


In [27]:
// Average rating per (business, year), keyed by gmap_id: (gmap_id, (year, avgRating)).
// Possible improvement: it may be better to group by year after the reduce.
val businessAvgRating = reviewsRdd
  // skip reviews without a rating
  .collect { case (_, _, time, Some(stars), _, _, _, gmapId) =>
    (gmapId, time.toLocalDateTime.getYear) -> (1, stars)
  }
  // count reviews and sum rating values
  .reduceByKey { case ((countA, sumA), (countB, sumB)) => (countA + countB, sumA + sumB) }
  // calculate avg rating
  .map { case ((gmapId, year), (count, sum)) => gmapId -> (year, sum / count) }

businessAvgRating: org.apache.spark.rdd.RDD[(String, (Int, Double))] = MapPartitionsRDD[71] at map at <console>:36


Aggregating by business category, state and price range, the mean of the businesses' average review ratings is computed:

In [17]:
/**
 * Extracts the two-letter state abbreviation from a postal address.
 *
 * @param address address string, if available
 * @return the first two uppercase letters found between a comma and a 5-digit ZIP
 *         code, or "Unknown" when the address is absent or has no such pattern
 */
def toState(address: Option[String]): String = {
  // This regex captures the state abbreviation between a comma and the ZIP code
  val StateRegex = """,\s*([A-Z]{2})\s+\d{5}""".r
  val state = for {
    addr  <- address
    found <- StateRegex.findFirstMatchIn(addr)
  } yield found.group(1)
  state.getOrElse("Unknown")
}

// Sanity check of `toState` on a sample address; the expected result is "AL".
val addr = "Redbox, 865 Thomas Rd, Warrior, AL 35180"

toState(Option(addr))

toState: (address: Option[String])String
addr: String = Redbox, 865 Thomas Rd, Warrior, AL 35180
res9: String = AL


In [None]:

// For every (category, state, price) group and year: the mean of the per-business
// yearly average ratings, formatted to two decimals, plus its recommendation label.
// Result rows: ((category, state, price), year, formattedAvg, suggestion).
//
// The per-group mean is computed with `reduceByKey` on the composite key
// ((category, state, price), year), carrying (sum, count). This combines values
// before the shuffle instead of materialising every value of a group in memory,
// as `groupByKey` followed by an in-memory `groupBy` would.
val meta = metaRdd
  .filter(_._10 != null) // skip businesses without a price
  .flatMap { business =>
    val gmapId = business._3
    val price = business._10
    val state = toState(business._2) // computed once per business, not once per category
    business._7.map(category => gmapId -> (category, state, price)) // one tuple per category
  } // (gmap_id, (category, state, price))
  .join(businessAvgRating) // (gmap_id, ((category, state, price), (year, avgRating)))
  .map { case (_, (metaInfo, (year, avgRate))) => (metaInfo, year) -> (avgRate, 1) }
  .reduceByKey { case ((sumA, countA), (sumB, countB)) => (sumA + sumB, countA + countB) }
  // To inspect a single group, filter here, e.g.:
  // .filter(_._1._1._1 == "Pizza Takeout")
  // .filter(_._1._1._3 == "$$")
  .map { case ((metaInfo, year), (ratingSum, businesses)) =>
    val avgScore = ratingSum / businesses
    (metaInfo, year, f"$avgScore%.2f", ratingToSuggestion(avgScore))
  }
  .collect()

meta: Array[((String, String, String), Int, String, String)] = Array(((Pizza Takeout,MS,$$),2020,4.55,Highly recommended), ((Pizza Takeout,MS,$$),2017,4.69,Highly recommended), ((Pizza Takeout,MS,$$),2016,4.54,Highly recommended), ((Pizza Takeout,MS,$$),2019,4.08,Recommended), ((Pizza Takeout,MS,$$),2021,4.83,Highly recommended), ((Pizza Takeout,MS,$$),2018,4.02,Recommended), ((Pizza Takeout,WA,$$),2020,4.26,Recommended), ((Pizza Takeout,WA,$$),2017,4.15,Recommended), ((Pizza Takeout,WA,$$),2015,3.74,Recommended), ((Pizza Takeout,WA,$$),2016,4.10,Recommended), ((Pizza Takeout,WA,$$),2019,4.26,Recommended), ((Pizza Takeout,WA,$$),2021,4.39,Recommended), ((Pizza Takeout,WA,$$),2018,4.31,Recommended), ((Pizza Takeout,AL,$$),2020,4.41,Recommended), ((Pizza Takeout,AL,$$),2017,3.93,Recommend...
