# Data exploration

In [53]:
import org.apache.spark

import org.apache.spark


### Schema definitions

In [54]:
import org.apache.spark.sql.types._
import java.sql.Timestamp

// Schema of one raw review record, exactly as it is read from the NDJSON file
// (i.e. before the loading cell converts any column). Built column by column.
val reviewSchema = {
  // Nested struct holding the optional reply attached to a review.
  val responseType = new StructType()
    .add("time", LongType, nullable = false)
    .add("text", StringType, nullable = true)

  new StructType()
    .add("user_id", StringType, nullable = true)
    .add("name", StringType, nullable = true)
    .add("time", LongType, nullable = false)
    .add("rating", DoubleType, nullable = true)
    .add("text", StringType, nullable = true)
    .add("pics", ArrayType(StringType), nullable = true)
    .add("resp", responseType, nullable = true)
    .add("gmap_id", StringType, nullable = false)
}

// Typed counterpart of the nested `resp` struct of a review.
// `time` is a Timestamp here, whereas `reviewSchema` declares the raw column as LongType:
// the dataset-loading cell performs the conversion before calling `.as[Review]`.
case class Response(time: Timestamp, text: Option[String])

// Typed counterpart of one review row; field names and order mirror `reviewSchema`.
// Columns declared nullable in the schema are modelled as Option, except `pics`
// (see below).
case class Review(
  user_id: Option[String],
  name: Option[String],
  time: Timestamp,        // raw LongType column, converted to a timestamp by the loading cell
  rating: Option[Double],
  text: Option[String],
  pics: Seq[String],      // plain Seq: the loading cell replaces a null `pics` with an empty array
  resp: Option[Response], // None when the review has no `resp` struct
  gmap_id: String         // shared with `Metadata.gmap_id` (underlined as the key in the notes below)
)

import org.apache.spark.sql.types._
import java.sql.Timestamp
reviewSchema: org.apache.spark.sql.types.StructType = StructType(StructField(user_id,StringType,true),StructField(name,StringType,true),StructField(time,LongType,false),StructField(rating,DoubleType,true),StructField(text,StringType,true),StructField(pics,ArrayType(StringType,true),true),StructField(resp,StructType(StructField(time,LongType,false),StructField(text,StringType,true)),true),StructField(gmap_id,StringType,false))
defined class Response
defined class Review


In [55]:
// Schema of one raw business-metadata record as read from the NDJSON file.
// Built column by column; the column order matches the `Metadata` case class.
val metadataSchema = new StructType()
  .add("name", StringType, nullable = true)
  .add("address", StringType, nullable = true)
  .add("gmap_id", StringType, nullable = false)
  .add("description", StringType, nullable = true)
  .add("latitude", DoubleType, nullable = false)
  .add("longitude", DoubleType, nullable = false)
  .add("category", ArrayType(StringType), nullable = true)
  .add("avg_rating", DoubleType, nullable = false)
  .add("num_of_reviews", IntegerType, nullable = false)
  .add("price", StringType, nullable = false)
  .add("hours", ArrayType(ArrayType(StringType)), nullable = true)
  .add("MISC", MapType(StringType, ArrayType(StringType)), nullable = false)
  .add("state", StringType, nullable = true)
  .add("relative_results", ArrayType(StringType), nullable = true)
  .add("url", StringType, nullable = false)

// Typed counterpart of one business-metadata row; field names and order mirror `metadataSchema`.
// NOTE(review): `price`, `MISC` and `url` are non-optional here and `nullable = false` in the
// schema, but the schema printed after loading reports every column as nullable — verify that
// these fields can never be null in the data, otherwise they should probably become Option.
case class Metadata(
  name: Option[String],
  address: Option[String],
  gmap_id: String,                // shared with `Review.gmap_id`
  description: Option[String],
  latitude: Double,
  longitude: Double,
  category: Seq[String],          // plain Seq: the loading cell replaces null with an empty array
  avg_rating: Double,
  num_of_reviews: Int,
  price: String,
  hours: Seq[Seq[String]],        // plain Seq: the loading cell replaces null with an empty array
  MISC: Map[String, Seq[String]],
  state: Option[String],
  relative_results: Seq[String],  // plain Seq: the loading cell replaces null with an empty array
  url: String
)

metadataSchema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true),StructField(address,StringType,true),StructField(gmap_id,StringType,false),StructField(description,StringType,true),StructField(latitude,DoubleType,false),StructField(longitude,DoubleType,false),StructField(category,ArrayType(StringType,true),true),StructField(avg_rating,DoubleType,false),StructField(num_of_reviews,IntegerType,false),StructField(price,StringType,false),StructField(hours,ArrayType(ArrayType(StringType,true),true),true),StructField(MISC,MapType(StringType,ArrayType(StringType,true),true),false),StructField(state,StringType,true),StructField(relative_results,ArrayType(StringType,true),true),StructField(url,StringType,false))
defined class Metadata


### Dataset load and parse

In [70]:
import java.nio.file.Paths
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_unixtime

// The project root is three levels above the directory the kernel was started from.
val projectDir = {
  val workingDir = Paths.get(sys.props("user.dir"))
  workingDir.getParent.getParent.getParent
}
// Both dataset files live in the `dataset` folder directly under the project root.
val reviewsPath = Seq(projectDir, "dataset", "sample-reviews.ndjson").mkString("/")
val metadataPath = Seq(projectDir, "dataset", "metadata.ndjson").mkString("/")

// Obtain the Spark session used by the rest of the notebook
// (`getOrCreate` returns the already-running session when there is one).
val spark = SparkSession
  .builder
  .master("local[*]") // Needed in local mode
  .appName("NDJSON Reader")
  .getOrCreate()

// Load the sampled reviews and convert them into the typed `Review` model.
//  - `pics`: a missing list becomes an empty array, so `Review.pics` is never null.
//  - `time` and `resp.time`: the raw value is divided by 1000 (presumably epoch milliseconds
//    -> seconds; confirm against the dataset description) and the numeric result is cast
//    directly to a timestamp. The previous `from_unixtime(...).cast("timestamp")` went through
//    a "yyyy-MM-dd HH:mm:ss" string rendered in the session time zone, which dropped the
//    sub-second part and could map to the wrong instant for local times repeated at a
//    daylight-saving change.
//  - `resp`: rebuilt as a struct with the converted `time`; stays null when absent.
val reviewsDf = spark.read
  .schema(reviewSchema)
  .json(reviewsPath)
  .withColumn("pics", when(col("pics").isNull, array()).otherwise(col("pics")))
  .withColumn("time", (col("time") / 1000).cast("timestamp"))
  .withColumn(
    "resp",
    when(
      col("resp").isNotNull,
      struct(
        (col("resp.time") / 1000).cast("timestamp").alias("time"),
        col("resp.text").cast(StringType).alias("text")
      )
    ).otherwise(lit(null))
  )
  .as[Review]

// Load the business metadata and convert it into the typed `Metadata` model.
// The three array columns get the same treatment: a null value is replaced by an
// empty array so the corresponding `Seq` fields of `Metadata` are never null.
val metadataDf = Seq("category", "hours", "relative_results")
  .foldLeft(spark.read.schema(metadataSchema).json(metadataPath)) { (df, column) =>
    df.withColumn(column, when(col(column).isNull, array()).otherwise(col(column)))
  }
  .as[Metadata]

// Print the resulting schemas (reviews first, then metadata) to check the conversions above.
Seq(reviewsDf, metadataDf).foreach(_.printSchema())

// Unfortunately, it seems that collecting an RDD of case-class instances does not work in this
// kernel: it throws ArrayStoreException
// (see https://github.com/adtech-labs/spylon-kernel/issues/40).
// Workaround: turn every record into a plain tuple, so the RDDs used in the rest of the
// notebook contain only tuples and fields are accessed positionally (`_1`, `_2`, ...).
val reviewsRdd = reviewsDf.rdd
  // Review -> 8-tuple, in the same order as the case-class fields
  .map(Review.unapply(_).get)
  // the 7th field still holds an Option[Response]: flatten it to an Option[(time, text)] tuple
  .map { case review @ (_, _, _, _, _, _, resp, _) => review.copy(_7 = resp.map(Response.unapply(_).get)) }
// Metadata -> 15-tuple, in the same order as the case-class fields
val metaRdd = metadataDf.rdd.map(Metadata.unapply).map(_.get)

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- rating: double (nullable = true)
 |-- text: string (nullable = true)
 |-- pics: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- resp: struct (nullable = true)
 |    |-- time: timestamp (nullable = true)
 |    |-- text: string (nullable = true)
 |-- gmap_id: string (nullable = true)

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- gmap_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- category: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- avg_rating: double (nullable = true)
 |-- num_of_reviews: integer (nullable = true)
 |-- price: string (nullable = true)
 |-- hours: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: 

import java.nio.file.Paths
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.from_unixtime
projectDir: java.nio.file.Path = /Users/lucatassi/Projects/big-data/big-data-project
reviewsPath: String = /Users/lucatassi/Projects/big-data/big-data-project/dataset/sample-reviews.ndjson
metadataPath: String = /Users/lucatassi/Projects/big-data/big-data-project/dataset/metadata.ndjson
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5f67424d
reviewsDf: org.apache.spark.sql.Dataset[Review] = [user_id: string, name: string ... 6 more fields]
metadataDf: org.apache.spark.sql.Dataset[Metadata] = [name: string, address: string ... 13 more fields]
reviewsRdd: org.apache.spark.rdd.RDD[(Option[String], Option[String], java.sql.Timestamp, Option[Do...


## Exploration

---

**Metadata**: (name, address, <ins>gmap_id</ins>, description, latitude, longitude, category, avg_rating, num_of_reviews, price, hours, MISC, state, relative_results, url)

**Review**: (user_id, name, time, rating, text, pics, resp, <ins>gmap_id</ins>)

---

**Remember that we are using a sample of the reviews dataset, so the figures computed below are not representative of the whole dataset.**

In [57]:
// Peek at one response: keep only the reviews whose 7th field (resp) is defined
// and unwrap it in the same step.
reviewsRdd
  .collect { case (_, _, _, _, _, _, Some(response), _) => response } // (time, text)
  .take(1)

res33: Array[(java.sql.Timestamp, Option[String])] = Array((2019-06-10 23:28:09.0,Some(Thanks Shaun! We appreciate your kind remarks and hope to see you again soon!)))


In [58]:
// All reviews whose reviewer name (2nd field) is exactly "Hossein".
reviewsRdd
  .filter { case (_, reviewerName, _, _, _, _, _, _) => reviewerName.contains("Hossein") }
  .collect()

res34: Array[(Option[String], Option[String], java.sql.Timestamp, Option[Double], Option[String], Seq[String], Option[(java.sql.Timestamp, Option[String])], String)] = Array((Some(113587357589866776535),Some(Hossein),2021-04-02 23:48:46.0,Some(5.0),None,List(),None,0x808fbb350a474eb9:0x20e475f4d549a0b7), (Some(113587357589866776535),Some(Hossein),2019-11-15 14:27:45.0,Some(4.0),None,List(),None,0x808fc76a4f691325:0x530bc2080d135950))


In [59]:
// Look up a single business by its gmap_id (3rd field of the metadata tuple).
metaRdd
  .filter { business =>
    val gmapId = business._3
    gmapId == "0x80dcdbd91ac0ff97:0x40cb80cf24283e4d"
  }
  .collect()

res35: Array[(Option[String], Option[String], String, Option[String], Double, Double, Seq[String], Double, Int, String, Seq[Seq[String]], Map[String,Seq[String]], Option[String], Seq[String], String)] = Array((Some(Shell),Some(Shell, 395 W First St, Tustin, CA 92780),0x80dcdbd91ac0ff97:0x40cb80cf24283e4d,None,33.7459242,-117.82758,List(Gas station, Alternative fuel station, ATM, Convenience store, Electric vehicle charging station),4.0,34,$$,List(),Map(Accessibility -> List(Wheelchair accessible entrance)),Some(Open now),List(0x80dcdbe3deac39db:0x624e68a174a293b7, 0x80dcd95ef4e83e17:0xb0a6b7bf134626d5, 0x80dcdc214dd2ddc5:0x933178bf1e6f113a, 0x80dd2674301aa6ab:0x7dfb81933c80facb, 0x80dcda2ad3aee46b:0x9de6be25417d4cb),https://www.google.com/maps/place//data=!4m2!3m1!1s0x80dcdbd91ac0ff97:0...


In [60]:
// Heads-up: `rating` (4th field) is optional — count the reviews that have none.
reviewsRdd
  .map(review => review._4)
  .filter(rating => rating.isEmpty)
  .count()

res36: Long = 9745


In [61]:
// Heads-up: `user_id` (1st field) is optional too — count the reviews that have none.
reviewsRdd
  .filter { case (userId, _, _, _, _, _, _, _) => userId.isEmpty }
  .count()

res37: Long = 9745


How many distinct businesses? 1.228.804

In [62]:
// Number of unique gmap_id values (3rd field) in the metadata.
metaRdd
  .map { business => business._3 }
  .distinct()
  .count()

res38: Long = 1228804


How many distinct categories? 4280

In [63]:
// Every category label that appears in at least one business (7th field is the category list).
val categories = metaRdd
  .flatMap(business => business._7)
  .distinct()
println("Distinct categories: " + categories.count())
categories.collect()

Distinct categories: 4280


categories: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[237] at distinct at <console>:78
res39: Array[String] = Array(Camping store, Photo booth, Drama theater, Suzuki dealer, Squash court, Summer camp, Housing development, Home cinema installation, Beauty products wholesaler, Pile driver, Convenience store, Archery club, Taiwanese restaurant, Steel fabricator, Vacuum cleaner repair shop, Security checkpoint, Athletic club, Calligraphy lesson, Handicraft, Medical examiner, Wildlife refuge, Art handcraft, Bottle & can redemption center, Childrens book store, High ropes course, Local government office, Pet adoption service, Laundry service, Website designer, Road safety town, Diesel fuel supplier, Golf driving range, City tax office, Advertising, Breakfast restaurant, War memorial...


How many businesses for each category?

In [64]:
// Number of businesses listed under each category.
// Counting with `reduceByKey` combines the partial counts on each partition before the
// shuffle; the previous `groupByKey().mapValues(_.size)` shipped every gmap_id across the
// network only to count them. The resulting (category, count) pairs are the same.
metaRdd
  .flatMap(m => m._7.map(_ -> 1)) // [(category, 1)*], one pair per category entry of a business
  .reduceByKey(_ + _) // [(category, #businesses in category)*]
  .collect()

res40: Array[(String, Int)] = Array((Camping store,613), (Photo booth,68), (Drama theater,43), (Suzuki dealer,21), (Squash court,9), (Home cinema installation,99), (Housing development,1201), (Summer camp,669), (Beauty products wholesaler,77), (Pile driver,1), (Convenience store,20663), (Archery club,13), (Taiwanese restaurant,377), (Steel fabricator,266), (Vacuum cleaner repair shop,252), (Security checkpoint,1), (Athletic club,141), (Calligraphy lesson,2), (Bottle & can redemption center,355), (Medical examiner,60), (Wildlife refuge,247), (Art handcraft,164), (Handicraft,74), (Childrens book store,208), (High ropes course,26), (Local government office,187), (Pet adoption service,708), (Laundry service,1381), (Website designer,1125), (Road safety town,4), (Diesel fuel supplier,2110), (...


How many years of reviews?

In [65]:
// Distinct calendar years in which at least one review was written, in ascending order.
val years = reviewsRdd
  .map(_._3.toLocalDateTime.getYear) // 3rd field is the review timestamp
  .distinct()
  .collect()
  .sorted
println(s"Review years: ${years.mkString(", ")}")
// `head`/`last` throw on an empty array, so handle the "no reviews" case explicitly.
(years.headOption, years.lastOption) match {
  case (Some(first), Some(last)) =>
    println(s"Reviews span from $first to $last (${last - first + 1} years)")
  case _ =>
    println("No reviews found: cannot compute the covered time span")
}

Review years: 1990, 1999, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021
Reviews span from 1990 to 2021 (32 years)


years: Array[Int] = Array(1990, 1999, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021)


What is the average number of ratings per user?

In [66]:
// Average number of ratings written by a user, returned as
// (total #ratings by identified users, #distinct users).
// The per-user counts stay distributed (`reduceByKey`) and are folded with the RDD's own
// `aggregate`; the previous `countByKey()` is an action that first materialised the whole
// user -> count map on the driver. Same approach as the per-business cell below.
val avgRatingsPerUser = reviewsRdd
  // user_id can be undefined: keep only reviews that have one
  .collect { case (Some(userId), _, _, _, _, _, _, _) => userId -> 1 }
  .reduceByKey(_ + _) // [(user_id, #ratings written by user_id)*]
  .aggregate((0.0, 0.0))(
    (acc, userCount) => (acc._1 + userCount._2, acc._2 + 1),
    (left, right) => (left._1 + right._1, left._2 + right._2)
  )
"Average ratings per user: " + avgRatingsPerUser._1 / avgRatingsPerUser._2

avgRatingsPerUser: (Double, Double) = (1696279.0,1422429.0)
res42: String = Average ratings per user: 1.1925227902412001


What is the average number of ratings per business?

In [67]:
// Average number of ratings received by a business, returned as
// (total #ratings, #distinct businesses that received at least one rating).
val avgRatingsPerBusiness = reviewsRdd
  .map(review => (review._8, 1)) // 8th field is the gmap_id
  .reduceByKey((a, b) => a + b) // [(gmap_id, #ratings for gmap_id)*]
  .aggregate((0, 0))(
    { case ((ratings, businesses), (_, ratingsForBusiness)) =>
      (ratings + ratingsForBusiness, businesses + 1)
    },
    { case ((ratings1, businesses1), (ratings2, businesses2)) =>
      (ratings1 + ratings2, businesses1 + businesses2)
    }
  )
s"Average ratings per business: ${avgRatingsPerBusiness._1.toDouble / avgRatingsPerBusiness._2}"

avgRatingsPerBusiness: (Int, Int) = (1706024,486237)
res43: String = Average ratings per business: 3.508626451709763


Average response rate?

In [68]:
// Share of reviews that received a response, returned as (#reviews with a response, #reviews).
val avgResponseRate = reviewsRdd
  // [(gmap_id, 1 if response exists, 0 otherwise)*]
  .map { case (_, _, _, _, _, _, resp, gmapId) => gmapId -> resp.fold(0)(_ => 1) }
  .aggregate((0, 0))(
    { case ((answered, total), (_, flag)) => (answered + flag, total + 1) },
    { case ((answered1, total1), (answered2, total2)) => (answered1 + answered2, total1 + total2) }
  )
s"Average response rate: ${(avgResponseRate._1.toDouble / avgResponseRate._2) * 100} %"

avgResponseRate: (Int, Int) = (218759,1706024)
res44: String = Average response rate: 12.8227387187988 %


Average rating per category? This is already a double shuffle operation 😃

In [69]:
// One (gmap_id, category) pair per distinct category of each business.
val businessCategoriesRdd = metaRdd
  .map { business => (business._3, business._7.toSet) } // [(gmap_id, {category*})*]
  .flatMapValues(categories => categories) // [(gmap_id, category)*]

// Average rating of each category: ratings are first summed per business, then attached to
// the business categories through a join, and finally merged per category.
val avgRatingsPerCategory = reviewsRdd
  // Note: `collect` with a partial function is a lazy transformation (a fused filter + map,
  // as on Scala collections); only the argument-less `collect()` at the end of the chain is
  // the eager action that triggers the computation and brings the data to the driver.
  .collect { case (_, _, _, Some(rating), _, _, _, gmapId) => gmapId -> rating } // [(gmap_id, rating)*]
  .aggregateByKey((0.0, 0))(
    { case ((sum, count), rating) => (sum + rating, count + 1) },
    { case ((sum1, count1), (sum2, count2)) => (sum1 + sum2, count1 + count2) }
  ) // [(gmap_id, (ratings sum for gmap_id, #ratings for gmap_id))*]
  .join(businessCategoriesRdd) // [(gmap_id, ((ratings sum, #ratings), category))*]
  .map { case (_, (stats, category)) => category -> stats }
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues(stats => stats._1 / stats._2) // [(category, avg rating for category)*]
  .collect()

businessCategoriesRdd: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[253] at flatMapValues at <console>:81
avgRatingsPerCategory: Array[(String, Double)] = Array((Interior plant service,4.627906976744186), (Eye care,4.6), (Tree farm,4.314285714285714), (Money transfer service,4.499446290143965), (Smart dealer,4.333333333333333), (HIV testing center,4.47244094488189), (Day spa,4.407836153161176), (Photo booth,5.0), (Kung fu school,4.764705882352941), (Housing development,4.186234817813765), (Dry fruit store,4.53125), (Ski rental service,4.458064516129032), (Vocal instructor,4.48), (Home cinema installation,4.6461538461538465), (Car finance and loan company,4.50610568383659), (Oral and maxillofacial surgeon,4.7847533632287), (Convenience store,3.8376722817764164), (Investm...
