In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

from pyspark import sql
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.context import SparkContext
import pyspark.sql.functions as F

In [None]:
# Attach Google Drive to the Colab VM so the project folder is reachable
# through the local filesystem. Only works inside Google Colab.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /gdrive


In [None]:
# Project locations. Change `root` to wherever the repository lives on the
# mounted drive; the other paths are derived from it.
root = "/gdrive/MyDrive/ads_proj5/ads-spring-2022-prj5-group-4/"

# Kept with a trailing slash so `outputs_dir + filename` keeps working.
outputs_dir = root + "output/"

# Input file read by Spark. Built with os.path.join because `root` already ends
# in a slash; plain concatenation with '/data/...' produced a '//' in the path.
JSON_PATH = os.path.join(root, 'data', 'goodreads_reviews_spoiler.json')

In [None]:
# Fail fast, before any Spark work starts, if the configured directories are
# not present on the mounted drive. The root is checked first.
for _required_path, _hint in (
    (root, 'Check the path to your root directory'),
    (outputs_dir, 'Check the path to your outputs directory'),
):
    assert os.path.exists(_required_path), _hint

In [None]:
# Spark configuration: application name plus executor/driver resource settings.
# Every setter returns the conf object, so the calls are chained.
sparkConf = (
    SparkConf()
    .setAppName('CMPE256')
    .set('spark.executor.memory', '16g')
    .set('spark.executor.cores', '5')
    .set('spark.cores.max', '40')
    .set('spark.driver.memory', '12g')
    .set('spark.driver.maxResultSize', '4g')
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Load the raw reviews from JSON and preview the first rows.
dataSetRaw = spark.read.json(JSON_PATH)
dataSetRaw.show(n=15)

+--------+-----------+------+--------------------+--------------------+----------+--------------------+
| book_id|has_spoiler|rating|           review_id|    review_sentences| timestamp|             user_id|
+--------+-----------+------+--------------------+--------------------+----------+--------------------+
|18245960|       true|     5|dfdbb7b0eb5a7e4c2...|[[0, This is a sp...|2017-08-30|8842281e1d1347389...|
|   16981|      false|     3|a5d2c3628987712d0...|[[0, Recommended ...|2017-03-22|8842281e1d1347389...|
|28684704|       true|     3|2ede853b14dc4583f...|[[0, A fun, fast ...|2017-03-20|8842281e1d1347389...|
|27161156|      false|     0|ced5675e55cd9d38a...|[[0, Recommended ...|2016-11-09|8842281e1d1347389...|
|25884323|       true|     4|33273272586313127...|[[0, I really enj...|2016-04-25|8842281e1d1347389...|
|19398490|      false|     4|ea4a220b10e6b5c79...|[[0, A beautiful ...|2016-09-20|8842281e1d1347389...|
|22318578|      false|     5|5fe9882bfe4b0520a...|[[0, 5 stars f

In [None]:
# Rebuild one text per (review_id, class) from the per-sentence annotations.
#
# Each element of `review_sentences` is a two-item array: index 0 is exposed
# below as `class`, index 1 as the sentence text (`reviews`).
#
# `posexplode` is used instead of `explode` so the original position of every
# sentence inside its review is kept. `collect_list` gives no ordering guarantee
# after the groupBy shuffle, so without an explicit sort the sentences could be
# concatenated out of order.
sentencesDF = (
    dataSetRaw
    .select('review_id', F.posexplode('review_sentences').alias('pos', 'reviewNew'))
    .select(
        'review_id',
        'pos',
        F.col('reviewNew')[0].alias('class'),
        F.col('reviewNew')[1].alias('reviews'),
    )
)

# Collect (pos, sentence) structs, sort them by position, then take the sentence
# field back out of the sorted array. concat_ws skips null sentences, as the
# original collect_list did.
orderedSentences = F.sort_array(F.collect_list(F.struct('pos', 'reviews')))

processedDF = sentencesDF.groupBy('review_id', 'class').agg(
    F.concat_ws(' ', orderedSentences.getField('reviews')).alias('review')
)

# Re-attach the review-level columns (book_id, user_id, rating, ...) from the raw frame.
dataSet = processedDF.join(dataSetRaw, on='review_id', how='left')

In [None]:
# Fix the column order for the export and drop the raw `review_sentences`
# array that came along with the join, then preview the result.
columns = [
    'review_id', 'class', 'review',
    'book_id', 'has_spoiler', 'rating', 'timestamp', 'user_id',
]
dataSet = dataSet.select(*columns)
dataSet.show(n=15)

+--------------------+-----+--------------------+--------+-----------+------+----------+--------------------+
|           review_id|class|              review| book_id|has_spoiler|rating| timestamp|             user_id|
+--------------------+-----+--------------------+--------+-----------+------+----------+--------------------+
|00000f32a5bf9f821...|    0|Nicely different ...|30813362|      false|     4|2017-05-09|42ba997ef9931c622...|
|00002838e49ba431a...|    0|I started 'All Th...|18143977|      false|     5|2017-10-15|32df3d3841923febf...|
|000039a2fc73f37e6...|    0|What an awesome r...|12444166|      false|     5|2013-03-26|916fc998255325090...|
|00004f91993abddf3...|    0|OOOOooooooohhhh, ...|18815501|      false|     5|2014-12-22|8b3d7f1d66ccc991e...|
|0000566c9fd53e3c3...|    0|While I thought t...|16176440|      false|     2|2016-07-19|996d9db340128bad9...|
|0000660f0dc7b670e...|    0|Although I didn't...|11735983|      false|     5|2015-01-08|b48f60be86aca1257...|
|0000aae04

In [None]:
# Export the processed reviews as a single CSV in the project's output folder.
# The path is derived from `outputs_dir` so that changing `root` in the config
# cell also redirects this export; for the current root it is the same file as
# the previously hard-coded location.
CSV_PATH = os.path.join(outputs_dir, "goodreads_reviews_spoiler.csv")

# NOTE: toPandas() collects the whole DataFrame onto the driver before writing.
# to_csv keeps its default index=True, so the file's first (unnamed) column is
# the pandas row index - left as-is so existing readers of the CSV are unaffected.
dataSet.toPandas().to_csv(CSV_PATH)