# Big Data Final Project Yelp 
## Group Members: Haoning Liu, Jiaqi Chen, Mengqi Liu, Xiaolu Li

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BD4").getOrCreate()
spark

### Sentiment analysis on review

#### 1. Review data

In [2]:
review = spark.read\
  .format('csv')\
  .option('header', 'true')\
  .option('inferSchema', 'true')\
  .load('s3://bigdataclasslhn/Prj_Yelp/review1.csv')

In [3]:
review=review.drop('_c0')

#### Check schema

In [4]:
review.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)
 |-- text: string (nullable = true)
 |-- date: timestamp (nullable = true)



#### Check unique values for each column and whether data has been read in correct format

In [5]:
review.select('business_id').distinct().show()

+--------------------+
|         business_id|
+--------------------+
|E9QTQ4DOKo1UsGNmM...|
|bo3SQVtErnMOqO6lk...|
|uC3qwaxsOkdJzpOc0...|
|mA27CG2U3ytmkxIGV...|
|R_M4P9XetEM-aLE7e...|
|ipFreSFhjClfNETuM...|
|oIEmXWLtoh5blz-iw...|
|f4mh1Y0rnvbJRfQ3j...|
|eUI230JcFZLajIcDd...|
|19m3NtbbP2VX-tDFJ...|
|1UwaMUnVKeWcV14qv...|
|FpFIAW_IEvASZBbus...|
|smmLyq8f_YaxXPwZY...|
|a1TTRvtMCDoxalOU6...|
|7coCBjZNMJ48BD2ta...|
|lt8IW9Bpy9GMeKGxy...|
|cKwg6HFaLYXl7Ar0r...|
|Cl-xl1vTUwHeaGgBx...|
|DEBqmgxv2yhJ93LqG...|
|nqBLuNgN1VrQhi6vU...|
+--------------------+
only showing top 20 rows



In [6]:
review.select('business_id').distinct().count()

76755

In [7]:
review.select('stars').distinct().show()

+-----+
|stars|
+-----+
| null|
|    1|
|    3|
|    5|
|    4|
|    2|
+-----+



In [8]:
review.select('user_id').distinct().show()

+--------------------+
|             user_id|
+--------------------+
|H0tfWQsGjEBuhXD4W...|
|cJlM64tJEdMxirjfw...|
|DgZO2UiUoAJQ5pw5v...|
|UY5kjH85BrmN9YHIb...|
|NFH6lgwwub14W-sR7...|
|Gb9Y_f1xslY1mDvvH...|
|D2ljL5ejuqpa4f8fn...|
|llsbSKHnzAUPByQwz...|
|XAja3Ed6Fa_lwZl_7...|
|ft0uc4tJNjgwVa7F8...|
|sJF4jbF5LyGWmkN8B...|
|Inh6RQ9BjpV05V74O...|
|ttc3_AtKQzrFlFdfs...|
|2Ly_E_OZnJu8-fmpQ...|
|QRqktJWBUi3HL7539...|
|O9Dbdo2LZut3e8VqI...|
|418W7Ufoz3eXSt3A1...|
|2iuocD002C1y2f2f1...|
|C16QOoBorPQwN1oeR...|
|t5Neo4x7mL7j98bia...|
+--------------------+
only showing top 20 rows



In [9]:
review.select('useful').distinct().show()

+------+
|useful|
+------+
|   148|
|    31|
|    85|
|   251|
|   808|
|   137|
|    65|
|    53|
|   255|
|   970|
|   133|
|    78|
|   362|
|   108|
|   155|
|    34|
|   193|
|   101|
|   126|
|   115|
+------+
only showing top 20 rows



In [10]:
review.select('funny').distinct().show()

+-----+
|funny|
+-----+
|  148|
|   31|
|   85|
|  137|
|   65|
|   53|
|  970|
|  133|
|   78|
|  322|
|  108|
|  155|
|   34|
|  101|
|  115|
|   81|
|   28|
|  183|
|  412|
|   76|
+-----+
only showing top 20 rows



In [11]:
review.select('date').distinct().show()

+-------------------+
|               date|
+-------------------+
|2016-01-18 19:56:59|
|2017-03-05 16:18:31|
|2016-06-14 13:13:16|
|2016-08-13 17:44:31|
|2014-05-11 01:27:24|
|2016-08-27 01:54:59|
|2014-06-30 18:05:22|
|2015-12-07 19:21:07|
|2016-01-06 14:48:30|
|2011-05-31 21:33:05|
|2014-09-27 18:33:15|
|2018-03-18 00:52:38|
|2012-12-07 21:28:36|
|2014-08-12 01:44:53|
|2017-09-10 06:54:32|
|2017-07-10 04:30:44|
|2014-08-26 21:02:47|
|2013-11-16 20:54:19|
|2017-12-03 16:57:03|
|2016-05-08 21:21:49|
+-------------------+
only showing top 20 rows



In [12]:
from pyspark import SparkContext 
from pyspark.sql import SQLContext
from pyspark.sql.functions import count

In [13]:
review.groupBy('stars').agg(count('review_id').alias('count')).sort('stars').show()

+-----+-------+
|stars|  count|
+-----+-------+
| null|     28|
|    1| 368956|
|    2| 201273|
|    3| 276740|
|    4| 552435|
|    5|1100596|
+-----+-------+



In [14]:
review = review.filter(review.stars.isNotNull())

In [15]:
review.groupBy('stars').agg(count('review_id').alias('count')).sort('stars').show()

+-----+-------+
|stars|  count|
+-----+-------+
|    1| 368956|
|    2| 201273|
|    3| 276740|
|    4| 552435|
|    5|1100596|
+-----+-------+



From the summary table, we can see the majority of the reviews focuses on the positive sides. People tend to leave reviews when they have a good expression on their experience. Meanwhile, the number of extremely negative reviews (1 star) also much larger than the number of neutral reviews (2 or 3 stars).

## Data preprossing

Relabel the stars so that any reviews with 4 stars or above will be 1, which means positive, anything else is deemed to be 0,which means negative reviews. 

We begin to do text processings before tokenizing the text.

In [16]:
import re
import string
# remove punctuation
def remove_punctuation(text):
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    clean = regex.sub(" ", str(text))
    return clean

In [17]:
# relabel target variable
def convert_rate(rate):
    rate = int(rate)
    if rate >=4: 
        return 1
    else: 
        return 0

In [18]:
from pyspark.sql.functions import udf, expr, concat, col

In [19]:
punct_remover = udf(lambda x: remove_punctuation(x))
rating_convert = udf(lambda x: convert_rate(x))

In [20]:
from pyspark.sql.types import IntegerType
# apply to review raw data
review_df = review.select('review_id', punct_remover('text'), rating_convert('stars'))

review_df = review_df.withColumnRenamed('<lambda>(text)', 'text')\
                     .withColumn('label', review_df["<lambda>(stars)"].cast(IntegerType()))\
                     .drop('<lambda>(stars)')



In [21]:
review_df.show()

+--------------------+--------------------+-----+
|           review_id|                text|label|
+--------------------+--------------------+-----+
|Q1sbwvVQXV2734tPg...|Total bill for th...|    0|
|GJXCdrto3ASJOqKeV...|I  adore  Travis ...|    1|
|2TzJjDVDEuAW6MR5V...|I have to say tha...|    1|
|yi0R0Ugj_xUx_Nek0...|Went in for a lun...|    1|
|11a8sVPMUFtaC7_AB...|Today was my seco...|    0|
|fdiNeiN_hoCxCMy2w...|I ll be the first...|    1|
|G7XHMxG0bx9oBJNEC...|Tracy dessert had...|    0|
|8e9HxxLjjqc9ez5ez...|This place has go...|    0|
|qrffudO73zsslZbe8...|I was really look...|    0|
|RS_GTIT6836bCaPy6...|It s a giant Best...|    0|
|kbtscdyz6lvrtGjD1...|Like walking back...|    1|
|-I5umRTkhw15RqpKM...|Walked in around ...|    0|
|Z7wgXp98wYB57QdRY...|Wow  So surprised...|    1|
|qlXw1JQ0UodW7qrmV...|Michael from Red ...|    1|
|JVcjMhlavKKn3UIt9...|I cannot believe ...|    0|
|svK3nBU7Rk8VfGorl...|You can t really ...|    1|
|1wVA2-vQIuW_ClmXk...|Great lunch today...|    1|


### Tokenize

In [22]:
from pyspark.ml.feature import *
#make tokens and remove stopwords
tok = Tokenizer(inputCol="text", outputCol="words")
review_tokenized = tok.transform(review_df)


In [23]:
from nltk.corpus import stopwords
#stop word remover
stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

In [24]:
review_tokenized.show()

+--------------------+--------------------+-----+--------------------+--------------------+
|           review_id|                text|label|               words|           words_nsw|
+--------------------+--------------------+-----+--------------------+--------------------+
|Q1sbwvVQXV2734tPg...|Total bill for th...|    0|[total, bill, for...|[total, bill, hor...|
|GJXCdrto3ASJOqKeV...|I  adore  Travis ...|    1|[i, , adore, , tr...|[, adore, , travi...|
|2TzJjDVDEuAW6MR5V...|I have to say tha...|    1|[i, have, to, say...|[say, office, rea...|
|yi0R0Ugj_xUx_Nek0...|Went in for a lun...|    1|[went, in, for, a...|[went, lunch, , s...|
|11a8sVPMUFtaC7_AB...|Today was my seco...|    0|[today, was, my, ...|[today, second, t...|
|fdiNeiN_hoCxCMy2w...|I ll be the first...|    1|[i, ll, be, the, ...|[ll, first, admit...|
|G7XHMxG0bx9oBJNEC...|Tracy dessert had...|    0|[tracy, dessert, ...|[tracy, dessert, ...|
|8e9HxxLjjqc9ez5ez...|This place has go...|    0|[this, place, has...|[place, go

### Trigrams with Frequency 

We decided to use trigram to conduct sentiment analysis and want to select trigram frequency larger than 10

In [25]:
# create 3 gram columns
n = 3
ngram = NGram(inputCol = 'words', outputCol = 'threegram', n = n)
add_ngram = ngram.transform(review_tokenized)

# generate the top frequent ngram
ngrams = add_ngram.rdd.flatMap(lambda x: x[-1]).filter(lambda x: len(x.split())==n)
ngram_tall = ngrams.map(lambda x: (x, 1))\
                      .reduceByKey(lambda x,y: x+y)\
                      .sortBy(lambda x: x[1], ascending=False)\
                      .filter(lambda x: x[1]>=10)
ngram_list = ngram_tall.map(lambda x: x[0]).collect()

In [26]:
# replace the word with selected ngram
def ngram_concat(text):
    textlower = text.lower()
    for ngram in ngram_list:
        return textlower.replace(ngram, ngram.replace(' ', '_'))

In [28]:
ngram_df = udf(lambda x: ngram_concat(x))
ngram_df = review_tokenized.select(ngram_df('text'), 'label')\
                          .withColumnRenamed('<lambda>(text)', 'text')

In [29]:
# tokenize and remove stop words with ngram
tok = Tokenizer(inputCol="text", outputCol="words")
review_tokenized = tok.transform(review_df)
tokenized_ngram = tok.transform(ngram_df)
tokenized_ngram = stopword_rm.transform(tokenized_ngram)

stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

In [30]:

# count vectorizer and tfidf
cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
cvModel = cv.fit(review_tokenized)
count_vectorized = cvModel.transform(review_tokenized)

#tfidfModel = IDF.fit(count_vectorized)
#tfidf_df = tfidfModel.transform(count_vectorized)

In [31]:
# create TF-IDF matrix
idf_ngram = IDF().setInputCol('tf').setOutputCol('tfidf_ngram')
tfidfModel_ngram = idf_ngram.fit(count_vectorized)
tfidf_df_ngram = tfidfModel_ngram.transform(count_vectorized )

In [32]:
# split into training and testing set
splits = tfidf_df_ngram .select(['tfidf_ngram', 'label']).randomSplit([0.8,0.2])
train = splits[0].cache()
test = splits[1].cache()

In [33]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD

In [34]:
from pyspark.ml import linalg as ml_linalg
from pyspark.mllib.linalg import Vectors as MLLibVectors

def as_mllib(v):
    if isinstance(v, ml_linalg.SparseVector):
        return MLLibVectors.sparse(v.size, v.indices, v.values)
    elif isinstance(v, ml_linalg.DenseVector):
        return MLLibVectors.dense(v.toArray())
    else:
        raise TypeError("Unsupported type: {0}".format(type(v)))


In [35]:
from pyspark.mllib.util import MLUtils
# Convert feature matrix to LabeledPoint vectors
train_lb = train.rdd.map(lambda row: LabeledPoint(row[1], as_mllib(row[0])))
test_lb = test.rdd.map(lambda row: LabeledPoint(row[1], as_mllib(row[0])))

In [36]:
# SVM model
from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD
from numpy import array
from pyspark.mllib.util import MLUtils
numIterations = 100
regParam = 0.3
svm = SVMWithSGD.train(train_lb, numIterations, regParam=regParam)

In [38]:
# Model prediction
scoreAndLabels_test = test_lb.map(lambda x: (float(svm.predict(x.features)), x.label))
score_label_test = spark.createDataFrame(scoreAndLabels_test, ["prediction", "label"])

In [39]:
# Model evaluation
# F1 score
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
f1_score = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
svm_f1 = f1_score.evaluate(score_label_test)
print("F1 score: %.4f" % svm_f1)

F1 score: 0.8848


In [40]:
spark.stop()