# Assignment: Scalable Processing
## Yelp Reviews and Authenticity

Large Scale Data Analysis | by Maciej Jalocha | macja@itu.dk | 10.03.2025

## Connecting to the Spark Cluster job using the two JobParameters.json

To connect this jupyter notebook with your Spark cluster, we need to tell jupyter how it can access the spark cluster. Below code accomplishes that. Do not worry about how it works, just run the cell once to connect. 

In [1]:
#####################################################################
# DO NOT CHANGE ANYTHING HERE.
# IF YOU HAVE PROBLEMS, CHECK THE ASSIGNMENT GUIDE CAREFULLY 
#####################################################################
from IPython.display import Javascript, display
import jupyterlab
import os, json, pyspark
from pyspark.sql import SparkSession, functions as F
from pyspark.conf import SparkConf
from py4j.protocol import Py4JJavaError


def show_popup(message):
    display(Javascript(f'alert("{message}")'))

def check_correct_file_location():
    items = os.listdir('/work')
    items_expected = ['yelp', 'Home','JobParameters.json', 'emails']
    if sorted(items) != sorted(items_expected):
        items_to_be_moved = [item for item in items if item not in items_expected and item[0] != '.'] # Ignore hidden files starting with .
        show_popup(f"Warning: Found these files {items_to_be_moved} that should (most likely) be moved inside your Home folder. Make sure your Git repository and notebooks are all saved inside your Home folder and not at the 'root'/top of filesystem. Please move your files to prevent them from disappearing.")
    if 'emails' not in items_expected:
        show_popup(f'Error: the folder "emails" does not seem to be accessible - did you remeber to add it to the Spark Cluster job and JupyterLab job?')
    
check_correct_file_location()

SUPPORTED_SPARK_VERSION = "3.3.1"
SUPPORTED_JUPYTERLAB_VERSION = "3.5.1"
if jupyterlab.__version__ != SUPPORTED_JUPYTERLAB_VERSION:
    show_popup(f"Wrong JupyterLab version :( When starting the UCloud job you selected {jupyterlab.__version__} but it should have been {SUPPORTED_JUPYTERLAB_VERSION}")
    show_popup("Please shutdown this JupyterLab job and follow the instructions carefully in the UCloud setup guide PDF on LearnIT") 
elif '_EXECUTED_' in globals(): # Only execute this cell once.
    # check if variable '_EXECUTED_' exists in the global variable namespace
    print("Already been executed once, not running again!")
else:
    print("Cell has not been executed before. Please restart the UCloud jobs if any error message pops up. Running setup cell now.")
    # Two files are automatically read: JobParameters.json for the Spark Cluster job using a temporary spark instance
    # and JobParameters.json for the Jupyter Lab job to extract the hostname of the cluster. 

    MASTER_HOST_NAME = None

    # Open the parameters Jupyter Lab app was launched with
    with open('/work/JobParameters.json', 'r') as file:
        JUPYTER_LAB_JOB_PARAMS = json.load(file)
        # from pprint import pprint; pprint(JUPYTER_LAB_JOB_PARAMS) 
        for resource in JUPYTER_LAB_JOB_PARAMS['request']['resources']:
            if 'hostname' in resource.keys():
                MASTER_HOST_NAME = resource['hostname']
    
    if MASTER_HOST_NAME != "spark-cluster":
        msg = f"The JupyterLab job was started using spark hostname {MASTER_HOST_NAME}. This is not recommended, please start it using spark-cluster instead"
        show_popup(msg)
        print(msg)
    else:
        MASTER_HOST = f"spark://{MASTER_HOST_NAME}:7077"

        conf = SparkConf().setAll([
                ("spark.app.name", 'reading_job_params_app'), 
                ("spark.master", MASTER_HOST),
            ])

        spark = SparkSession.builder.config(conf=conf)\
                                    .getOrCreate()
        
        if spark.version != SUPPORTED_SPARK_VERSION:
            show_popup(f"Wrong Spark Cluster version :( When starting the UCloud job you selected {spark.version} but it should have been {SUPPORTED_SPARK_VERSION}")
            show_popup("Please shutdown this JupyterLab job, the Spark Cluster and follow the instructions carefully in the UCloud setup guide PDF on LearnIT") 

        CLUSTER_PARAMETERS_JSON_DF = spark.read.option("multiline","true").json('/work/JobParameters.json')
        
        # Extract cluster info from the specific JobParameters.json
        NODES = CLUSTER_PARAMETERS_JSON_DF.select("request.replicas").first()[0]
        CPUS_PER_NODE = CLUSTER_PARAMETERS_JSON_DF.select("machineType.cpu").first()[0] - 1
        MEM_PER_NODE = CLUSTER_PARAMETERS_JSON_DF.select("machineType.memoryInGigs").first()[0]

        CLUSTER_CORES_MAX = CPUS_PER_NODE * NODES
        CLUSTER_MEMORY_MAX = MEM_PER_NODE * NODES 
        
        if CPUS_PER_NODE > 1:
            EXECUTOR_CORES = CPUS_PER_NODE - 1  # set cores per executor on worker node
        else:
            EXECUTOR_CORES = CPUS_PER_NODE 

        try:
            EXECUTOR_MEMORY = int(
                MEM_PER_NODE / (CPUS_PER_NODE / EXECUTOR_CORES) * 0.5
            )  # set executor memory in GB on each worker node
        except ZeroDivisionError:
            show_popup(f"Please make sure you selected 3 nodes for the Spark Cluster, each with 24 GB of ram. You selected {MEM_PER_NODE} GB ram and {NODES} node(s)")
            
        # Make sure there is a dir for spark logs
        if not os.path.exists('spark_logs'):
            os.mkdir('spark_logs')
        conf = SparkConf().setAll(
            [
                ("spark.app.name", 'spark_assignment'), # Change to your liking 
                ("spark.sql.caseSensitive", False), # Optional: Make queries strings sensitive to captialization
                ("spark.master", MASTER_HOST),
                ("spark.cores.max", CLUSTER_CORES_MAX),
                ("spark.executor.cores", EXECUTOR_CORES),
                ("spark.executor.memory", str(EXECUTOR_MEMORY) + "g"),
                ("spark.eventLog.enabled", True),
                ("spark.eventLog.dir", "spark_logs"),
                ("spark.history.fs.logDirectory", "spark_logs"),
                ("spark.deploy.mode", "cluster")
            ]
        )

        ## check executor memory, taking into accout 10% of memory overhead (minimum 384 MiB)
        CHECK = (CLUSTER_CORES_MAX / EXECUTOR_CORES) * (
            EXECUTOR_MEMORY + max(EXECUTOR_MEMORY * 0.10, 0.403)
        )

        assert (
            int(CHECK) <= CLUSTER_MEMORY_MAX
        ), "Executor memory larger than cluster total memory!"

        # Stop previous session that was just for loading cluster params
        spark.stop()

        # Start new session with above config, that has better resource handling
        spark = SparkSession.builder.config(conf=conf)\
                                    .getOrCreate()
        sc = spark.sparkContext
        _EXECUTED_ = True
        print("Success!")

Cell has not been executed before. Please restart the UCloud jobs if any error message pops up. Running setup cell now.
Success!


Click on the "SparkMonitor" tab at the top in Jupyter Lab to see the status of running code on the cluster.

## Loading the data
Here we specify where the yelp datasets are located on UCloud and read then using the spark session.

In [2]:
# Read in the business and review files
# This is the path to the shared datasets provided by adding an the dataset input folder
# when submitting the spark cluster job.
business = spark.read.json('file:////work/yelp/yelp_academic_dataset_business.json') # Use the file:/// prefix to indicate we want to read from the cluster's filesystem
business = business.persist()
# Persist 2 commonly used dataframes since they're used for later computations
# https://sparkbyexamples.com/spark/spark-difference-between-cache-and-persist/

users = spark.read.json("file:////work/yelp/yelp_academic_dataset_user.json")
users = users.persist()

reviews = spark.read.json('file:////work/yelp/yelp_academic_dataset_review.json')
reviews = reviews.persist()

# checkin = spark.read.json('file:////work/yelp/yelp_academic_dataset_checkin.json')
# tip = spark.read.json('file:////work/yelp/yelp_academic_dataset_tip.json')


## PySpark example usage

In [None]:
# Show PySpark dataframes:
reviews.columns

In [None]:
business.show()

In [None]:
# Get number of rows with no sampling:
reviews.count()

In [None]:
# OPTIONAL:
# Reduce resource usage and make queries run faster
# by only using a small sample of the dataframe
# and overwriting previous variable "df".
# Useful while developing, not so much to
# provide final answers. Therefore: Remember to 
# to re-read the df when done developing code using
# df = spark.read etc like above.


# Get number of rows after sampling:
reviews.count() 

In [97]:
business.show()

+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
|             address|          attributes|         business_id|          categories|          city|               hours|is_open|     latitude|     longitude|                name|postal_code|review_count|stars|state|
+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+-------+-------------+--------------+--------------------+-----------+------------+-----+-----+
|1616 Chapala St, ...|{null, null, null...|Pns2l4eNsfO8kk83d...|Doctors, Traditio...| Santa Barbara|                null|      0|   34.4266787|  -119.7111968|Abby Rappoport, L...|      93101|           7|  5.0|   CA|
|87 Grasso Plaza S...|{null, null, null...|mpf3x-BjTdTEA3yCZ...|Shipping Centers,...|        Affton|{8:0-18:30, 0:0-0...|      1|   

Example: Say we're only interested in reviews of good mexican restaurants in Arizona. You can delete this when you do your own thing. 

### Example

In [110]:
# Filter to only Arizona businesses with "Mexican" as part of their categories
az_mex = business.filter(business.state == "AZ")\
                .filter(business.categories.rlike("Mexican"))\
                .select("business_id", "name", 'categories')
az_mex.toPandas().to_csv('az_mex.csv', header=True, index=False, encoding='utf-8')
# Join with the reviews
az_mex_rs = reviews.join(az_mex, on="business_id", how="inner")

# Filter to only 5 star reviews
good_az_mex_rs = az_mex_rs.filter(az_mex_rs.stars == 5)\
                        .select("name","text")

# Print the top 20 rows of the DataFrame
good_az_mex_rs.show()

# Convert to pandas (local object) and save to local file system
good_az_mex_rs.toPandas().to_csv("good_az_reviews.csv", header=True, index=False, encoding='utf-8')


+--------------------+--------------------+
|                name|                text|
+--------------------+--------------------+
|Casa Molina Del N...|We've been coming...|
|               Penca|Wow. That was inc...|
|               Penca|Street Tacos in T...|
|               Penca|Last minute decis...|
|             Micha's|The best Carne Se...|
|Street- Taco and ...|Cool lil spot dow...|
|               Penca|A little pricey b...|
|Taqueria Pico De ...|LOVE THIS PLACE!\...|
|Casa Molina Del N...|I love the margar...|
|      El Charro Cafe|Really happy I fo...|
|            BK Tacos|Absolutely love t...|
|St Mary's Mexican...|Best tortillas I'...|
|Charro Steak & De...|Visiting from out...|
|            BK Tacos|Most people go fo...|
|             Micha's|We travel to Tucs...|
|      El Charro Cafe|Had a great dinne...|
|Charro Steak & De...|Excellent night a...|
|Street- Taco and ...|This is a fabulou...|
|Indian Frybread-M...|I love this place...|
|               Penca|It was del

In [99]:
business_sam.select(
    explode(split(col("categories"), ",")).alias("category")
).withColumn("category", trim(col("category"))).distinct().sort('category').show()

+--------------------+
|            category|
+--------------------+
|          Acai Bowls|
|         Accessories|
|         Accountants|
|         Active Life|
|         Acupuncture|
|               Adult|
|     Adult Education|
| Adult Entertainment|
|         Advertising|
|       Aestheticians|
|             African|
|   Air Duct Cleaning|
|            Airlines|
|    Airport Shuttles|
|   Airport Terminals|
|            Airports|
|          Allergists|
|Alternative Medicine|
|Amateur Sports Teams|
|      American (New)|
+--------------------+
only showing top 20 rows



See assignment PDF for task descriptions.

### Samplling

In [12]:
reviews = reviews.sample(withReplacement=False, fraction=1/50)
business = business.sample(withReplacement=False, fraction=1/50)

# Exercises

### Task 3.1.1:

In [53]:
# Write your code here...
business.count()

150346

In [None]:
business.columns

### Task 3.1.2:

In [37]:
# Write your code here...
respected_businesses=business.filter((business.stars >= 5) & (business.review_count >= 500)).select("name", "stars", "review_count")
respected_businesses.toPandas().to_csv("3_1_2_respected_businesses.csv", header=True, index=False, encoding='utf-8')

### Task 3.1.3: 

In [7]:
# Write your code here...
influencers = users.filter(users.review_count >= 1000).select("user_id")
influencers.toPandas().to_csv("3_1_3_influencers.csv", header=True, index=False, encoding='utf-8')

### Task 3.1.4: 

In [None]:
business

In [2]:
# Write your code here...

reviews = reviews.sample(withReplacement=False, fraction=1/50)
business_sam = business.sample(withReplacement=False, fraction=1/50)

NameError: name 'reviews' is not defined

In [9]:
from pyspark.sql.functions import col
counts = business\
.join(reviews, on="business_id", how="inner")\
.join(influencers, on="user_id", how="inner")\
.select("name", "user_id")\
.groupBy('name')\
.count()\
.filter(col('count') >= 5)\
.toPandas().to_csv("3_1_4_occupied_businesses.csv", header=True, index=False, encoding='utf-8')

### Task 3.1.5: 

Find anordered list of users based on the average star counts they have given in all their reviews.

In [None]:
# Write your code here...
from pyspark.sql.functions import col
counts = reviews\
.join(users, on="user_id", how="right")\
.select("stars", "user_id")\
.groupBy('user_id')\
.mean()\
.sort(col('avg(stars)'), ascending=False)\
.show()
# .toPandas().to_csv("3_1_5_ordered_users.csv", header=True, index=False, encoding='utf-8')
# too big to save haha.
#include all users, including with no corresponding reviews.

### Task 3.2.1: Data Exploration

 What is the percentage of reviews that contain a variant of the word "authentic"?

##### Total counts

In [14]:
reviews.count()

6990280

In [9]:
business.count()

150346

In [11]:
users.count()

1987897

In [4]:
reviews.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

### Percentage of authentic and legitimate language

In [5]:
# Write your code here...
from pyspark.sql.functions import col, lower, when, desc, asc

In [9]:
c = lower(col('text'))
n_authentic = reviews\
.filter(c.contains('authentic'))\
.count()
total = reviews.count()
percentage_authentic = n_authentic/total
print(n_authentic)
print(percentage_authentic)

124634
0.01782961483660168


In [14]:
c = lower(col('text'))
n_authentic = reviews\
.filter(c.contains('legitimate'))\
.count()
total = reviews.count()
percentage_authentic \
= n_authentic/total
print(n_authentic)
print(percentage_authentic)

5066
0.000724720612049875


In [10]:
c = lower(col('text'))
n_authentic = reviews\
.filter(c.contains('authentic') | c.contains('legitimate'))\
.count()
total = reviews.count()
percentage_authentic = n_authentic/total
print(n_authentic)
print(percentage_authentic)

129503
0.018526153458802794


In [7]:
print(percentage_authentic)

0.018526153458802794


In [6]:
n_authentic

124634

Grouped by cuisine

In [8]:
# Filter to only Arizona businesses with "Mexican" as part of their categories
c = col('categories')
business = business.withColumn('cuisine',
    when(c.contains("Mexican"), "Mexican")\
    .when(c.contains("Chinese"), "Chinese")\
    .when(c.contains("Thai"), "Thai")\
    .when(c.contains("Japanese"), "Japanese")\
    .when(c.contains("Indian"), "Indian")\
    .when(c.contains("Italian"), "Italian")\
    .when(c.contains("French"), "French")\
    .when(c.contains("Korean"), "Korean")\
    .when(c.contains("Mediterranean"), "Mediterranean")\
    .when(c.contains("Soul"), "Soul")\
    #.when(c.contains("German"), "German")\
    #.when(c.contains("Greek"), "Greek")\
    #.when(c.contains("Spanish"), "Spanish")\
    .otherwise('non-restaurant')
)

#TODO: OVERLAP BETWEEN DIFFERENT CUSINES! especially French/italian!

In [117]:
from pyspark.sql.functions import round
condition = c.contains('legitimate') | c.contains('authentic')
reviews_cuisine_text = business\
.filter(col('cuisine')!='non-restaurant')\
.join(reviews, on="business_id", how="inner")\
.select('cuisine', 'text', 'review_id', 'state', 'city')\
.cache()


c=col('text')
review_mentions_authentic=reviews_cuisine_text\
.withColumn('mentions_authentic', condition)\
.cache()


review_mentions_authentic\
.filter(col('mentions_authentic') == True)\
.groupBy('cuisine')\
.count()\
.withColumnRenamed('count', 'mentions_count')\
.join(
    reviews_cuisine_text\
    .groupBy('cuisine')\
    .count()\
    .withColumnRenamed('count', 'total_count')
    ,
    on='cuisine'
)\
.withColumn('ratio', round(
    col('mentions_count')/col('total_count'),
    2))\
.show()

+-------------+--------------+-----------+-----+
|      cuisine|mentions_count|total_count|ratio|
+-------------+--------------+-----------+-----+
|      Mexican|         31233|     432248| 0.07|
|         Thai|          5605|      93217| 0.06|
|       Indian|          5100|      75821| 0.07|
|      Chinese|         11906|     223663| 0.05|
|     Japanese|          4443|     177474| 0.03|
|      Italian|         11145|     431647| 0.03|
|       Korean|          2119|      30851| 0.07|
|       French|          1243|      82738| 0.02|
|         Soul|          1042|      63684| 0.02|
|Mediterranean|          4504|     108168| 0.04|
+-------------+--------------+-----------+-----+



In [110]:
from pyspark.sql.functions import max_by, struct

restaurant_reviews_on_business = reviews_cuisine_text
window = Window.partitionBy("state").orderBy(col("count").desc())

totals = restaurant_reviews_on_business\
.rollup('state', 'city')\
.count()\
.filter(col('state').isNotNull())\
.withColumn('rank', row_number().over(window))\
.filter(col('rank') <= 2)\
.select('state', 'city', 'count')

city_df = totals.filter(col('city').isNotNull()).select('state', 'city', col('count').alias('city_count'))
state_df = totals.filter(col('city').isNull()).select('state', col('count').alias('state_count'))
totals = city_df.join(state_df, on='state')

auth_totals = restaurant_reviews_on_business\
.filter(condition)\
.rollup('state', 'city')\
.count()\
.filter(col('state').isNotNull())\
.select('state', 'city', 'count')

auth_city_df = auth_totals.filter(col('city').isNotNull()).select('state', 'city', col('count').alias('auth_city_count'))
auth_state_df = auth_totals.filter(col('city').isNull()).select('state', col('count').alias('auth_state_count'))
auth_totals = auth_city_df.join(auth_state_df, on='state')

combined = totals.join(auth_totals, on=['state', 'city'], how='left')
combined = combined\
    .withColumn('city_auth_ratio', round(col('auth_city_count')/col('city_count'),2))\
    .withColumn('state_auth_ratio', round(col('auth_state_count')/col('state_count'),2))\
    #.select('state', 'state_auth_ratio', 'city', 'city_auth_ratio')

combined.show()

NameError: name 'Window' is not defined

### Task 3.2.2: Hypothesis Testing

In [45]:
c=col('text')
review_mentions_dirty=reviews_cuisine_text\
.withColumn('mentions_dirty', c.contains('dirty') | c.contains('kitsch') | c.contains('cheap') | c.contains('rude') | c.contains('simple'))\
.cache()

In [113]:
# now it contains cuisine and boolean variable if two conditions are fulfilled. We can easily operate on it!
c=col('text')
review_mentions_authentic_or_dirty=review_mentions_dirty\
.select('review_id', 'mentions_dirty')\
.join(review_mentions_authentic, on='review_id', how='inner')\
.withColumn('mentions_authentic_dirty', col('mentions_dirty') & col('mentions_authentic'))\
.cache()
authentic_review_mentions_dirty=review_mentions_authentic_or_dirty.filter(col('mentions_authentic'))
authentic_review_mentions_dirty.show(n=5)                                                                                                                      

+--------------------+--------------+-------------+--------------------+-----+---------------+------------------+------------------------+
|           review_id|mentions_dirty|      cuisine|                text|state|           city|mentions_authentic|mentions_authentic_dirty|
+--------------------+--------------+-------------+--------------------+-----+---------------+------------------+------------------------+
|-GehB4C8_DeDOjToM...|         false|Mediterranean|The wife and I ag...|   IN|   Indianapolis|              true|                   false|
|-LcqwMWUQXjVjE21F...|         false|      Mexican|Taco Bus man o ma...|   FL|Treasure Island|              true|                   false|
|-WN9tM07OQF8BjyiV...|         false|      Italian|By far the best a...|   PA|   Philadelphia|              true|                   false|
|0AHkb3ZVcGY-bXNNL...|         false|Mediterranean|Tried this place ...|   PA|   Philadelphia|              true|                   false|
|0CrvG-_VaGmFbKLJU...|     

For the 2x2 contigency table we need:
- to merge them into European and South America + Asian + Soul (SAAS)
- get total
- get margin counts for European
- get margin counts for SAAS
- get margin for 'authenticity + dirty' language
- get margin for non 'authenticity + dirty' language

##### Total

In [85]:
total = authentic_review_mentions_dirty.count()
total

1719511

##### Merge

In [63]:
# Write your code here...
c=col('cuisine')
european_saas_review_mentions_authentic_dirty = authentic_review_mentions_dirty\
.withColumn('global_cuisine', when(c.contains("Italian") | c.contains("French") | c.contains("Mediterranean"), "european")\
            .otherwise("asian_or_south_american"))\
.select('mentions_authentic_dirty', 'global_cuisine')\
.cache()

european_saas_review_mentions_authentic_dirty.show(n=5)

+------------------------+--------------------+
|mentions_authentic_dirty|      global_cuisine|
+------------------------+--------------------+
|                   false|            european|
|                   false|asian_or_south_am...|
|                   false|            european|
|                   false|            european|
|                   false|asian_or_south_am...|
+------------------------+--------------------+
only showing top 5 rows



##### Cell Counts

In [64]:
tmp = european_saas_review_mentions_authentic_dirty.groupby('global_cuisine', 'mentions_authentic_dirty').count()
tmp.show()
tmp=tmp.collect()
print(tmp)

# i = 0
i = 0
european_true = tmp[i][-1]
assert tmp[i][0] == 'european' and tmp[i][1] == True

# i = 1
i = 1
asian_or_south_american_false = tmp[i][-1]
assert tmp[i][0] == 'asian_or_south_american' and tmp[i][1] == False

# i = 2
i = 2
asian_or_south_american_true = tmp[i][-1]
assert tmp[i][0] == 'asian_or_south_american' and tmp[i][1] == True

# i = 3
i = 3
european_false = tmp[i][-1]
assert tmp[i][0] == 'european' and tmp[i][1] == False


+--------------------+------------------------+-----+
|      global_cuisine|mentions_authentic_dirty|count|
+--------------------+------------------------+-----+
|            european|                    true| 1187|
|asian_or_south_am...|                   false|56472|
|asian_or_south_am...|                    true| 4976|
|            european|                   false|15705|
+--------------------+------------------------+-----+

[Row(global_cuisine='european', mentions_authentic_dirty=True, count=1187), Row(global_cuisine='asian_or_south_american', mentions_authentic_dirty=False, count=56472), Row(global_cuisine='asian_or_south_american', mentions_authentic_dirty=True, count=4976), Row(global_cuisine='european', mentions_authentic_dirty=False, count=15705)]


In [16]:
!pip install scipy

Collecting scipy
  Downloading scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.6/37.6 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: scipy
Successfully installed scipy-1.15.2


#### Proper Tests

#### test for independnce
This is just test for independence

$H_0$: The two variables are independent
i.e.
$H_0: (\forall i, j)[p_{ij} = p_i \cdot q_j]$

In [65]:
from scipy.stats import chi2_contingency
table = np.array([[european_true, european_false],
                  [asian_or_south_american_true, asian_or_south_american_false]])
chi2, p, dof, expected = chi2_contingency(table)

print("Chi-square statistic:", chi2)
print("p-value:", p)
print("Degrees of freedom:", dof)
print("Expected frequencies:")
print(expected)
print("Observed frequencies:")
print(table)

Chi-square statistic: 20.81710344243725
p-value: 5.052969628268849e-06
Degrees of freedom: 1
Expected frequencies:
[[ 1328.89195813 15563.10804187]
 [ 4834.10804187 56613.89195813]]
Observed frequencies:
[[ 1187 15705]
 [ 4976 56472]]


Conclusion, is that we reject $H_0$ in favour of $H_1$ so actually these are dependent.

##### Directional test
So here, I treat european and asian_or_south_american as population (for 2x2 it's fine), and start with hypothesis that:

Probability of Authenticity+Dirty language in SAAS is higher than in European restaurant i.e. 

$H_0: p_{11} = p_{21}$ vs $H_A:p_{11}<p_{21}$

In [66]:

probs = table/table.sum(axis=1)[:,np.newaxis]
(p_11, p_12), (p_21, p_22) = probs
probs

array([[0.07026995, 0.92973005],
       [0.08097904, 0.91902096]])

1 check: if indeed the prob for SAAS is higher

In [67]:
assert p_11 < p_21, "H_A is rejected"

2nd check: divide $p$ value by half and get info

In [68]:
print(p/2)

2.5264848141344246e-06


In [43]:
p_11

0.07026995027231826

In [71]:
import numpy as np
from scipy.stats import norm
# Compute sample sizes for each group
n1 = european_true + european_false
n2 = asian_or_south_american_true + asian_or_south_american_false
p_2 = p_21
p_1 = p_11
diff = p2 - p1
SE_unpooled = np.sqrt(p1*(1 - p1)/n1 + p2*(1 - p2)/n2)
z_star = norm.ppf(1 - 0.05/2)
margin = z_star * SE_unpooled
CI_lower = diff - margin
CI_upper = diff + margin
print("Group 1 (European):")
print("  n =", n1, "; p1 =", p1)
print("Group 2 (Asian/South American):")
print("  n =", n2, "; p2 =", p2)
print("Difference in proportions (p1 - p2):", diff)
print("Unpooled Standard Error:", SE_unpooled)
print("95% Confidence Interval for the difference: ({:.4f}, {:.4f})".format(CI_lower, CI_upper))

Group 1 (European):
  n = 16892 ; p1 = 0.07026995027231826
Group 2 (Asian/South American):
  n = 61448 ; p2 = 0.08097903918760578
Difference in proportions (p1 - p2): 0.010709088915287518
Unpooled Standard Error: 0.002253611170865988
95% Confidence Interval for the difference: (0.0063, 0.0151)


As we see, we have enough info to accept H_A in favour of H_0

### Task 3.3: Building a Rating Prediction Model

### Let's recall fields again

In [39]:
users.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [5]:
reviews.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

In [73]:
business.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state',
 'cuisine']

### Model Training

In [9]:
from pyspark.sql.functions import udf
def zero_center_stars(star):
    return star - 1

zero_center_stars_udf = udf(zero_center_stars)


c = col('categories')
business = business.withColumn('cuisine',
    when(c.contains("Mexican"), "Mexican")\
    .when(c.contains("Chinese"), "Chinese")\
    .when(c.contains("Thai"), "Thai")\
    .when(c.contains("Japanese"), "Japanese")\
    .when(c.contains("Indian"), "Indian")\
    .when(c.contains("Italian"), "Italian")\
    .when(c.contains("French"), "French")\
    .when(c.contains("Korean"), "Korean")\
    .when(c.contains("Mediterranean"), "Mediterranean")\
    .when(c.contains("Soul"), "Soul")\
    .otherwise('non-restaurant')
)

In [14]:
from pyspark.sql.functions import col, lower, when, desc, asc
from pyspark.ml.feature import Tokenizer, HashingTF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.sql import functions as f
import tempfile
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Tokenizer, HashingTF, StringIndexer, OneHotEncoder, VectorAssembler

c=col('text')
data = business\
.filter(col('cuisine')!='non-restaurant')\
.select('business_id', 'state')\
.join(reviews, on="business_id", how="inner")\
.withColumn('mentions_dirty', (c.contains('dirty') | c.contains('kitsch') | c.contains('cheap') | c.contains('rude') | c.contains('simple')).cast('integer'))\
.withColumn('mentions_authentic', (c.contains('legitimate') | c.contains('authentic')).cast('integer'))\
.select('text', 'stars', 'state', 'mentions_dirty', 'mentions_authentic', 'review_id')\
.cache()

def stratified_train_test_split(df, frac, label, join_on, seed=42):
    """ stratfied split of a dataframe in train and test set."""
    fractions = df.select(label).distinct().withColumn("fraction", f.lit(frac)).rdd.collectAsMap()
    df_frac = df.stat.sampleBy(label, fractions, seed)
    df_remaining = df.join(df_frac, on=join_on, how="left_anti")
    return df_frac, df_remaining

In [29]:
reviews_sam = data#.sample(withReplacement=False, fraction=1/2)
reviews_sam = reviews_sam.withColumn('stars', zero_center_stars(reviews_sam['stars']))
# reviews_sam.count()

raw_train, raw_test = stratified_train_test_split(reviews_sam, 0.7, label='stars', join_on='review_id')
# text
tokenizer = Tokenizer(inputCol="text", outputCol="words") 
hashingtf = HashingTF(numFeatures=2**14,inputCol=tokenizer.getOutputCol(), outputCol="textFeatures")

feats = {
    'featuresCol': 'allFeatures',
    'labelCol': 'stars'
}
lsvc = LinearSVC(maxIter=10, regParam=0.1, **feats)
ovr = OneVsRest(classifier=lsvc, **feats)
revaluator = RegressionEvaluator(predictionCol='prediction', labelCol='stars')
state_indexer = StringIndexer(inputCol="state", outputCol="stateIndex", handleInvalid="keep")
state_encoder = OneHotEncoder(inputCols=["stateIndex"], outputCols=["stateVec"])

#### Below I compute four model for the four different setups.

In [30]:
# combine
assembler = VectorAssembler(inputCols=["textFeatures"], outputCol="allFeatures")

pipeline = Pipeline(stages=[tokenizer, hashingtf, assembler,ovr])

trained_model_pipeline = pipeline.fit(raw_train)
test_preds = trained_model_pipeline.transform(raw_test)
revaluator.evaluate(test_preds, {revaluator.metricName: "rmse"})

0.9728175752529415

In [31]:
assembler = VectorAssembler(inputCols=["textFeatures", "mentions_dirty", "mentions_authentic"], outputCol="allFeatures")

pipeline = Pipeline(stages=[tokenizer, hashingtf, assembler,ovr])

trained_model_pipeline = pipeline.fit(raw_train)
test_preds = trained_model_pipeline.transform(raw_test)
revaluator.evaluate(test_preds, {revaluator.metricName: "rmse"})

0.9730572774864338

In [33]:

assembler = VectorAssembler(inputCols=["textFeatures", "stateVec"], outputCol="allFeatures")

pipeline = Pipeline(stages=[tokenizer, hashingtf, state_indexer, state_encoder, assembler,ovr])

trained_model_pipeline = pipeline.fit(raw_train)
test_preds = trained_model_pipeline.transform(raw_test)
revaluator.evaluate(test_preds, {revaluator.metricName: "rmse"})

0.9747573691040137

In [34]:
assembler = VectorAssembler(inputCols=["textFeatures", "stateVec", "mentions_dirty", "mentions_authentic"], outputCol="allFeatures")


pipeline = Pipeline(stages=[tokenizer, hashingtf, state_indexer, state_encoder, assembler,ovr])

trained_model_pipeline = pipeline.fit(raw_train)
test_preds = trained_model_pipeline.transform(raw_test)
revaluator.evaluate(test_preds, {revaluator.metricName: "rmse"})

0.9749638409128306

In [35]:
assembler = VectorAssembler(inputCols=["stateVec", "mentions_dirty", "mentions_authentic"], outputCol="allFeatures")


pipeline = Pipeline(stages=[state_indexer, state_encoder, assembler,ovr])

trained_model_pipeline = pipeline.fit(raw_train)
test_preds = trained_model_pipeline.transform(raw_test)
revaluator.evaluate(test_preds, {revaluator.metricName: "rmse"})

1.7961764340587196