# SIADS 516: Homework 4

- **Dr. Chris Teplovs**, School of Information, University of Michigan
- **Kris Steinhoff**, School of Information, University of Michigan


In [1]:
# The AutograderHelper class provides methods used by the autograder.
from autograder_helper import AutograderHelper

In [2]:
# Autograder cell. This cell is worth 0 points.
# This cell has hidden code used to configure the autograder.

This homework assignment uses the Yelp Academic dataset, with which you should now be familiar.
We have created a few cells to get you started, but you're largely on your own to devise solutions to the
"real-world" questions below.

In this assignment, provide solutions that use spark.sql() calls to query the dataset. For example, to find the answer to "How many users have more than 100 "cool" votes?", this:
```
query = """
SELECT count(*) FROM user WHERE cool > 100
"""
spark.sql(query).show()
```
is similar to:
```
user.filter('cool > 100').show()
```
But in this assignment, use the first approach. The autograder will check for the use of `spark.sql()`

Our usual Spark mantra:

In [3]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName('My First Spark application') \
    .getOrCreate() 

sc = spark.sparkContext

22/07/25 15:07:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Load the JSON files:

In [4]:
business = spark.read.json('../../assets/data/yelp_academic/yelp_academic_dataset_business.json.gz')
checkin = spark.read.json('../../assets/data/yelp_academic/yelp_academic_dataset_checkin.json.gz')
review = spark.read.json('../../assets/data/yelp_academic/yelp_academic_dataset_review.json.gz')
tip = spark.read.json('../../assets/data/yelp_academic/yelp_academic_dataset_tip.json.gz')
user = spark.read.json('../../assets/data/yelp_academic/yelp_academic_dataset_user.json.gz')

22/07/25 15:07:47 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
                                                                                

Create temp views for the DataFrames:

In [5]:
business.createOrReplaceTempView("business")
checkin.createOrReplaceTempView("checkin")
tip.createOrReplaceTempView("tip")
review.createOrReplaceTempView("review")
user.createOrReplaceTempView("user")

---

## -- EXAMPLE PROBLEM --

Get a list of users named "Kahlil" with the number of their reviews tagged "funny".

- The result should have these columns:
  - `user_id`
  - `name`
  - `funny`
- The result rows do NOT need to be ordered

In [6]:
# Solve the problem by assigning populating the provided variable 
# with the result of the Spark SQL query

def users_kahlil():
    return spark.sql("""\
        SELECT user_id, name, funny
        FROM user
        WHERE name = "Kahlil"
        """)

In [7]:
# It can be helpful to look at the result with .show()

results = users_kahlil()
results.show()

[Stage 5:>                                                          (0 + 1) / 1]

+--------------------+------+-----+
|             user_id|  name|funny|
+--------------------+------+-----+
|HE5fZW8m7MpdLHa3H...|Kahlil|   32|
|BAX7MdujQiv_Camqi...|Kahlil|    0|
|fepcVUPERVRA16b4M...|Kahlil|    0|
|uvG9MAZF6vIVBoj24...|Kahlil|    4|
|sEQtegzBDjARGB_YM...|Kahlil|    0|
|JpOCv0TtT2nz0gv0S...|Kahlil|    0|
+--------------------+------+-----+



                                                                                

In [8]:
# This notebook provides several asserts for each problem. 
#
# There are also hidden tests that are run by the autograder after submission.

assert type(users_kahlil()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

AutograderHelper.assert_function_calls(users_kahlil, ["spark.sql"])

users_kahlil_ids = [r["user_id"] for r in users_kahlil().collect()]


                                                                                

In [9]:
assert len(users_kahlil_ids) == 6, \
    "The result must have 6 rows."

expected_user_id = "HE5fZW8m7MpdLHa3HGp1FA"
assert expected_user_id in users_kahlil_ids, f'The user_id column should include "{expected_user_id}"'

---

## -- USERS WITH 500 FANS --

Determine how many users have more than 500 fans.

- The result should have 1 column and 1 row
- The name of the column does not matter

In [11]:
def count_users_500_fans():
    # YOUR CODE HERE
#     raise NotImplementedError()

    query="""
          SELECT COUNT(*) FROM user WHERE fans>500    
    """
    
    return spark.sql(query)

In [12]:
assert type(count_users_500_fans()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

AutograderHelper.assert_function_calls(count_users_500_fans, ["spark.sql"])

count_users_500_fans_submitted = count_users_500_fans().collect()[0][0]

                                                                                

In [13]:
assert count_users_500_fans_submitted != 8286, \
    "That is the number of users who have more than 500 funny ratings."

In [14]:
# Autograder cell. This cell is worth 2 points (out of 20). This cell contains hidden tests.

## -- BUSINESS REVIEWS --

Determine how many businesses have at least 4 stars and at least 100 reviews.

- The result should have 1 column and 1 row
- The name of the column does not matter

In [16]:
def business_reviews_count():

    # YOUR CODE HERE
#     raise NotImplementedError()

    query="""
          SELECT COUNT(*) FROM business WHERE stars>=4 AND review_count>=100
    """
    
    return spark.sql(query)

In [17]:
assert type(business_reviews_count()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

AutograderHelper.assert_function_calls(business_reviews_count, ["spark.sql"])

business_reviews_count_submitted = business_reviews_count().collect()[0][0]


                                                                                

In [18]:
assert business_reviews_count_submitted != 2814, \
    (
        "2814 is the number of businesses with greater than 4 stars (you should include ones with 4 stars) "
        "and greater than 100 reviews (you should include ones with 100 reviews)."
    )

assert business_reviews_count_submitted != 7397, \
    (
        "7397 is the number of businesses with at least 4 stars and greater than 100 reviews (you should "
        "include ones with 100 reviews)."
    )
assert business_reviews_count_submitted != 2842, \
    (
        "2842 is the number of businesses with greater than 4 stars (you should include ones with 4 stars) "
        "and at least 100 reviews."
    )

In [19]:
# Autograder cell. This cell is worth 2 points (out of 20). This cell contains hidden tests.

## -- LITCHFIELD OHIO --

Get a list of businesses from Litchfield, OH. 

- The result should have these columns:
  - `business_id`
  - `name`
- The result rows do NOT need to be ordered

In [21]:
def litchfield_oh_businesses():
    # YOUR CODE HERE
#     raise NotImplementedError()

    query="""
          SELECT business_id, name FROM business WHERE city='Litchfield' AND state='OH'          
    """
    
    return spark.sql(query)

In [22]:
AutograderHelper.assert_function_calls(litchfield_oh_businesses, ["spark.sql"])

assert type(litchfield_oh_businesses()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

litchfield_oh_business_names = [r["name"] for r in litchfield_oh_businesses().collect()]

                                                                                

In [23]:
assert "Tonios Pizza" in litchfield_oh_business_names, "'Tonios Pizza' should appear in the result."
assert "Hayseed" not in litchfield_oh_business_names, "'Hayseed' should not appear in the result."

In [24]:
# Autograder cell. This cell is worth 2 points (out of 20). This cell contains hidden tests.

## -- US STATES --

Determine which US states are represented in the data set. (The file `../../assets/data/states.csv` contains a list of US state names and abbreviations.)

- The result should have this columns:
  - `state` (the full name of the state in the dataset)
- The result rows do NOT need to be ordered

In [62]:
def states_names_in_data():
    # YOUR CODE HERE
#     raise NotImplementedError()

    states=spark.read.csv("../../assets/data/states.csv", header=True)
    states.createOrReplaceTempView("states")
    
    query="""
          SELECT DISTINCT states.state FROM states INNER JOIN business ON states.abbreviation=business.state
    """
    
    return spark.sql(query)

In [63]:
AutograderHelper.assert_function_calls(states_names_in_data, ["spark.sql"])

assert type(states_names_in_data()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

state_names_list = [r["state"] for r in states_names_in_data().collect()]

                                                                                

In [64]:
assert "North Carolina" in state_names_list, "North Carolina should appear in the result."
assert "Michigan" not in state_names_list, "Michigan should appear in the result."

In [65]:
# Autograder cell. This cell is worth 3 points (out of 20). This cell contains hidden tests.

## -- FUNNIEST REVIEW --

Determine the text of the funniest review.

- The result should have 1 column and 1 row
- The name of the column does not matter

In [107]:
def funniest_review():
    # YOUR CODE HERE
#     raise NotImplementedError()
    
    query="""
          SELECT text FROM review WHERE funny=
          (
              SELECT MAX(funny) FROM review
          )
    """
    
    return spark.sql(query)

In [108]:
AutograderHelper.assert_function_calls(funniest_review, ["spark.sql"])

assert type(funniest_review()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

funniest_review_first_row = funniest_review().take(1)[0]

                                                                                

In [109]:
funniest_review_len = len(funniest_review_first_row[0])
assert funniest_review_len == 421, \
    f"Hint: the funniest review has 421 characters (found {funniest_review_len})"

In [110]:
# Autograder cell. This cell is worth 2 points (out of 20). This cell contains hidden tests.

## -- REVIEW WORD COUNT -- 

Find the 10 reviews with the largest word counts.

- Use a UDF to determine review word counts using the Python `.split()` method on the string object.
- The result should have this columns:
  - `word_count`

In [117]:
def reviews_top_10_word_counts():
    # YOUR CODE HERE
#     raise NotImplementedError()

    from pyspark.sql.functions import udf
    from pyspark.sql.types import IntegerType
    
    def word_counts(string):
        return len(string.split())
    
    word_counts_int=udf(lambda x: word_counts(x), IntegerType())
    
    spark.udf.register("word_counts_int", word_counts_int)
    
    query="""
          SELECT word_counts_int(text) AS word_count FROM review 
          ORDER BY word_count DESC 
          LIMIT 10
    """
    
    return spark.sql(query)

In [118]:
AutograderHelper.assert_function_calls(reviews_top_10_word_counts, ["spark.sql"])

assert type(reviews_top_10_word_counts()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

reviews_top_10_word_counts_first_row = reviews_top_10_word_counts().take(1)[0]

                                                                                

In [119]:
assert reviews_top_10_word_counts_first_row["word_count"] == 1056, "The first word_count should be 1056"

In [120]:
# Autograder cell. This cell is worth 3 points (out of 20). This cell contains hidden tests.

## -- MOST TIPS --

Determine the names of the top 100 users who provided the most tips.

- The result should have these columns:
  - `name`
  - `tip_count`
- The result should be sorted by highest-to-lowest tip_count, in the case of tip_count ties, the results should be sorted by name alphabetically. For example (this is fake data):
  ```
  +--------+---------+
  |    name|tip_count|
  +--------+---------+
  | Weifong|      167|
  |   Alice|       42|
  |     Bob|       42|
  |   Jamal|        3|
  +--------+---------+
  ```

In [70]:
def users_top_100_tip_count():
    # YOUR CODE HERE
#     raise NotImplementedError()

    query="""
          SELECT name, tip_count FROM
          (
              SELECT user.name as name, COUNT(tip.user_id) AS tip_count 
              FROM user INNER JOIN tip
              ON user.user_id=tip.user_id
              GROUP BY name, tip.user_id
              ORDER BY tip_count DESC, name
          )
          LIMIT 100
        """

    return spark.sql(query)

In [71]:
AutograderHelper.assert_function_calls(users_top_100_tip_count, ["spark.sql"])

assert type(users_top_100_tip_count()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

users_top_100_tip_count_first_row = users_top_100_tip_count().take(1)[0]

                                                                                

In [72]:
assert users_top_100_tip_count_first_row["name"] == "Momo", "The first name should be Momo"
assert users_top_100_tip_count_first_row["tip_count"] == 2439, "The first tip_count should be 2439"

In [73]:
# Autograder cell. This cell is worth 3 points (out of 20). This cell contains hidden tests.

## -- ARIZONA SUMMARY -- 

List the names, number of reviews of businesses in Arizona ('AZ') and total number of reviews of the top 10 users (as determined by who has created the most number of reviews of businesses in Arizona). Include a column that shows the percentage of reviews that are of businesses from Arizona. 

- The result should have these columns:
  - `name`
  - `az_count`
  - `total_count`
  - `percent` (this will only be checked to within 0.01)
- The result should be sorted by highest-to-lowest `az_count`, in the case of `az_count ties`, the results should be sorted by highest-to-lowest `percent`



The first row of the results should be:
```
+--------+--------+-----------+---------+
|    name|az_count|total_count|  percent|
+--------+--------+-----------+---------+
|    Brad|    1637|       1642|99.695496|
+--------+--------+-----------+---------+
```

In [11]:
business.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [8]:
review.first()

Row(business_id='ujmEBvifdJM6h6RLv4wQIg', cool=0, date='2013-05-07 04:34:36', funny=1, review_id='Q1sbwvVQXV2734tPgoKj4Q', stars=1.0, text='Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.', useful=6, user_id='hG7b0MtEbXx5QzbzE6C_VA')

In [20]:
review.columns

['business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id']

In [9]:
user.first()

Row(average_stars=4.03, compliment_cool=1, compliment_cute=0, compliment_funny=1, compliment_hot=2, compliment_list=0, compliment_more=0, compliment_note=1, compliment_photos=0, compliment_plain=1, compliment_profile=0, compliment_writer=2, cool=25, elite='2015,2016,2017', fans=5, friends='c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g, ajcnq75Z5xxkvUSmmJ1bCg, BSMAmp2-wMzCkhTfq9ToNg, jka10dk9ygX76hJG0gfPZQ, dut0e4xvme7QSlesOycHQA, l4l5lBnK356zBua7B-UJ6Q, 0HicMOOs-M_gl2eO-zES4Q, _uI57wL2fLyftrcSFpfSGQ, T4_Qd0YWbC3co6WSMw4vxg, iBRoLWPtWmsI1kdbE9ORSA, xjrUcid6Ymq0DoTJELkYyw, GqadWVzJ6At-vgLzK_SKgA, DvB13VJBmSnbFXBVBsKmDA, vRP9nQkYTeNioDjtxZlVhg, gT0A1iN3eeQ8EMAjJhwQtw, 6yCWjFPtp_AD4x93WAwmnw, 1dKzpNnib-JlViKv8_Gt5g, 3Bv4_JxHXq-gVLOxYMQX0Q, ikQyfu1iViYh8T0us7wiFQ, f1GGltNaB7K5DR1jf3dOmg, tgeFUChlh7v8bZFVl2-hjQ, -9-9oyXlqsMG2he5xIWdLQ, Adj9fBPVJad8vSs-mIP7gw, Ce49RY8CKXVsTifxRYFTsw, M1_7TLi8CbdA89nFLlH4iw, wFsNv-hqbW_F5-IRqfBN6g, 0Q1L7zXHocaUZ2gsG2XJeg, cBFgmOCBdhYa0xoFEAzp_g, VrD_AgiFvzqtl

In [10]:
user.columns

['average_stars',
 'compliment_cool',
 'compliment_cute',
 'compliment_funny',
 'compliment_hot',
 'compliment_list',
 'compliment_more',
 'compliment_note',
 'compliment_photos',
 'compliment_plain',
 'compliment_profile',
 'compliment_writer',
 'cool',
 'elite',
 'fans',
 'friends',
 'funny',
 'name',
 'review_count',
 'useful',
 'user_id',
 'yelping_since']

In [35]:
def arizona_summary():
    # YOUR CODE HERE
#     raise NotImplementedError()

#     query="""
#           SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#           (
#           SELECT user.name AS name, SUM(business.review_count) AS az_count, SUM(user.review_count) as total_count FROM
#           business INNER JOIN review
#           ON business.business_id=review.business_id
#           INNER JOIN user
#           ON user.user_id=review.user_id
#           WHERE business.state='AZ'
#           GROUP BY user.name, review.user_id
#           )
#           ORDER BY az_count DESC, percent DESC
#     """

#     query="""
#               SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#               (
#               SELECT user.name AS name, SUM(business.review_count) AS az_count, SUM(user.review_count) as total_count FROM 
#               user INNER JOIN review 
#               ON user.user_id=review.user_id 
#               INNER JOIN business
#               ON business.business_id=review.business_id
#               WHERE business.state='AZ'
#               )
#               ORDER BY az_count DESC, percent DESC
#         """

#     query="""
#               SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#               (
#               SELECT user.name AS name, az_count=
#               (
#               SELECT SUM(business.review_count) FROM business WHERE business.state='AZ'
#               ), 
#               SUM(user.review_count) as total_count FROM
#               user INNER JOIN review 
#               ON user.user_id=review.user_id 
#               INNER JOIN business
#               ON business.business_id=review.business_id
#               WHERE business.state='AZ'
#               )
#               ORDER BY az_count DESC, percent DESC
#         """

#     query="""
#           SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#           (
#           SELECT user.name, review.user_id AS name FROM
#           user INNER JOIN review
#           ON user.user_id=review.user_id
          
#           )
#     """

#     query="""
#                SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#                (
#                SELECT user.name AS name, SUM(business.review_count) AS az_count, SUM(user.review_count) as total_count FROM
#                business INNER JOIN review
#                ON business.business_id=review.business_id
#                HAVING business.state='AZ'
#                INNER JOIN user
#                ON user.user_id=review.user_id
#                GROUP BY user.name, review.user_id
               
#                )"""

#     query="""
#               SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM
#               (
#               SELECT user.name AS name, SUM(business.review_count) FILTER(WHERE business.state='AZ') AS `az_count`, 
#               SUM(user.review_count) as total_count FROM
#               business INNER JOIN review
#               ON business.business_id=review.business_id
#               INNER JOIN user
#               ON user.user_id=review.user_id
#               WHERE business.state='AZ'
#               GROUP BY user.name, review.user_id
#               )
#               ORDER BY az_count DESC, percent DESC
#         """
             
    
    spark.sql(query).show()
    
    return spark.sql(query)

# user.name as name, COUNT(tip.user_id) AS tip_count 
#           FROM user INNER JOIN tip
#           ON user.user_id=tip.user_id
#           GROUP BY name, tip.user_id
#           ORDER BY tip_count DESC

In [36]:
AutograderHelper.assert_function_calls(arizona_summary, ["spark.sql"])

assert type(arizona_summary()) == pyspark.sql.dataframe.DataFrame, \
    "The return value should be a Spark DataFrame."

arizona_summary_first_row = arizona_summary().take(1)[0]

ParseException: "\nmismatched input 'FROM' expecting <EOF>(line 2, pos 88)\n\n== SQL ==\n\n              SELECT name, az_count, total_count, (az_count/total_count)*100 AS percent FROM\n----------------------------------------------------------------------------------------^^^\n              (\n              SELECT user.name AS name, SUM(business.review_count), FILTER(WHERE business.state='AZ') AS `az_count`, \n              SUM(user.review_count) as total_count FROM\n              business INNER JOIN review\n              ON business.business_id=review.business_id\n              INNER JOIN user\n              ON user.user_id=review.user_id\n              WHERE business.state='AZ'\n              GROUP BY user.name, review.user_id\n              )\n              ORDER BY az_count DESC, percent DESC\n        \n"

In [28]:
# spark.sql("""
# SELECT name, total_count FROM
# ( 
# SELECT user.name as name, review.user_id, SUM(user.review_count) as total_count
# FROM user inner join review on user.user_id=review.user_id inner join business on business.business_id=review.business_id 
# group by user.name, review.user_id
# having name='Brad'
# limit 10
# )
# """).show()



+----+-----------+
|name|total_count|
+----+-----------+
|Brad|         12|
|Brad|       2944|
|Brad|        987|
|Brad|         22|
|Brad|         14|
|Brad|         54|
|Brad|          2|
|Brad|          4|
|Brad|          6|
|Brad|        192|
+----+-----------+



                                                                                

In [41]:
# spark.sql("""
# select name, review_count as total_count from user where name="Brad" order by total_count desc limit 1
# """).show()

[Stage 36:>                                                         (0 + 1) / 1]

+----+-----------+
|name|total_count|
+----+-----------+
|Brad|       1642|
+----+-----------+



                                                                                

In [43]:
# spark.sql("""
# select business_id, state from business where state='AZ' limit 5
# """).show()

+--------------------+-----+
|         business_id|state|
+--------------------+-----+
|1SWheh84yJXfytovI...|   AZ|
|xvX2CttrVhyG2z1dF...|   AZ|
|Y6iyemLX_oylRpnr3...|   AZ|
|1Dfx3zM-rW4n-31Ke...|   AZ|
|giC3pVVFxCRR89rAp...|   AZ|
+--------------------+-----+



In [49]:
# spark.sql("""
# select review.user_id, business.business_id, user.review_count from review inner join 
# business on review.business_id=business.business_id inner join user on user.user_id=review.user_id
# order by review_count desc
# limit 5
# """).show()



+--------------------+--------------------+------------+
|             user_id|         business_id|review_count|
+--------------------+--------------------+------------+
|8k3aO-mPeyhbR5HUu...|J-U6C8FgveGsDdHwR...|       13278|
|8k3aO-mPeyhbR5HUu...|ii8sAGBexBOJoYRFa...|       13278|
|8k3aO-mPeyhbR5HUu...|DfgZlNgKwBvCpA_0a...|       13278|
|8k3aO-mPeyhbR5HUu...|6Q7-wkCPc1KF75jZL...|       13278|
|8k3aO-mPeyhbR5HUu...|z7GAft_JrFNmAH5xb...|       13278|
+--------------------+--------------------+------------+





In [76]:
# spark.sql("""
# select user.name, review.user_id, sum(business.review_count) as az_count 
# from user inner join review 
# on user.user_id=review.user_id
# inner join business on business.businesS_id=review.business_id
# group by user.name, review.user_id
# order by az_count desc
# limit 10
# """).show()

ParseException: "\nmismatched input '(' expecting <EOF>(line 2, pos 67)\n\n== SQL ==\n\nselect user.name, review.user_id, sum(business.review_count) filter(where business.state='AZ') as az_count \n-------------------------------------------------------------------^^^\nfrom user inner join review \non user.user_id=review.user_id\ninner join business on business.businesS_id=review.business_id\ngroup by user.name, review.user_id\norder by az_count desc\nlimit 10\n"

In [None]:
# spark.sql("""
# SELECT name, total_count FROM
#           (
#           SELECT user.name as name, COUNT(review.user_id) AS total_count 
#           FROM user INNER JOIN review
#           ON user.user_id=tip.user_id
#           GROUP BY name, tip.user_id
#           ORDER BY tip_count DESC
#           )
#           LIMIT 100
# """)

In [None]:
# spark.sql("""
# SELECT user.name as name, count(review.user_id) as total_count
# from 
# """)

In [None]:
assert arizona_summary_first_row["name"] == "Brad", "The first name should be Brad"
assert arizona_summary_first_row["az_count"] == 1637, "The first az_count should be 1637"
assert arizona_summary_first_row["total_count"] == 1642, "The first total_count should be 1642"

assert round(arizona_summary_first_row["percent"], 2) == 99.70, \
    (
        f"The first percent should be about 99.70 (checking to "
        f"nearest 0.01, found {arizona_summary_first_row['percent']})"
    )

In [None]:
# Autograder cell. This cell is worth 3 points (out of 20). This cell contains hidden tests.