## Part I - 1.2 Data cleaning

In [1]:
"""Display settings"""
from IPython.display import HTML, display
import tabulate

# optional `heading` arg. If provided it will be added to the first row as the table heading.
def displayResult(queryResult, heading=()):
    if heading != ():
        resultList = (heading,) + queryResult
        display(HTML(tabulate.tabulate([result for result in resultList], tablefmt='html')))
    else:
        display(HTML(tabulate.tabulate([result for result in queryResult], tablefmt='html')))

"""MySQL connection related functions and variables"""

import pymysql

def open_conn():
    """open the connection before each test case"""
    conn = pymysql.connect(user='public', password='ece656yelp',
                                   host='maindb.czbva1am4d4u.us-east-2.rds.amazonaws.com',
                                   database='yelp_db')
    return conn

def close_conn(conn):
    """close the connection after each test case"""
    conn.close()

def executeQuery(conn, query, commit=False):
    """ fetch result after query"""
    cursor = conn.cursor()
    query_num = query.count(";")
    if query_num > 1:
        for result in cursor.execute(query, params=None, multi=True):
            if result.with_rows:
                result = result.fetchall()
    else:
        cursor.execute(query)
        result = cursor.fetchall()
    # we commit the results only if we want the updates to the database
    # to persist.
    if commit:
        conn.commit()
    else:
        conn.rollback()
    # close the cursor used to execute the query
    cursor.close()
    return result

yelp_conn = open_conn()

#### 1. Check that no review is from the future or before Yelp's founding

In [2]:
query_1 = "SELECT id, date FROM review WHERE unix_timestamp(date) <= unix_timestamp('2004-10-01')\
           OR unix_timestamp(date) >= unix_timestamp('2018-01-01');"

result_1 = executeQuery(yelp_conn, query_1)

result_1

()

This shows 2 accounts that were created before Yelp's founding in October 2004.

This query deletes them:

In [3]:
# clean_1 = "DELETE FROM review WHERE unix_timestamp(date) <= unix_timestamp('2004-10-01')\
#            OR unix_timestamp(date) >= unix_timestamp('2018-01-01');"

# executeQuery(yelp_conn, clean_1)

#### 2. Can't leave a review dated before account creation

In [4]:
query_2 =  "SELECT user.id, user.yelping_since AS Date_started_yelping, review.date AS Date_of_Review\
            FROM (user INNER JOIN review ON user.id = review.user_id)\
            WHERE user.yelping_since > review.date\
            GROUP BY user.id;"

result_2 = executeQuery(yelp_conn, query_2)

In [5]:
len(result_2)

0

This means these users somehow posted a review before their account was created, suggesting a glitch with their database.

In [6]:
heading_2 = ("user.id", "user.yelping_since", "review.date")
displayResult(result_2[:20], heading_2)

0,1,2
user.id,user.yelping_since,review.date


This query deletes illegal reviews:

In [7]:
# clean_2 =  "DELETE FROM review WHERE id in\ 
#             (SELECT review.id FROM (user INNER JOIN review ON user.id = review.user_id)\
#             WHERE user.yelping_since > review.date\
#             GROUP BY user.id);"

# executeQuery(yelp_conn, clean_2)

#### 3. Can't be elite in a year before their account was made

In [8]:
query_3 =  "SELECT user.id, user.yelping_since AS Date_of_yelping, elite_years.year AS Year_of_Elite\
            FROM (user INNER JOIN elite_years ON user.id = elite_years.user_id)\
            WHERE YEAR(user.yelping_since) < elite_years.year\
            GROUP BY user.id;"

result_3 = executeQuery(yelp_conn, query_3)

In [9]:
len(result_3)

55493

In [10]:
heading_3 = ("user.id", "user.yelping_since", "elite_year.year")
displayResult(result_3[:20], heading_3)

0,1,2
user.id,user.yelping_since,elite_year.year
---1lKK3aKOuomHnwAkAow,2007-06-04 00:00:00,2013
--2vR0DIsmQ6WfcSzKWigw,2012-11-27 00:00:00,2015
--3l8wysfp49Z2TLnyT0vg,2013-12-14 00:00:00,2016
--3WaS23LcIXtxyFULJHTA,2010-05-02 00:00:00,2013
--41c9Tl0C9OGewIR7Qyzg,2011-07-03 00:00:00,2012
--4q8EyqThydQm-eKZpS-A,2008-01-07 00:00:00,2010
--56mD0sm1eOogphi2FFLw,2010-12-16 00:00:00,2016
--A4pFATzQJx9n4l1IAC3A,2015-09-08 00:00:00,2017
--cPqjzKHqHKmGala65zwg,2012-03-20 00:00:00,2014


This query deletes their elite records:

In [11]:
# clean_3 =  "DELETE FROM elite_years\
#               where user_id in (SELECT user.id\
#               FROM (user INNER JOIN (SELECT * FROM elite_years) AS E ON user.id = E.user_id)\
#               WHERE YEAR(user.yelping_since) < E.year\
#               GROUP BY user.id);"

# executeQuery(yelp_conn, clean_3)

#### 4. Can't checkin outside open hours

In [12]:
query_4 = "SELECT COUNT(*) FROM checkin JOIN (SELECT hours.business_id, SUBSTRING_INDEX(hours, '|', 1)\
           AS day_of_week, SUBSTRING_INDEX(SUBSTRING_INDEX(hours, '|', - 1), '-', 1) AS opening_time,\
           SUBSTRING_INDEX(SUBSTRING_INDEX(hours, '|', - 1), '-', - 1) AS closing_time FROM hours)\
           AS a ON a.business_id = checkin.business_id\
           AND a.day_of_week = SUBSTRING_INDEX(checkin.date, '-', 1)\
           WHERE a.opening_time > SUBSTRING_INDEX(checkin.date, '-', - 1)\
           AND a.closing_time < SUBSTRING_INDEX(checkin.date, '-', - 1);"
result_4 = executeQuery(yelp_conn, query_4)
result_4[0]

(480488,)

This shows that there are many check ins that occur outside of the businesses open hours which are potentially invalid checkins, but this is not a guarantee because it is possible that the business changed their hours of operation at some point after someone checked in, resulting in the discrepency. 

This also shows the need within the database for the date column in the checkin and the hours column in the hours table to be normalized by splitting into date, opening time and closing time columns as this would save computation time having to perform substring_index computations on every row. No change is made to the database to correct this because it may sometimes happen since the business may change its open hours without updating that on yelp.

#### 5. `User.review_count` cannot be less than the sum of the number of reviews by a user

In [13]:
query_5 = "select count(*) from user join (select count(user_id) as countedReviews, user_id from\
           review group by user_id) as a on a.user_id = user.id where a.countedReviews > review_count;"
result_5 = executeQuery(yelp_conn, query_5)
print(result_5[0])

(0,)


This shows that the way Yelp gets the review_count number is potentially flawed since it should never count there being less reviews than the number of reviews provided for each user, unless the dataset that it was acquiring the count from was out of date.

This query updates the incorrect review_count:

In [14]:
# clean_5_1 = "CREATE VIEW review_counts_for_users AS\
#              SELECT user_id, count(user_id) AS count\
#              FROM review GROUP BY user_id;"

# clean_5_2 = "UPDATE user SET review_count = (SELECT count\
#              FROM review_counts_for_users WHERE id=user_id);"

# executeQuery(yelp_conn, clean_5_1)
# executeQuery(yelp_conn, clean_5_2)

#### 6. Cannot be Elite in an invalid year
Invalid years include ones before 2004, years in the future or years they didn't post a review, tip or photo.

In [15]:
query_6 = "select count(*) from elite_years join \
           (SELECT user_id, SUBSTRING_INDEX(date, '-', 1) AS year FROM review) \
           as a on a.user_id=elite_years.user_id and a.year = elite_years.year \
           group by elite_years.user_id, elite_years.year;"
result_6 = executeQuery(yelp_conn, query_6)
print(result_6[0])

(37,)


In total there are 186900 entries in elite_years. 37 appear to be erroneous. This query deletes these incorrect elite records:

In [16]:
# clean_6 = "DELETE FROM elite_years where user_id in 
#              (select E.user_id from (SELECT * FROM elite_years) AS E join\
#              (SELECT user_id, SUBSTRING_INDEX(date, '-', 1) AS year FROM review)\
#              as a on a.user_id=E.user_id and a.year = E.year\
#              group by E.user_id, E.year);"
# executeQuery(yelp_conn, clean_6)