## Part I - 1.2 Data cleaning

In [4]:
"""Global functions and variables"""

import pymysql

def open_conn():
    """open the connection before each test case"""
    conn = pymysql.connect(user='public', password='ece656yelp',
                                   host='maindb.czbva1am4d4u.us-east-2.rds.amazonaws.com',
                                   database='yelp_db')
    return conn

def close_conn(conn):
    """close the connection after each test case"""
    conn.close()

def executeQuery(conn, query, commit=False):
    """ fetch result after query"""
    cursor = conn.cursor()
    query_num = query.count(";")
    if query_num > 1:
        for result in cursor.execute(query, params=None, multi=True):
            if result.with_rows:
                result = result.fetchall()
    else:
        cursor.execute(query)
        result = cursor.fetchall()
    # we commit the results only if we want the updates to Can't leave a review dated before account creationthe database
    # to persist.
    if commit:
        conn.commit()
    else:
        conn.rollback()
    # close the cursor used to execute the query
    cursor.close()
    return result

yelp_conn = open_conn()

#### 1. Check that no review is from the future or before Yelp's founding

In [13]:
query_1 = "SELECT id, date FROM review WHERE unix_timestamp(date) <= unix_timestamp('2004-10-01')\
           OR unix_timestamp(date) >= unix_timestamp('2018-01-01')";

result_1 = executeQuery(yelp_conn, query_1)
result_1

(('03B9-gqbeGoMmPJbNzNT5w', datetime.datetime(2004, 9, 15, 0, 0)), ('PbIY2aIyszb6he6J-ey67w', datetime.datetime(2004, 7, 22, 0, 0)))

This shows 2 accounts that were created before Yelp's founding in October 2004.

#### 2. Can't leave a review dated before account creation

In [2]:
query_2 =  "SELECT user.id, user.yelping_since AS Date_started_yelping, review.date AS Date_of_Review\
            FROM (user INNER JOIN review ON user.id = review.user_id)\
            WHERE user.yelping_since > review.date\
            GROUP BY user.id;"

result_2 = executeQuery(yelp_conn, query_2)

In [3]:
len(result_2)

191

This means these users somehow posted a review before their account was created, suggesting a glitch with their database.

In [5]:
output = [print(result) for result in result_2[:20]]

('-58CWJ48is4duXgpvsWEGA', datetime.datetime(2013, 9, 18, 0, 0), datetime.datetime(2008, 10, 23, 0, 0))
('-9NfX8JO_5UVN_h1K8yOcg', datetime.datetime(2015, 2, 12, 0, 0), datetime.datetime(2010, 2, 26, 0, 0))
('-kEsfYKPs1_rgEWEIui2Mw', datetime.datetime(2015, 2, 14, 0, 0), datetime.datetime(2014, 4, 7, 0, 0))
('-KP8Me2KRqO7IwKIaFL-Vg', datetime.datetime(2013, 10, 19, 0, 0), datetime.datetime(2013, 9, 2, 0, 0))
('09T8OU8BDhQkiU8m4vZy_A', datetime.datetime(2013, 10, 21, 0, 0), datetime.datetime(2013, 10, 16, 0, 0))
('0xjJDvZ6gZVoWRFEZJ48wA', datetime.datetime(2007, 1, 17, 0, 0), datetime.datetime(2007, 1, 15, 0, 0))
('1F9di6oPHhQm1qjZIcbsYA', datetime.datetime(2013, 12, 5, 0, 0), datetime.datetime(2013, 10, 5, 0, 0))
('2Ea6wAkeOPyZ7BD-0rPejQ', datetime.datetime(2013, 9, 18, 0, 0), datetime.datetime(2012, 10, 9, 0, 0))
('2oxUNDpouxH8Y02yG6pG-w', datetime.datetime(2010, 4, 28, 0, 0), datetime.datetime(2006, 1, 25, 0, 0))
('37jJedy6_ptCmNvBJ-H54g', datetime.datetime(2014, 8, 16, 0, 0), dateti

#### 3. Can't be elite in a year before their account was made

In [26]:
query_3 =  "SELECT user.id, user.yelping_since AS Date_of_yelping, elite_years.year AS Year_of_Elite\
            FROM (user INNER JOIN elite_years ON user.id = elite_years.user_id)\
            WHERE YEAR(user.yelping_since) < elite_years.year\
            GROUP BY user.id;"

result_3 = executeQuery(yelp_conn, query_3)

In [27]:
len(result_3)

55493

#### 4. Can't checkin outside open hours

In [33]:
query_4 = "SELECT COUNT(*) FROM checkin JOIN (SELECT hours.business_id, SUBSTRING_INDEX(hours, '|', 1)\
           AS day_of_week, SUBSTRING_INDEX(SUBSTRING_INDEX(hours, '|', - 1), '-', 1) AS opening_time,\
           SUBSTRING_INDEX(SUBSTRING_INDEX(hours, '|', - 1), '-', - 1) AS closing_time FROM hours)\
           AS a ON a.business_id = checkin.business_id\
           AND a.day_of_week = SUBSTRING_INDEX(checkin.date, '-', 1)\
           WHERE a.opening_time > SUBSTRING_INDEX(checkin.date, '-', - 1)\
           AND a.closing_time < SUBSTRING_INDEX(checkin.date, '-', - 1);"
result_4 = executeQuery(yelp_conn, query_4)
result_4[0]

((480488,),)

This shows that there are many check ins that occur outside of the businesses open hours which are potentially invalid checkins, but this is not a guarantee because it is possible that the business changed their hours of operation at some point after someone checked in, resulting in the discrepency. 

This also shows the need within the database for the date column in the checkin and the hours column in the hours table to be normalized by splitting into date, opening time and closing time columns as this would save computation time having to perform substring_index computations on every row.

#### 5. `User.review_count` cannot be less than the sum of the number of reviews by a user

In [17]:
query_5 = "select count(*) from user join (select count(user_id) as countedReviews, user_id from\
           review group by user_id) as a on a.user_id = user.id where a.countedReviews > review_count;"
result_5 = executeQuery(yelp_conn, query_5)
print(result_5[0])

(1319,)


This shows that the way Yelp gets the review_count number is potentially flawed since it should never count there being less reviews than the number of reviews provided for each user, unless the dataset that it was acquiring the count from was out of date.

#### 6. Cannot be Elite in an invalid year
Invalid years include ones before 2004, years in the future or years they didn't post a review, tip or photo.

In [12]:
query_6 = "select count(*) from elite_years join \
(SELECT user_id, SUBSTRING_INDEX(date, '-', 1) AS year FROM review) \
as a on a.user_id=elite_years.user_id and a.year = elite_years.year \
group by elite_years.user_id, elite_years.year;"
result_6 = executeQuery(yelp_conn, query_6)
print(result_6[0])

(37,)


In total there are 186900 entries in elite_years. 37 appear to be erroneous.