# Review data preparation

In [85]:
%load_ext autoreload
%autoreload 2

# Import ds libraries
import pandas as pd
import numpy as np
import re

from datetime import datetime, timedelta

# Import project-local acquire and prepare modules
import nick_acquire as a
import nick_prepare as prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [188]:
# Load the two review CSVs into `api_reviews` and `scrape_reviews`.
# NOTE(review): 'scrapted_reviews.csv' differs from the 'scraped_reviews.csv' read in a later cell — confirm which filename is correct.
api_reviews = pd.read_csv('reviews_progress.csv')
scrape_reviews = pd.read_csv('scrapted_reviews.csv')

In [200]:
# Preview the first three rows of the API reviews.
api_reviews.head(3)

Unnamed: 0,camis,place_id,review_id,review_relative_time,review_rating,review_text,review_language,author_display_name,contributor_id,author_photo_uri,publish_time
0,41168748,ChIJNzTkzav1wokRg3R8aGzuiWI,ChdDSUhNMG9nS0VJQ0FnSURHd09qaWxBRRAB,a year ago,5,These Guys are really professional. Well behav...,en,Md Omar Faruk,113263009121414123208,https://lh3.googleusercontent.com/a-/ALV-UjWGD...,2021-11-14T20:50:12Z
1,41168748,ChIJNzTkzav1wokRg3R8aGzuiWI,ChdDSUhNMG9nS0VJQ0FnSUMyMzlTUXpnRRAB,a year ago,1,I haven't had Dunkin Donuts in years. Being th...,en,Hipolito Rivera,102916427998123407326,https://lh3.googleusercontent.com/a-/ALV-UjUN8...,2022-05-02T09:47:34Z
2,41168748,ChIJNzTkzav1wokRg3R8aGzuiWI,ChdDSUhNMG9nS0VJQ0FnSURrN2VDU3RRRRAB,2 years ago,5,"This Dunkin is nice, but it's located inside a...",en,DeeAnn Phillips,104489576491114386805,https://lh3.googleusercontent.com/a/ACg8ocL6Fa...,2021-05-22T23:23:44Z


In [204]:
def clean_api_reviews(api_data):
    """Return a copy of the API reviews restricted to the shared review columns.

    Parameters
    ----------
    api_data : pandas.DataFrame
        Must contain 'camis', 'publish_time', 'review_text' and 'review_rating'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only those four columns, in that order.
        The input frame is left untouched.
    """
    keep = ['camis', 'publish_time', 'review_text', 'review_rating']
    # .copy() keeps the result independent of the caller's frame.
    return api_data.loc[:, keep].copy()

# Apply the column selection to the API reviews; as the cell's last expression the result is displayed.
clean_api_reviews(api_reviews)

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41168748,2021-11-14T20:50:12Z,These Guys are really professional. Well behav...,5
1,41168748,2022-05-02T09:47:34Z,I haven't had Dunkin Donuts in years. Being th...,1
2,41168748,2021-05-22T23:23:44Z,"This Dunkin is nice, but it's located inside a...",5
3,41168748,2021-08-17T02:55:05Z,Ordered a box of joes they tried to cheat with...,2
4,41168748,2023-03-06T16:17:03Z,Another review my daughter and friends love ...,4
...,...,...,...,...
43867,50107528,2023-05-24T21:33:14Z,"Easy to locate, great customer service. Went i...",2
43868,50107528,2023-07-16T04:20:30Z,A solid doner kebab. Good cuts of lamb and ple...,4
43869,50107528,2023-05-09T18:27:59Z,We spent 5 days in NYC eating a lot of great (...,5
43870,50107528,2023-03-05T21:48:09Z,Extremely authentic. Staff/owner is from Turke...,5


In [187]:
def clean_dates(data):
    """Normalize the `relative_date` strings ("a year ago", "3 weeks ago", ...).

    Steps applied to a copy of `data`:
      1. drop a trailing " ago" (only when it is actually present);
      2. replace a leading article "a"/"an" with "1" ("an hour" -> "1 hour");
      3. spell a single year as "1 years" so it matches the "N years" form
         that the downstream year handling looks for.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column `relative_date`.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with `relative_date` normalized; the input is not modified.
    """
    cleaned = data.copy()
    relative = cleaned['relative_date']

    # Strip the suffix by pattern rather than by slicing the last four
    # characters, so values without " ago" are left intact.
    relative = relative.str.replace(r'\s*ago$', '', regex=True)
    # Match the whole article: a bare r'^a' would turn "an hour" into "1n hour".
    relative = relative.str.replace(r'^an?\s+', '1 ', regex=True)
    # Use the plural form for one year so 'years' substring checks match it.
    relative = relative.mask(relative == '1 year', '1 years')

    cleaned['relative_date'] = relative
    return cleaned

In [189]:
# Normalize the relative-date strings of the scraped reviews.
reviews = clean_dates(scrape_reviews)

In [131]:
def adjust_dates(scrape_reviews):
    """Spread each restaurant's "N years" reviews across that year, in days.

    A relative date such as "2 years" only says the review is between two and
    three years old. For every restaurant (`id`), the reviews sharing one
    "N years" label are spaced evenly over 365 days, in the order they appear:
    the k-th review of a run gets ``365 * N + (365 / count) * k`` days, where
    `count` is the number of that restaurant's reviews carrying the label.
    Labels that do not contain "years" are copied through unchanged.

    Parameters
    ----------
    scrape_reviews : pandas.DataFrame
        Must contain `id` and a string column `relative_date`.

    Returns
    -------
    pandas.DataFrame
        The rows grouped by restaurant (in order of first appearance, original
        index preserved) with an added string column `new_date`. Year labels
        become a rounded day count such as "487"; other labels are unchanged.
        The input frame is not modified.
    """
    dataframes = []  # One adjusted frame per restaurant

    # sort=False keeps restaurants in order of first appearance.
    for _, group in scrape_reviews.groupby('id', sort=False):
        restaurant = group.copy()

        # Days between consecutive reviews sharing the same relative_date label.
        # Dividing the value_counts Series directly avoids depending on the
        # name pandas gives the counts column, which changed in pandas 2.0.
        increments = 365 / restaurant['relative_date'].value_counts()

        new_dates = []
        position = 0               # Position of the review within its year run
        previous_year = '1 years'  # Label of the year run currently being filled

        for date in restaurant['relative_date']:
            if 'years' in date:
                if date != previous_year:
                    # A new year label starts a new run, so restart the offset.
                    position = 0
                    previous_year = date
                years = int(re.findall(r'\d+', date)[0])
                adjusted_date = 365 * years + float(increments.loc[date]) * position
                position += 1
                new_dates.append(str(int(round(adjusted_date))))
            else:
                new_dates.append(date)  # Dates under one year are resolved later

        restaurant['new_date'] = new_dates
        dataframes.append(restaurant)

    if not dataframes:
        # pd.concat raises on an empty list; return an empty frame with the
        # expected extra column instead.
        return scrape_reviews.iloc[0:0].assign(new_date=pd.Series(dtype=object))

    return pd.concat(dataframes)


In [190]:
# Add the `new_date` column produced by adjust_dates to the cleaned reviews.
reviews = adjust_dates(reviews)

In [138]:
def _relative_to_days(label):
    """Convert one `new_date` label into a whole number of days."""
    unit = re.sub(r'[^a-z]', '', label)
    digits = re.sub(r'[^0-9]', '', label)
    try:
        if 'hour' in unit:
            return 1  # Any "N hours" label is counted as one day
        # Checked in this order: day, then week, then month (30 days).
        for name, days_per_unit in (('day', 1), ('week', 7), ('month', 30)):
            if name in unit:
                return int(digits) * days_per_unit
        # No recognized unit: the label should already be a day count.
        return int(label)
    except ValueError:
        raise ValueError(
            f"Cannot convert relative date {label!r} to a number of days"
        ) from None


def calculate_days(data):
    """Turn the `new_date` labels into day offsets and absolute review dates.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `new_date` (labels such as "3 days", "2 weeks",
        "1 month", or a plain day count such as "487") and `retrieval_date`
        (anything `pd.to_datetime` accepts).

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with two added columns: `newer_dates`, the age of the
        review in days as integers, and `final_date`, which is
        `retrieval_date` minus that many days.

    Raises
    ------
    ValueError
        If a label has no recognized unit and is not a plain integer.
    """
    reviews = data.copy()

    # Store integers throughout instead of a mix of digit strings and ints.
    days = [_relative_to_days(label) for label in reviews['new_date']]
    reviews['newer_dates'] = days

    # Each timestamp is parsed on its own so differing string formats
    # (e.g. with and without fractional seconds) are all accepted.
    reviews['final_date'] = [
        pd.to_datetime(retrieved) - timedelta(days=age)
        for retrieved, age in zip(reviews['retrieval_date'], days)
    ]
    return reviews

In [191]:
# Derive the `newer_dates` and `final_date` columns for every review.
final_df = calculate_days(reviews)

In [205]:
def clean_reviews(data):
    """Reduce the scraped reviews to the four shared columns and rename them.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'id', 'final_date', 'caption' and 'rating'.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'camis', 'publish_time', 'review_text' and
        'review_rating' (in that order); the rating is cast to int.
        The input frame is not modified.
    """
    # Source column -> output column; the dict order fixes the column order.
    renamed = {
        'id': 'camis',
        'final_date': 'publish_time',
        'caption': 'review_text',
        'rating': 'review_rating',
    }
    subset = data[list(renamed)].copy()
    subset['rating'] = subset['rating'].astype(int)
    return subset.rename(columns=renamed)

In [206]:
# Reduce the scraped reviews to the four shared columns and rename them.
# NOTE(review): this rebinds `final_df` to the renamed frame, so re-running this cell alone
# would raise a KeyError (the 'id' column no longer exists) — re-run calculate_days first.
final_df = clean_reviews(final_df)

In [207]:
# Preview the first rows of the cleaned scraped reviews.
final_df.head()

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41375676,2023-11-01 02:44:04.226505,,5
1,41375676,2023-11-01 02:44:04.227451,,5
2,41375676,2023-11-01 02:44:04.227954,,5
3,41375676,2023-10-29 02:44:04.228882,Fabulous food,5
4,41375676,2023-10-29 02:44:04.229576,"Overrated. Terrible service, the fault of the...",3


In [230]:
# Reload both raw CSVs before running the functions from the `prep` module.
api_reviews = pd.read_csv('reviews_progress.csv')
scrape_reviews = pd.read_csv('scraped_reviews.csv')

In [231]:
# Run the prepare-module cleaning on the scraped reviews and preview the result.
# NOTE(review): prep.cleanse_reviews lives in nick_prepare and is not shown here —
# presumably it chains the date/column steps prototyped in the cells above; confirm.
r1 = prep.cleanse_reviews(scrape_reviews)
r1.head(3)

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41375676,2023-11-01 02:44:04.226505,,5
1,41375676,2023-11-01 02:44:04.227451,,5
2,41375676,2023-11-01 02:44:04.227954,,5


In [232]:
# Check the column dtypes of the prepared scraped reviews.
r1.dtypes

camis                     int64
publish_time     datetime64[ns]
review_text              object
review_rating             int64
dtype: object

In [233]:
# Run the prepare-module cleaning on the API reviews and preview the result.
# NOTE(review): prep.clean_api_reviews is the nick_prepare version, not the function defined
# earlier in this notebook — the displayed output shows publish_time parsed as UTC datetimes.
r2 = prep.clean_api_reviews(api_reviews)
r2.head(3)

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41168748,2021-11-14 20:50:12+00:00,These Guys are really professional. Well behav...,5
1,41168748,2022-05-02 09:47:34+00:00,I haven't had Dunkin Donuts in years. Being th...,1
2,41168748,2021-05-22 23:23:44+00:00,"This Dunkin is nice, but it's located inside a...",5


In [234]:
# Check the column dtypes of the prepared API reviews.
r2.dtypes

camis                          int64
publish_time     datetime64[ns, UTC]
review_text                   object
review_rating                  int64
dtype: object

In [235]:
# Total number of rows across both prepared review sets.
len(r2) + len(r1)

61337

In [237]:
# Stack the scraped (r1) and API (r2) reviews into one frame; each keeps its own index values.
# NOTE(review): per the dtypes shown above, r1.publish_time is tz-naive while r2.publish_time
# is UTC-aware, so the combined column mixes both kinds — confirm whether the scraped
# timestamps should be localized to a timezone before concatenating.
joined_reviews = pd.concat([r1, r2])

In [238]:
# Write the combined reviews to reviews.csv without the index column.
joined_reviews.to_csv('reviews.csv', index=False)