# Review data preparation

In [None]:
%load_ext autoreload
%autoreload 2

# Import ds libraries
import pandas as pd
import numpy as np
import re

from datetime import datetime, timedelta

# Import acquire functions
import nick_acquire as a
import nick_prepare as prep

In [None]:
# Load the two raw review tables from CSV files in the working directory.
# NOTE(review): going by the names, one holds API-collected reviews and the
# other scraped reviews — confirm against whatever produces these files.
api_reviews = pd.read_csv('reviews_progress.csv')
scrape_reviews = pd.read_csv('scraped_reviews.csv')

In [None]:
api_reviews.head(3)

In [None]:
def clean_api_reviews(api_data):
    """Return a copy of ``api_data`` narrowed to the four review columns.

    The result holds ``camis``, ``publish_time``, ``review_text`` and
    ``review_rating`` in that order; ``api_data`` itself is not modified.
    """
    wanted = ('camis', 'publish_time', 'review_text', 'review_rating')
    snapshot = api_data.copy()
    return snapshot[list(wanted)]

clean_api_reviews(api_reviews)

In [None]:
def clean_dates(data):
    """Normalise the ``relative_date`` strings of a review table.

    Each value has its last four characters removed (presumably the
    trailing " ago" of labels such as "3 weeks ago" — confirm against the
    scraper output), and a leading article is turned into the digit 1:

    * ``'a year'``  -> ``'1 years'`` (plural, so it matches the
      ``'N years'`` labels)
    * ``'a month'`` -> ``'1 month'``
    * ``'an hour'`` -> ``'1 hour'``

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column named ``relative_date``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``relative_date`` rewritten; ``data`` is
        left untouched.
    """
    def _normalise(raw):
        trimmed = raw[:-4]
        if trimmed == 'a year':
            return '1 years'
        # Replace only a whole leading word "a"/"an". The word boundary keeps
        # "an hour" from becoming "1n hour" and leaves other words that merely
        # start with "a" alone; an empty string passes through unchanged.
        return re.sub(r'^an?\b', '1', trimmed)

    scrape_reviews = data.copy()
    scrape_reviews['relative_date'] = [
        _normalise(raw) for raw in scrape_reviews['relative_date']
    ]
    return scrape_reviews

In [None]:
reviews = clean_dates(scrape_reviews)

In [None]:
def _estimate_new_dates(relative_dates):
    """Return one date label per review, spreading 'N years' labels over the year.

    ``relative_dates`` is the ``relative_date`` column of a single restaurant.
    Labels containing 'years' become a day count (as a string); every other
    label is passed through unchanged.
    """
    # Reviews sharing a label are spaced evenly across 365 days. Working on the
    # value_counts() Series directly avoids relying on the name pandas gives
    # the counts column, which changed in pandas 2.0.
    increment = (365 / relative_dates.value_counts()).to_dict()

    new_dates = []
    i = 0  # position of the current review within its run of identical labels
    previous_year = '1 years'
    for date in relative_dates:
        if 'years' not in date:
            new_dates.append(date)  # under a year: keep the label as-is
            continue
        if date != previous_year:  # label changed, so restart the spacing
            i = 0
            previous_year = date
        years = int(re.findall(r'\d+', date)[0])
        adjusted_date = 365 * years + increment[date] * i
        i += 1
        new_dates.append(str(round(adjusted_date)))
    return new_dates


def adjust_dates(scrape_reviews):
    """Add a ``new_date`` column that refines coarse 'N years' review labels.

    For every restaurant (rows sharing an ``id``), reviews labelled
    'N years' are replaced by an estimated age in days: ``365 * N`` plus an
    even spacing of ``365 / count`` days for each further review carrying
    the same label. The spacing counter restarts whenever the label differs
    from the previous 'years' label, so rows are expected to be grouped by
    label within a restaurant. Labels without 'years' are copied unchanged.

    Parameters
    ----------
    scrape_reviews : pandas.DataFrame
        Must contain ``id`` and string ``relative_date`` columns.

    Returns
    -------
    pandas.DataFrame
        The per-restaurant frames concatenated in order of first appearance
        of each id, with the extra string column ``new_date``. An input with
        no rows yields an empty copy that still has the ``new_date`` column.
    """
    dataframes = []
    for restaurant_id in scrape_reviews['id'].unique():
        restaurant = scrape_reviews[scrape_reviews['id'] == restaurant_id].copy()
        restaurant['new_date'] = _estimate_new_dates(restaurant['relative_date'])
        dataframes.append(restaurant)

    if not dataframes:
        # pd.concat() raises on an empty list; hand back an empty frame instead.
        empty = scrape_reviews.copy()
        empty['new_date'] = []
        return empty
    return pd.concat(dataframes)


In [None]:
s = pd.read_csv('scraped_reviews.csv')

In [None]:
# Scratch check: count the reviews per relative_date label for one restaurant id.
place = s[s.id == 41375676]
review_counts = pd.DataFrame(place.relative_date.value_counts())
# NOTE(review): the name of the counts column depends on the pandas version
# (from pandas 2.0 it is 'count'); the commented-out lines below presumably
# were experiments around that — confirm before relying on a column name.
#review_counts.rename(columns={review_counts.columns[0]:'new_name'})
# review_counts['increment'] = 365 / review_counts.index
review_counts

In [None]:
reviews = adjust_dates(reviews)

In [None]:
def calculate_days(data):
    """Convert ``new_date`` labels to day counts and derive a calendar date.

    Each ``new_date`` value is mapped to a whole number of days:

    * labels containing 'hour' or 'minute' -> 1
    * 'N day(s)'   -> N
    * 'N week(s)'  -> N * 7
    * 'N month(s)' -> N * 30
    * anything else is parsed as an integer day count

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``new_date`` and a ``retrieval_date``
        column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with two extra columns: ``newer_dates`` (integer
        day counts) and ``final_date`` (``retrieval_date`` minus that many
        days).

    Raises
    ------
    ValueError
        If a label matches none of the units and is not an integer string.
    """
    # Checked in this order, mirroring an if/elif chain.
    days_per_unit = (('day', 1), ('week', 7), ('month', 30))

    reviews = data.copy()
    day_counts = []
    for date in reviews['new_date']:
        unit = re.sub(r'[^a-z]', '', date)
        if 'hour' in unit or 'minute' in unit:
            # Anything younger than a day is counted as one day old.
            day_counts.append(1)
            continue
        for keyword, factor in days_per_unit:
            if keyword in unit:
                day_counts.append(int(re.sub(r'[^0-9]', '', date)) * factor)
                break
        else:
            # No unit word: expected to already be a day count.
            day_counts.append(int(date))

    # Plain ints throughout, so the column has a single type and needs no cast.
    reviews['newer_dates'] = day_counts
    reviews['final_date'] = [
        pd.to_datetime(retrieval_date) - timedelta(days=n)
        for retrieval_date, n in zip(reviews['retrieval_date'], day_counts)
    ]
    return reviews

In [None]:
final_df = calculate_days(reviews)

In [None]:
def clean_reviews(data):
    """Reduce a review table to four columns, renamed to the API schema.

    Keeps ``id``, ``final_date``, ``caption`` and ``rating``, casts
    ``rating`` to ``int``, and renames the columns to ``camis``,
    ``publish_time``, ``review_text`` and ``review_rating`` respectively.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the four source columns; ``rating`` must be castable
        to ``int``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the four renamed columns; ``data`` is not modified.
    """
    renames = {
        'id': 'camis',
        'final_date': 'publish_time',
        'caption': 'review_text',
        'rating': 'review_rating',
    }
    # Select first, then copy: only the needed columns are duplicated, and the
    # assignment below writes to an independent frame rather than a slice.
    final_df = data[list(renames)].copy()
    final_df['rating'] = final_df['rating'].astype(int)
    # Rename by name rather than by position so each mapping is explicit.
    return final_df.rename(columns=renames)

In [None]:
final_df = clean_reviews(final_df)

In [None]:
final_df.head()

In [None]:
# Re-read the same two CSV files, rebinding api_reviews and scrape_reviews
# to freshly loaded frames.
api_reviews = pd.read_csv('reviews_progress.csv')
scrape_reviews = pd.read_csv('scraped_reviews.csv')

In [None]:
prep.cleanse_reviews(scrape_reviews, api_reviews)

In [None]:
r = pd.read_csv('reviews.csv')

In [None]:
pd.to_datetime(r.publish_time)

In [None]:
print(pd. __version__)