# In [None]:
from requests import Session
from lxml import html
from datetime import datetime
import time
import random
import sys
import logging
import psycopg2
from psycopg2 import sql
import pytz
import ssl

# In [None]:

# RDS connection settings. All four values are empty strings in this file and
# are passed unchanged to psycopg2.connect() below.
# NOTE(review): credentials are hard-coded at module level; consider reading
# them from the environment or a secrets store -- confirm with the owner.
rds_host  = ""
name = "" 
db_name = "" 
password = ""


# Root logger (no name given), emitting INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Open the module-level connection `conn` (port 5432, 5 second connect
# timeout). dbConnect() below reads this global rather than taking a
# connection argument.
try:
    conn = psycopg2.connect(host=rds_host, user=name, password=password, database=db_name, port=5432, connect_timeout=5)
except psycopg2.OperationalError as e:
    logger.error("ERROR: Unexpected error: Could not connect to postGreSQL instance.")
    logger.error(e)
    # NOTE(review): sys.exit() without an argument exits with status 0, so a
    # failed connection is reported to the caller as success -- confirm
    # whether a non-zero status is wanted here.
    sys.exit()

logger.info("SUCCESS: Connection to RDS postGreSQL instance succeeded")


# 2020-01-17 Added review_id, user_id
def yelpScraper(business_id, max_pages=1):
    """Scrape the newest reviews of a Yelp business from its review feed.

    Requests ``https://www.yelp.com/biz/<business_id>/review_feed`` sorted by
    date descending, advancing the ``start`` offset by 20 per page, and
    parses the HTML fragment stored under the ``review_list`` key of each
    JSON response. A short random pause follows every request.

    Args:
        business_id: Yelp business identifier used in the URL path.
        max_pages: Number of feed pages to request. Defaults to 1, which is
            what the function fetched when the page count was hard-coded.

    Returns:
        A list of ``[date, stars, text, review_id, user_id]`` lists, one per
        review. ``date`` is a ``datetime`` when every date on the page parsed
        as ``%m/%d/%Y``; otherwise all dates of that page are left as the raw
        scraped strings so that fixMissingScrapedDates() and dbConnect() can
        repair and parse them.

    Pages answered with a status other than 200 are skipped and logged.
    """
    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="  # add number

    results = []
    # One session for all pages so the underlying connection is reused.
    with Session() as session:
        for page in range(max_pages):
            url = base_url + business_id + api_url + str(page * 20)
            with session.get(url, timeout=5) as r:
                if r.status_code == 200:
                    payload = r.json()
                    results.extend(_parseReviewFeed(payload['review_list']))
                else:
                    logger.warning(
                        "Yelp review feed returned HTTP %s for %s",
                        r.status_code, url)
            time.sleep(random.uniform(0.1, 0.5))
    return results


def _parseReviewFeed(review_list_html):
    """Extract ``[date, stars, text, review_id, user_id]`` rows from one page."""
    content = "//div[@class='review-content']"
    review = "//div[@class='review review--with-sidebar']"

    tree = html.fromstring(review_list_html)

    raw_dates = tree.xpath(
        content + "/descendant::span[@class='rating-qualifier']/text()")
    try:
        dates = [datetime.strptime(d.strip(), "%m/%d/%Y") for d in raw_dates]
    except ValueError:
        # At least one entry is not a plain date; keep the whole page as raw
        # strings and let the downstream helpers deal with them.
        dates = raw_dates

    star_titles = tree.xpath(
        content + "/descendant::div[@class='biz-rating__stars']/div/@title")
    stars = [float(title.split(' ')[0]) for title in star_titles]

    # NOTE(review): `.text` is only the text before the first child element
    # of <p>; if review paragraphs contain markup such as <br>, the remainder
    # is dropped -- confirm whether text_content() is wanted instead.
    texts = [p.text for p in tree.xpath(content + "/p")]

    review_ids = tree.xpath(review + "/@data-review-id")
    user_ids = [obj.split(':')[1]
                for obj in tree.xpath(review + "/@data-signup-object")]

    # Rows are lists (not tuples) because fixMissingScrapedDates() assigns
    # into them. zip() stops at the shortest column.
    return [list(row)
            for row in zip(dates, stars, texts, review_ids, user_ids)]


def fixMissingScrapedDates(results):# replace missing dates with surrounding dates
    """Fill in missing review dates from the nearest row that has one.

    A date (element 0 of a row) counts as missing when it is a string that
    contains no ``/``. Values of any other type, such as ``datetime``
    objects, are never modified. Each missing date is replaced by the date of
    the closest following row that has one; trailing rows with nothing valid
    after them take the closest preceding date instead. If no row has a
    usable date, nothing is changed.

    Args:
        results: List of mutable rows whose first element is the date.

    Returns:
        The same ``results`` list, modified in place.
    """
    unset = object()  # sentinel: no usable date seen yet

    def _fillFrom(rows):
        # Walk `rows`, carrying the most recent usable date into missing ones.
        donor = unset
        for row in rows:
            value = row[0]
            if isinstance(value, str) and '/' not in value:
                if donor is not unset:
                    row[0] = donor
            else:
                donor = value

    # Backward walk: every gap receives the next usable date, including runs
    # of several consecutive gaps.
    _fillFrom(reversed(results))
    # Forward walk: only trailing gaps are still open; give them the last
    # usable date before them.
    _fillFrom(results)
    return results
    
def _reviewDateForDb(raw_date):
    """Return a scraped review date formatted as ``YYYY-MM-DD``."""
    if not isinstance(raw_date, datetime):
        # Dates that fell through the scraper are still raw strings (or lxml
        # string results) in month/day/year form.
        raw_date = datetime.strptime(str(raw_date).strip(), "%m/%d/%Y")
    # Both paths now bind the same type; previously the fallback path bound a
    # datetime while the normal path bound a string.
    return raw_date.strftime("%Y-%m-%d")


def dbConnect(results, business_id):
    """Insert scraped reviews into the ``lab.yelp_scraping`` table.

    Uses the module-level connection ``conn``. Each row is committed on its
    own, so rows inserted before a failure stay in the table.

    Args:
        results: Rows of ``[date, stars, text, review_id, user_id]`` as
            produced by yelpScraper(); ``date`` is a ``datetime`` or a
            ``%m/%d/%Y`` string.
        business_id: Business identifier stored with every row.

    Returns:
        The string ``"Added <n> items from RDS postGreSQL table"`` where
        ``<n>`` is the number of rows inserted by this call.

    Raises:
        psycopg2.Error: If an insert or commit fails. The open transaction is
            rolled back first so the shared connection remains usable.
        ValueError: If a string date does not match ``%m/%d/%Y``.
    """
    # Composed once; the table name is quoted as a schema-qualified identifier.
    insert_stmt = sql.SQL(
        "insert into {} (review_id, business_id, user_id, stars, datetime, date, time, text, timestamp) values ( %s, %s, %s, %s, %s, %s, %s, %s, %s);"
    ).format(sql.Identifier('lab', 'yelp_scraping'))

    item_count = 0
    try:
        with conn.cursor() as cur:
            for row in results:
                rawDate, rawStars, reviewText, reviewId, userId = row[:5]
                dateReview = _reviewDateForDb(rawDate)
                stars = float(rawStars)
                now = datetime.now()
                cur.execute(
                    insert_stmt,
                    [
                        reviewId,  # review id
                        business_id,
                        userId,  # user id
                        stars,  # stars
                        now,  # datetime
                        dateReview,  # date of review
                        now,  # time without timezone
                        reviewText,  # review text
                        datetime.now(pytz.utc),  # time with timezone
                    ])
                conn.commit()
                item_count += 1
    except psycopg2.Error:
        # A failed statement leaves the connection in an aborted transaction
        # until it is rolled back.
        conn.rollback()
        raise

    logger.info("Inserted %d reviews for business %s", item_count, business_id)
    return "Added %d items from RDS postGreSQL table" %(item_count)

# Driver: scrape one business, repair missing dates, and insert the reviews
# into the database.

# Example business whose reviews have normal dates (currently disabled).
# business_id = "rR5Y9mp2Yob3rgetJscPWQ"

# Example businesses whose reviews have missing dates.
# Only the last assignment takes effect; the first one is overwritten
# immediately and is never scraped.
business_id = "lLO8Nj-kPJ_b6vEs022GxQ"
business_id = "fLsXOkjewq5BqQbuUPYC9g"
results = yelpScraper(business_id)
results = fixMissingScrapedDates(results)
# The status string returned by dbConnect() is discarded here.
dbConnect(results, business_id)