# Yelp Restaurant Scraper - Web Scraping Project

### Tom Nunziata 2019.06.03

In [21]:
#Define a function that receives raw page content, parses it, and extracts the data we're interested in.

def YelpParser(content):
    """Parse one Yelp search-results page into column-oriented restaurant data.

    Sponsored listings (result anchors whose href contains ``'adredir'``) are
    skipped. Every remaining listing contributes exactly one entry to every
    returned column, so the columns always have equal length.

    Args:
        content: Raw HTML of a search-results page (str or bytes).

    Returns:
        dict with the keys ``'Names'``, ``'Rating_(Max=5)'``, ``'Reviews'``,
        ``'Cost_(Max=4)'``, ``'Type_cuisine'``, ``'Address'``, ``'Area'``,
        ``'Phone_Number'`` and ``'Link'``, each mapping to a list. Fields that
        cannot be extracted fall back to ``None`` (rating, cost), ``0``
        (reviews), ``['Unknown']`` (cuisine), ``'Unknown'`` (address block) or
        ``'No Link Available'`` (link).
    """
    # VERSION 2.0
    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(content, 'html.parser')

    # NOTE(review): the hashed CSS class names below look build-specific;
    # confirm they still match the live site before relying on the output.
    blocks = soup.select('div.largerScrollablePhotos__373c0__3FEIJ')
    anchors = soup.select('h3>a')
    # Selected once up front instead of once per listing.
    address_texts = [tag.getText(separator=u' | ')
                     for tag in soup.select('div.container__373c0__19wDx')]

    link1 = 'https://www.yelp.com'
    # Missing markup surfaces as any of these: absent tags yield None
    # (AttributeError/TypeError), absent attributes raise KeyError, short
    # result lists raise IndexError, unparsable numbers raise ValueError.
    scrape_errors = (IndexError, AttributeError, KeyError, TypeError, ValueError)

    Names = []
    Ratings = []
    Reviews = []
    Costs = []
    Style = []
    Address_Block = []
    Links = []

    for idx, anchor in enumerate(anchors):
        # Decide "sponsored" from this anchor's own href rather than by name,
        # so a sponsored duplicate of an organic listing cannot slip through
        # and leave the columns longer than Names.
        if 'adredir' in (anchor.get('href') or ''):
            continue
        Names.append(anchor.text)

        # Assumes the idx-th listing block belongs to the idx-th result
        # anchor -- TODO confirm against the page structure.
        block = blocks[idx] if idx < len(blocks) else None

        # Grab Ratings: leading number of the aria-label text.
        try:
            rate = block.div.div.span.div['aria-label']
            Ratings.append(float(rate.split(' ')[0]))
        except scrape_errors:
            Ratings.append(None)

        # Grab Number of Reviews: leading integer of the review-count text.
        try:
            rev = block.div.find_all('div')[0].find_all('div')[3].text
            Reviews.append(int(rev.split(' ')[0]))
        except scrape_errors:
            Reviews.append(0)

        # Grab Cost Scale: the length of the price-range text is the score.
        try:
            cost = block.find_all('span', {'class': 'priceRange__373c0__2DY87'})[0].text
            Costs.append(len(cost))
        except scrape_errors:
            Costs.append(None)

        # Grab Cuisine Style: one single-item list per category link.
        try:
            style = block.find_all('a', {'class': 'link-size--default__373c0__1skgq'})
            Style.append([[j.text] for j in style])
        except scrape_errors:
            Style.append(['Unknown'])

        # Grab Address Block: ' | '-separated text, split later by getAddress.
        add = address_texts[idx] if idx < len(address_texts) else ''
        Address_Block.append(add if add else 'Unknown')

        # Grab Link: site root plus the block's first relative href.
        try:
            link2 = block.a.get('href')
            Links.append(link1 + link2)
        except scrape_errors:
            Links.append('No Link Available')

    info = getAddress(Address_Block)

    Phone_Numbers = info[0]
    Address = info[1]
    Area = info[2]

    return {'Names': Names, 'Rating_(Max=5)': Ratings, 'Reviews': Reviews, 'Cost_(Max=4)': Costs, 'Type_cuisine': Style,
            'Address': Address, 'Area': Area, 'Phone_Number': Phone_Numbers, 'Link': Links}


In [15]:
#Define a function that splits each ' | '-separated address block into phone number, street address, and area.

def getAddress(Address_total):
    """Split ' | '-separated address blocks into phone, address and area lists.

    Each input string is split on ``' | '`` and classified by part count:

    * three or more parts: first part is the phone number, last part is the
      area, and the parts in between are concatenated (with no separator)
      into the address;
    * one or two parts whose first part is digits once spaces, ``+``, ``(``,
      ``)`` and ``-`` are removed: first part is the phone number, the rest
      is the address, and the area is ``' '``;
    * two parts otherwise: phone is ``'Unknown'``, first part is the address,
      second part is the area;
    * a single non-phone part: phone is ``'Unknown'``, address is ``''`` and
      area is ``' '``.

    Args:
        Address_total: Iterable of address-block strings.

    Returns:
        Tuple ``(Phone_Number, Address, Area)`` of three parallel lists.
    """
    # Deletes the punctuation that may legitimately appear in a phone number.
    phone_punctuation = str.maketrans('', '', ' +()-')

    phones, streets, areas = [], [], []

    for parts in [raw.split(' | ') for raw in Address_total]:
        count = len(parts)

        if count >= 3:
            phone, street, area = parts[0], ''.join(parts[1:-1]), parts[-1]
        elif parts[0].translate(phone_punctuation).isdigit():
            phone, street, area = parts[0], ''.join(parts[1:]), ' '
        elif count == 2:
            phone, street, area = 'Unknown', parts[0], parts[1]
        else:
            phone, street, area = 'Unknown', '', ' '

        phones.append(phone)
        streets.append(street)
        areas.append(area)

    return (phones, streets, areas)

In [8]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

class YelpSpider:
    """Fetch Yelp restaurant search pages and collect the parsed rows.

    Each fetched page is handed to ``content_parser``; the mapping it returns
    is turned into a DataFrame and accumulated in ``self.data_frame``.
    """

    def __init__(self, pages_to_scrape=1, sleep_interval=-1, content_parser=None):
        """Store the scraping configuration and start with an empty frame.

        Args:
            pages_to_scrape: Number of result pages ``kickstart`` requests.
            sleep_interval: Seconds to pause after each page; values that are
                not greater than zero disable the pause.
            content_parser: Callable that takes the raw response content and
                returns data accepted by ``pd.DataFrame``.
        """
        self.pages_to_scrape = pages_to_scrape
        self.sleep_interval = sleep_interval
        self.content_parser = content_parser
        self.data_frame = pd.DataFrame([])

    def scrape_url(self, url):
        """Fetch ``url``, parse it, and append the rows to ``self.data_frame``.

        Args:
            url: Address of the page to download.

        Returns:
            ``None`` when the page was fetched (status below 300) and parsed,
            otherwise a short string describing why the page was skipped.
        """
        # Only the request itself is guarded, so parser errors are not
        # mistaken for network problems.
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.SSLError:
            print('Warning, no certificate')
            return 'no certificate'
        except requests.exceptions.Timeout:
            print("Timeout")
            return 'timeout'
        except requests.exceptions.TooManyRedirects:    # Built-in 30 max redirects
            print('Too many redirects')
            return 'redirect exceeded'
        except requests.exceptions.RequestException as e:
            return f'unknown error: {e}'

        # Check response status
        if response.status_code < 300:
            result = self.content_parser(response.content)
            self.BuildDataFrame(result)
            return None
        if 400 <= response.status_code < 500:
            return 'Forbidden Access'
        return 'Server Connection Error'

    def BuildDataFrame(self, r):
        """Build a DataFrame from ``r`` and append it to ``self.data_frame``.

        Args:
            r: Any data accepted by ``pd.DataFrame`` (here, a dict of
                equal-length columns).
        """
        page_frame = pd.DataFrame(r)
        if self.data_frame.empty:
            # Nothing accumulated yet: adopt the page as-is rather than
            # concatenating with an empty frame.
            self.data_frame = page_frame.reset_index(drop=True)
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent.
            self.data_frame = pd.concat([self.data_frame, page_frame], ignore_index=True)

    def ExportDFtoCSV(self, City):
        """Write ``self.data_frame`` to ``YelpData-<City>.csv`` without the index.

        Args:
            City: Text inserted into the output file name.
        """
        self.data_frame.to_csv(f'YelpData-{City}.csv', index=False)

    def kickstart(self):
        """Prompt for a location, scrape the configured pages, show and export.

        After the class is initiated, call this method to start the scraping
        jobs. Pages that could not be scraped are reported and skipped.
        """
        import time

        city_in = input('Enter a city:')
        state_country_in = input("Enter city's state (if in USA), or city's country (otherwise):")

        City = city_in.replace(' ', '%20')
        State_Country = state_country_in.replace(' ', '%20')

        for page in range(self.pages_to_scrape):
            # The start offset advances by 30 per page (presumably the number
            # of results Yelp lists on one page).
            n = page * 30
            status = self.scrape_url(f'https://www.yelp.com/search?find_desc=Restaurants&find_loc={City}%2C%20{State_Country}&start={n}')
            if status is not None:
                print(f'Page {page + 1} was not scraped: {status}')
            if self.sleep_interval > 0:
                time.sleep(self.sleep_interval)

        # `display` only exists inside IPython/Jupyter; fall back to print so
        # the method also works from a plain interpreter.
        try:
            display(self.data_frame)    # Displays final data frame when loop is completed
        except NameError:
            print(self.data_frame)
        self.ExportDFtoCSV(City)         # Exports final data frame to CSV.

In [20]:
'''
Execute
'''

# Number of search-result pages to request in this run.
PAGES_TO_SCRAPE = 3

# Build the spider with YelpParser as its page parser; sleep_interval keeps
# its default, so no pause is inserted between requests.
my_yelp_spider = YelpSpider(pages_to_scrape=PAGES_TO_SCRAPE, content_parser=YelpParser)

# Prompt for the location and run the scraping jobs.
my_yelp_spider.kickstart()

Enter a city:Paris
Enter city's state (if in USA), or city's country (otherwise):France
Number of Sponsored listings to remove: 30 - 30 = 0 
Names-  30 Rating -  30 Reviews -  30 Cost -  30 Cuisine -  30 Address -  30 Area -  30 Phone #s -  30 Link -  30
['Le Comptoir de la Gastronomie', 'La Coïncidence', 'Le Potager du Père Thierry', 'Le Bistrot des Augustins', 'Le Temps des Cerises', 'Le Bistro du Périgord', 'Les Antiquaires', 'Chez Janou', 'L’Avant Comptoir', 'Le Petit Canard', 'La Cave Gourmande', 'Le Volant Basque', 'Le Bistrot des Campagnes', 'Pain Vin Fromages', 'Les Cocottes', 'Le Porte Pot', 'Firmine', 'L’As du Fallafel', 'Café de Paris', 'La Fontaine de Mars', 'Frenchie Restaurant', 'Café Louise', 'Le Soufflé', 'Au Père Louis', 'Les Philosophes', 'Le Relais de l’Entrecôte', 'Gambino', 'Fraîche', 'Chez Fernand', 'Le Boui Boui']
Number of Sponsored listings to remove: 30 - 30 = 0 
Names-  30 Rating -  30 Reviews -  30 Cost -  30 Cuisine -  30 Address -  30 Area -  30 Phone #s -

Unnamed: 0,Names,Rating_(Max=5),Reviews,Cost_(Max=4),Type_cuisine,Address,Area,Phone_Number,Link
0,Le Comptoir de la Gastronomie,4.5,902,2,[[French]],34 rue Montmartre,Châtelet/Les Halles,01 42 33 31 32,https://www.yelp.com/biz/le-comptoir-de-la-gas...
1,La Coïncidence,4.5,433,2,[[French]],15 rue Mesnil,Trocadéro/Iéna,01 47 55 96 44,https://www.yelp.com/biz/la-co%C3%AFncidence-p...
2,Le Potager du Père Thierry,4.5,395,2,[[French]],16 rue des Trois Frères,Montmartre,01 53 28 26 20,https://www.yelp.com/biz/le-potager-du-p%C3%A8...
3,Le Bistrot des Augustins,4.5,335,2,"[[Bistros], [Wine Bars]]",39 quai des Grands Augustins,Saint-Michel/Odéon,01 43 54 04 41,https://www.yelp.com/biz/le-bistrot-des-august...
4,Le Temps des Cerises,4.5,310,2,[[Bistros]],31 rue de la Cerisaie,Bastille,01 42 72 08 63,https://www.yelp.com/biz/le-temps-des-cerises-...
5,Le Bistro du Périgord,4.5,332,3,[[Bistros]],71 rue Saint-Jacques,Saint-Michel/Odéon,01 43 29 67 49,https://www.yelp.com/biz/le-bistro-du-p%C3%A9r...
6,Les Antiquaires,4.5,263,2,[[French]],13 rue du Bac,Musée d'Orsay,01 42 61 08 36,https://www.yelp.com/biz/les-antiquaires-paris...
7,Chez Janou,4.0,491,2,"[[French], [Brasseries], [Bistros]]",2 rue Roger Verlomme,Bastille,01 42 72 28 41,https://www.yelp.com/biz/chez-janou-paris?osq=...
8,L’Avant Comptoir,4.5,585,2,"[[Tapas Bars], [Sushi Bars]]",Located in Hotel Relais Saint Germain,Saint-Michel/Odéon,01 42 38 47 55,https://www.yelp.com/biz/l-avant-comptoir-pari...
9,Le Petit Canard,4.5,168,2,[[French]],19 rue Henri Monnier,Pigalle,01 49 70 07 95,https://www.yelp.com/biz/le-petit-canard-paris...
