## Initialize

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
import time
import random
import datetime

In [2]:
# Shared fake_useragent generator; the request functions below read
# `ua.random` to pick a User-Agent header for each request.
ua = UserAgent()

## Find the total number of pages

In [3]:
def total_pages(url, timeout=30):
    """Return the number of result pages listed for a search URL.

    The count is read from the text of the link inside the last ``<li>`` of
    the ``div.pagination`` element in the response.

    Args:
        url: Search-results URL to request.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection cannot hang the scrape forever.

    Returns:
        The total page count as an ``int``, or ``1`` when the response has no
        pagination block at all.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If a pagination block exists but its last entry does not
            contain a page number.
    """
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    pagination = soup.find('div', class_="pagination")
    if pagination is None:
        # No pagination block: treat the response as a single page of results.
        return 1

    entries = pagination.find_all('li')
    last_link = entries[-1].find('a') if entries else None
    if last_link is None:
        raise ValueError(f"Could not read the page count from {url}")

    # The text of the last pagination link is the total number of pages.
    return int(last_link.get_text().strip())

## Function to parse each item's page

In [4]:
def help_retrieve(dic, key):
    """Return ``dic[key]``, or NaN when ``key`` is not present.

    Args:
        dic: Mapping to look the key up in.
        key: Key to retrieve.

    Returns:
        The stored value, or ``np.nan`` as the missing-value marker.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    return dic.get(key, np.nan)

In [5]:
def parse_item(url, timeout=30):
    """Request one item's detail page and extract its fields.

    Args:
        url: Ending of the item's URL; it is appended to
            ``https://www.winemag.com/buying-guide``.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        A 13-tuple ``(description, taster, designation, variety, appellation,
        winery, alcohol, bottle_size, category, importer, date_published,
        user_avg_rating, related_items)``. Fields not found on the page are
        ``np.nan``; ``related_items`` is a list of ``data-review-id`` values.

    Raises:
        RuntimeError: If the request fails (network error or timeout) or the
            response status code is not 200. Callers catch this to skip the
            item's details.
    """
    item_url = 'https://www.winemag.com/buying-guide' + url
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    try:
        response = requests.get(item_url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # Surface network failures as RuntimeError, the exception callers
        # already handle, so one bad item does not abort its whole page.
        print(f"{item_url} request failed! Error: {err!r}. Skip it.")
        raise RuntimeError(f"Request to {item_url} failed") from err

    if response.status_code != 200:
        print(f"{item_url} request failed! Status code: {response.status_code}. Skip it.")
        raise RuntimeError(f"{item_url} returned status code {response.status_code}")

    soup = BeautifulSoup(response.content, 'lxml')

    description_tag = soup.find('p', class_='description')
    description = description_tag.get_text() if description_tag is not None else np.nan
    taster_tag = soup.find('span', class_='taster-area')
    taster = taster_tag.get_text() if taster_tag is not None else np.nan

    # Each `li.row` is read as a label/value pair held in its first two divs.
    info_dict = {}
    for element in soup.find_all('li', class_='row'):
        cells = element.find_all('div')
        if len(cells) < 2:
            continue    # not a label/value row; nothing to record
        info_dict[cells[0].get_text().strip()] = cells[1].get_text().strip()

    designation = info_dict.get("Designation", np.nan)
    variety = info_dict.get("Variety", np.nan)
    appellation = info_dict.get("Appellation", np.nan)
    winery = info_dict.get("Winery", np.nan)
    alcohol = info_dict.get("Alcohol", np.nan)
    bottle_size = info_dict.get("Bottle Size", np.nan)
    category = info_dict.get("Category", np.nan)
    importer = info_dict.get("Importer", np.nan)
    date_published = info_dict.get("Date Published", np.nan)
    user_avg_rating = info_dict.get("User Avg Rating", np.nan)

    # The placeholder text shown for unrated items is stored as missing.
    if user_avg_rating == 'Not rated yet [Add Your Review]':
        user_avg_rating = np.nan

    # related_items holds a list of related items' ids
    related_items = []
    for element in soup.find_all('li', class_='review-item'):
        link = element.find('a')
        if link is not None:
            related_items.append(link.get('data-review-id'))

    return (description, taster, designation, variety, appellation, winery,
            alcohol, bottle_size, category, importer, date_published,
            user_avg_rating, related_items)

## Function to parse each page

In [6]:
# Only the ending of each item's URL is stored to save memory, because the
# leading part is the same for every item. The dots are escaped so they match
# a literal "." in the host name rather than any character.
pattern = re.compile(r'https://www\.winemag\.com/buying-guide(?P<ending>.*)')

def parse_page(base_url, i, timeout=30):
    """Request one listing page and parse every item on it.

    Args:
        base_url: Search URL containing a ``page=<number>`` query parameter.
        i: Page number substituted into ``base_url``.
        timeout: Seconds to wait for the server; passed to ``requests.get``.

    Returns:
        A list with one tuple per item: ``(item_id, name, rating, price,
        item_url)`` followed by the 13 detail fields from ``parse_item``.
        The detail fields are all ``np.nan`` when the item's page could not
        be fetched. Items whose listing markup is incomplete are skipped.

    Raises:
        RuntimeError: If the page request fails (network error or timeout) or
            the response status code is not 200.
    """
    url = re.sub(r"page=\d+", f"page={i}", base_url)
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f"Page #{i} request failed! Error: {err!r}. Skip it.")
        raise RuntimeError(f"Request for page #{i} failed") from err

    if response.status_code != 200:
        print(f"Page #{i} request failed! Status code: {response.status_code}. Skip it.")
        raise RuntimeError(f"Page #{i} returned status code {response.status_code}")

    soup = BeautifulSoup(response.content, 'lxml')
    page = []

    for element in soup.find_all('li', class_="review-item"):
        try:
            name = element.find('h3').get_text()
            rating = element.find('span', class_="rating").find('strong').get_text()
            price = element.find('span', class_="price").get_text()
            link = element.find('a', class_="review-listing row")
            # Keep only the part of the link after the shared prefix.
            item_url = pattern.search(link.get('href')).group('ending')
            item_id = link.get('data-review-id')
        except (AttributeError, TypeError):
            # A required tag, the href, or the expected URL prefix is missing.
            print(f"Some items are missed from page {i}: {url}. Skip it.")
            continue

        try:
            # Drop the currency sign and thousands separators before converting.
            price = float(price.replace('$', '').replace(',', ''))
        except ValueError:
            price = np.nan

        try:
            details = parse_item(item_url)
        except (RuntimeError, requests.RequestException):
            # Keep the listing-level data even when the detail page fails.
            details = (np.nan,) * 13

        page.append((item_id, name, rating, price, item_url) + details)

    return page

## Scrape all the pages

In [7]:
def scrape_all(url):
    """Scrape every result page reachable from a base search URL.

    Args:
        url: Base search URL; its page count is obtained from ``total_pages``
            and each page is fetched with ``parse_page``.

    Returns:
        A list of item tuples collected from all pages that could be parsed.
        Pages that fail are skipped.

    Raises:
        Whatever ``total_pages`` raises, since the page count is read before
        the per-page error handling starts.
    """
    result = []
    start = datetime.datetime.now()
    pages = total_pages(url)

    for i in range(1, pages + 1):
        try:
            result.extend(parse_page(url, i))
        except RuntimeError:
            # parse_page has already printed why this page failed.
            pass
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the scrape, and
            # report what actually went wrong.
            print(f"Page #{i} raised {err!r}. No worry! I'm still working :)")
        else:
            # Short random pause after each successfully scraped page.
            time.sleep(random.random())
        finally:
            if i % 10 == 0:
                print(f"Process overview: {i} pages have been scraped...")

    end = datetime.datetime.now()
    elapsed = int((end - start).total_seconds())
    hour, remainder = divmod(elapsed, 3600)
    minute, second = divmod(remainder, 60)
    print(f"For base url: {url}")
    print(f"Scraping has been done in {hour}h {minute}min {second}s. Congrats!")

    return result

In [8]:
def scrape_url_list(urls, file_name="PortugueseWines_WineEnthusiastScrape.csv"):
    """Scrape several base URLs and save the combined result as a CSV file.

    Args:
        urls: Iterable of base search URLs, each passed to ``scrape_all``.
        file_name: Path of the CSV file to write (UTF-8 encoded).

    Returns:
        A ``pandas.DataFrame`` with one row per scraped item. A base URL whose
        scrape raises is reported and skipped.
    """
    columns = [
        'id', 'name', 'rating', 'price', 'item_url', 'description', 'taster',
        'designation', 'variety', 'appellation', 'winery', 'alcohol',
        'bottle_size', 'category', 'importer', 'date_published',
        'user_avg_rating', 'related_items',
    ]
    result = []

    for url in urls:
        print(f"Begin scraping from base url: {url}")
        try:
            result.extend(scrape_all(url))
        except Exception as err:
            # Catch Exception rather than everything, so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still abort the run, and show
            # the reason for the failure.
            print(f"Scraping from base url: {url} FAILED! Going to the next one...")
            print(f"Reason: {err!r}")

    df = pd.DataFrame(result, columns=columns)
    df.to_csv(file_name, encoding='utf-8')

    return df

In [9]:
# Base search URLs to scrape. Each must contain a `page=<number>` query
# parameter, which parse_page rewrites to request the individual pages.
urls = ['https://www.winemag.com/?s=&drink_type=wine&country=Portugal&page=1'
       ]

## Begin scraping!

In [10]:
# Run the scrape for every base URL; scrape_url_list also writes the result to
# its default CSV file name.
df = scrape_url_list(urls)

Begin scraping from base url: https://www.winemag.com/?s=&drink_type=wine&country=Portugal&page=1
Process overview: 10 pages have been scraped...
Process overview: 20 pages have been scraped...
Process overview: 30 pages have been scraped...
Process overview: 40 pages have been scraped...
Process overview: 50 pages have been scraped...
Process overview: 60 pages have been scraped...
Page #66 request failed! Status code: 502. Skip it.
Process overview: 70 pages have been scraped...
Page #80 request failed! Status code: 404. Skip it.
Process overview: 80 pages have been scraped...
Process overview: 90 pages have been scraped...
Process overview: 100 pages have been scraped...
Process overview: 110 pages have been scraped...
Process overview: 120 pages have been scraped...
Process overview: 130 pages have been scraped...
Process overview: 140 pages have been scraped...
Process overview: 150 pages have been scraped...
Process overview: 160 pages have been scraped...
Process overview: 170 p

In [11]:
# Show the scraped DataFrame as the cell's output.
df

Unnamed: 0,id,name,rating,price,item_url,description,taster,designation,variety,appellation,winery,alcohol,bottle_size,category,importer,date_published,user_avg_rating,related_items
0,393609,Casa Ferreirinha 2018 Quinta da Leda Red (Douro),96,75.0,/casa-ferreirinha-2018-quinta-da-leda-red-douro/,\n This wine comes from o...,Roger Voss,Quinta da Leda,Portuguese Red,"Douro, Portugal",Casa Ferreirinha,14%,750 ml,Red,"Evaton, Inc",7/1/2022,,"[386828, 384780, 385019]"
1,396635,Quinta Vale D. Maria 2019 Vinha do Rio Red (Do...,95,130.0,/quinta-vale-d-maria-2019-vinha-do-rio-red-douro/,\n Named after the Rio To...,Roger Voss,Vinha do Rio,Portuguese Red,"Douro, Portugal",Quinta Vale D. Maria,15.5%,750 ml,Red,Aveleda Inc,7/1/2022,,"[354536, 354064, 344748]"
2,396563,Quinta do Vallado 2018 Vinha da Granja Red (Do...,95,270.0,/quinta-do-vallado-2018-vinha-da-granja-red-do...,\n Thirty-four-old vine v...,Roger Voss,Vinha da Granja,Portuguese Red,"Douro, Portugal",Quinta do Vallado,14.5%,750 ml,Red,Quintessential Wines,7/1/2022,,"[348685, 236440, 393609, 396563, 396635, 393614]"
3,396175,Aveleda 2017 Quinta d'Aguieira Touriga Naciona...,94,60.0,/aveleda-2017-quinta-daguieira-touriga-naciona...,"\n The wine, with its ric...",Roger Voss,Quinta d'Aguieira,"Touriga Nacional, Portuguese Red","Bairrada, Portugal",Aveleda,14%,750 ml,Red,Aveleda Inc,7/1/2022,,"[386695, 386538, 384824]"
4,396201,Conde Vimioso 2017 Vinha do Convento Red (Tejo),94,25.0,/conde-vimioso-2017-vinha-do-convento-red-tejo/,\n This small-production ...,Roger Voss,Vinha do Convento,Portuguese Red,"Tejo, Portugal",Conde Vimioso,14.5%,750 ml,Red,Terroir Selections,7/1/2022,,"[396551, 396166, 396439]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11605,109592,Offley 2005 Late Bottled Vintage (Port),88,24.0,/offley-2005-late-bottled-vintage-port-blend-p...,"\n Big, bold fruity wine,...",Roger Voss,Late Bottled Vintage,"Port, Port Blend","Port, Portugal",Offley,,750 ml,Port/Sherry,"Evaton, Inc",3/1/2010,,"[393024, 392457, 381592]"
11606,109580,Warre's 2000 Late Bottled Vintage (Port),88,62.0,/warres-2000-late-bottled-vintage-port-blend-p...,"\n A firm wine, dry in st...",Roger Voss,Late Bottled Vintage,"Port, Port Blend","Port, Portugal",Warre's,,750 ml,Port/Sherry,Vineyard Brands,3/1/2010,,"[354953, 357536, 358080]"
11607,109631,Poças NV Special Reserve Ruby (Port),88,19.0,/pocas-nv-special-reserve-ruby-port-109631/,,,,,,,,,,,,,
11608,109577,Quinta da Gaivosa 1999 Late Bottled Vintage (...,88,,/quinta-gaivosa-1999-late-bottled-vintage-port...,,,,,,,,,,,,,
