In [1]:
!pip install fake_useragent
!pip install re



ERROR: Could not find a version that satisfies the requirement re (from versions: none)
ERROR: No matching distribution found for re


## Initialize

In [2]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import re
import time
import random
import datetime

In [3]:
# Shared UserAgent instance: the scraping functions below read `ua.random`
# to send a randomly chosen User-Agent header with each request.
ua = UserAgent()

## Find the total number of pages

In [4]:
def total_pages(url, timeout=30):
    """Return the total number of result pages advertised by a search URL.

    The count is read from the text of the last ``<li>`` inside the page's
    ``div.pagination`` element.

    Parameters
    ----------
    url : str
        Search-results URL to inspect.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the whole scrape.

    Returns
    -------
    int
        Number of pages; 1 when the page has no pagination block.

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a 4xx/5xx response.
    ValueError
        If the last pagination entry is not a number.
    """
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    pagination = soup.find('div', class_="pagination")
    if pagination is None:
        # NOTE(review): presumably a result set that fits on one page renders
        # no pagination block -- confirm against the live site.
        return 1

    entries = pagination.find_all('li')
    if not entries:
        return 1

    last_entry = entries[-1]
    # Prefer the anchor text (as before) but tolerate a bare <li> without a link.
    label = last_entry.find('a') or last_entry
    return int(label.get_text().strip().replace(',', ''))

## Function to parse each item's page

In [5]:
def help_retrieve(dic, key):
    """Return ``dic[key]``, or NaN when ``key`` is absent.

    ``np.nan`` is used instead of the ``np.NaN`` alias because the alias was
    removed in NumPy 2.0 and raises ``AttributeError`` there.
    """
    return dic.get(key, np.nan)

In [6]:
def parse_item(url, timeout=30):
    """Fetch one wine's detail page and extract its attributes.

    Parameters
    ----------
    url : str
        Ending of the item URL; it is appended to
        ``https://www.winemag.com/buying-guide``.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    tuple
        13 values in this order: description, taster, designation, variety,
        appellation, winery, alcohol, bottle_size, category, importer,
        date_published, user_avg_rating, related_items. Missing fields are
        NaN; ``related_items`` is a list of related review ids.

    Raises
    ------
    RuntimeError
        If the request fails (network error, timeout, or a non-200 status).
        Network errors are converted to RuntimeError so the caller's existing
        ``except RuntimeError`` skips just this item rather than the page.
    """
    item_url = 'https://www.winemag.com/buying-guide' + url
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    try:
        response = requests.get(item_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"{item_url} request failed! {type(exc).__name__}: {exc}. Skip it.")
        raise RuntimeError(f"request to {item_url} failed") from exc

    if response.status_code != 200:
        print(f"{item_url} request failed! Status code: {response.status_code}. Skip it.")
        raise RuntimeError(f"{item_url} returned status {response.status_code}")

    soup = BeautifulSoup(response.content, 'lxml')

    description_tag = soup.find('p', class_='description')
    description = description_tag.get_text() if description_tag is not None else np.nan

    taster_tag = soup.find('span', class_='taster-area')
    taster = taster_tag.get_text() if taster_tag is not None else np.nan

    # Each 'row' list item holds a label <div> followed by a value <div>.
    info_dict = {}
    for element in soup.find_all('li', class_='row'):
        cells = element.find_all('div')
        if len(cells) < 2:
            # Not a label/value row; skipping avoids an IndexError that
            # would otherwise discard the whole item (and its page).
            continue
        info_dict[cells[0].get_text().strip()] = cells[1].get_text().strip()

    detail_keys = ("Designation", "Variety", "Appellation", "Winery", "Alcohol",
                   "Bottle Size", "Category", "Importer", "Date Published")
    details = tuple(help_retrieve(info_dict, key) for key in detail_keys)

    user_avg_rating = help_retrieve(info_dict, "User Avg Rating")
    if user_avg_rating == 'Not rated yet [Add Your Review]':
        # Placeholder text shown for unrated wines; store it as missing.
        user_avg_rating = np.nan

    # related_items holds a list of related items' ids
    related_items = []
    for element in soup.find_all('li', class_='review-item'):
        anchor = element.find('a')
        if anchor is not None:
            related_items.append(anchor.get('data-review-id'))

    return (description, taster) + details + (user_avg_rating, related_items)

## Function to parse each page

In [7]:
# only store the ending of each item's url to save memory, because the former parts are the same for all
# The dots are escaped so they match a literal '.' rather than any character.
pattern = re.compile(r'https://www\.winemag\.com/buying-guide(?P<ending>.*)')

# request and parse each page
# return a list with items in this page as tuples inside
def parse_page(base_url, i, timeout=30):
    """Scrape one search-results page together with every item listed on it.

    Parameters
    ----------
    base_url : str
        Search URL containing a ``page=<n>`` query parameter.
    i : int
        Page number substituted into ``base_url``.
    timeout : float, optional
        Seconds to wait for the results page before giving up.

    Returns
    -------
    list of tuple
        One 18-tuple per item: (item_id, name, rating, price, item_url)
        followed by the 13 values returned by ``parse_item``.

    Raises
    ------
    RuntimeError
        If the results page cannot be fetched or returns a non-200 status.
    """
    url = re.sub(r"page=\d+", f"page={i}", base_url)
    headers = {'User-Agent': ua.random}    # randomly choose a user-agent
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Page #{i} request failed! {type(exc).__name__}: {exc}. Skip it.")
        raise RuntimeError(f"request for page #{i} failed") from exc

    if response.status_code != 200:
        print(f"Page #{i} request failed! Status code: {response.status_code}. Skip it.")
        raise RuntimeError(f"page #{i} returned status {response.status_code}")

    soup = BeautifulSoup(response.content, 'lxml')
    page = []

    for element in soup.find_all('li', class_="review-item"):
        try:
            name = element.find('h3').get_text()
            rating = element.find('span', class_="rating").find('strong').get_text()
            price = element.find('span', class_="price").get_text()
            # The link lookups live inside the try: a missing anchor or an
            # href that does not match `pattern` must skip only this item,
            # not abort the whole page.
            link = element.find('a', class_="review-listing row")
            item_url = pattern.search(link.get('href')).group('ending')
            item_id = link.get('data-review-id')
        except (AttributeError, TypeError):
            print(f"Some items are missed from page {i}: {url}. Skip it.")
            continue

        try:
            # Drop the currency sign and thousands separators before converting.
            price = float(price.replace('$', '').replace(',', ''))
        except ValueError:
            price = np.nan

        try:
            details = parse_item(item_url)
        except (RuntimeError, requests.RequestException):
            # Keep the listing-level data even when the detail page is unavailable.
            details = (np.nan,) * 13

        page.append((item_id, name, rating, price, item_url) + details)

    return page

## Scrape all the pages

In [8]:
def scrape_all(url):
    """Scrape every results page reachable from one base search URL.

    Parameters
    ----------
    url : str
        Base search URL; its page count comes from ``total_pages`` and each
        page is fetched through ``parse_page``.

    Returns
    -------
    list of tuple
        Item tuples from all pages that could be scraped. Pages that fail
        are reported and skipped.
    """
    result = []
    start = datetime.datetime.now()
    pages = total_pages(url)

    for i in range(1, pages + 1):
        try:
            result.extend(parse_page(url, i))
            time.sleep(random.random())    # short random pause between pages
        except RuntimeError:
            # parse_page has already printed why this page was skipped.
            pass
        except Exception as exc:
            # Catch Exception rather than using a bare except so Ctrl-C still
            # stops the run, and report what actually went wrong.
            print(f"Page #{i} raised {type(exc).__name__}: {exc}. Skip it.")
        finally:
            if i % 10 == 0:
                print(f"Process overview: {i} pages have been scraped...")

    elapsed = int((datetime.datetime.now() - start).total_seconds())
    hour, remainder = divmod(elapsed, 3600)
    minute, second = divmod(remainder, 60)
    print(f"For base url: {url}")
    print(f"Scraping has been done in {hour}h {minute}min {second}s. Congrats!")

    return result

In [9]:
def scrape_url_list(urls, file_name="raw_data.csv"):
    """Scrape several base search URLs and save the combined result as CSV.

    Parameters
    ----------
    urls : iterable of str
        Base search URLs, each passed to ``scrape_all``.
    file_name : str, optional
        Path of the CSV file to write (UTF-8 encoded).

    Returns
    -------
    pandas.DataFrame
        One row per scraped item. A base URL whose scrape fails is reported
        and skipped.
    """
    columns = ['id', 'name', 'rating', 'price', 'item_url', 'description', 'taster',
               'designation', 'variety', 'appellation', 'winery', 'alcohol', 'bottle_size',
               'category', 'importer', 'date_published', 'user_avg_rating', 'related_items']
    result = []

    for url in urls:
        print(f"Begin scraping from base url: {url}")
        try:
            result.extend(scrape_all(url))
        except Exception as exc:
            # Catch Exception rather than using a bare except so Ctrl-C still
            # stops the run, and show the reason instead of hiding it.
            print(f"Scraping from base url: {url} FAILED! Going to the next one...")
            print(f"Reason: {type(exc).__name__}: {exc}")

    df = pd.DataFrame(result, columns=columns)
    df.to_csv(file_name, encoding='utf-8')

    return df

# Base search URLs, one per rating range (some further restricted by price range).
# Every URL starts at page=1.
urls = [
    'https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=94.0-97.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=80.0-82.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=90.0-93.99&price=16.0-25.99,26.0-40.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=83.0-86.99&price=41.0-60.99,61.0-75.99,76.0-99.99,100.0-199.99,200.0-*&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=90.0-93.99&price=1.0-15.99,41.0-60.99,61.0-75.99,76.0-99.99,100.0-199.99,200.0-*&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=83.0-86.99&price=1.0-15.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=87.0-89.99&price=1.0-15.99,26.0-40.99,41.0-60.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=83.0-86.99&price=26.0-40.99&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=87.0-89.99&price=16.0-25.99,61.0-75.99,76.0-99.99,100.0-199.99,200.0-*&drink_type=wine&page=1&search_type=reviews',
    'https://www.winemag.com/?s=&rating=83.0-86.99&price=16.0-25.99&drink_type=wine&page=1&search_type=reviews',
]

In [10]:
# Rebinds `urls` to a single base URL (rating 98.0 and up), replacing the
# longer list assigned in the previous cell, so the run below scrapes only this search.
urls = ['https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=1&search_type=reviews']

## Begin scraping!

In [11]:
# Scrape every base URL in `urls`; the result is returned as a DataFrame and
# also written to the default output file, raw_data.csv.
df = scrape_url_list(urls)

Begin scraping from base url: https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=1&search_type=reviews
No worry! I'm still working :)
https://www.winemag.com/buying-guide/domaine-de-la-janasse-2019-vieilles-vignes-red-chateauneuf-du-pape/ request failed! Status code: 502. Skip it.
Process overview: 10 pages have been scraped...
Some items are missed from page 13: https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=13&search_type=reviews. Skip it.
https://www.winemag.com/buying-guide/realm-cellars-2016-moonracer-red-napa-valley/ request failed! Status code: 502. Skip it.
Process overview: 20 pages have been scraped...
Process overview: 30 pages have been scraped...
Some items are missed from page 32: https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=32&search_type=reviews. Skip it.
Some items are missed from page 33: https://www.winemag.com/?s=&rating=98.0-*&drink_type=wine&page=33&search_type=reviews. Skip it.
Some items are missed from page 33: http

In [12]:
# Display the scraped DataFrame.
df

Unnamed: 0,id,name,rating,price,item_url,description,taster,designation,variety,appellation,winery,alcohol,bottle_size,category,importer,date_published,user_avg_rating,related_items
0,395717,Ramey 2019 Rochioli Vineyard Chardonnay (Russi...,100,75.0,/ramey-2019-rochioli-vineyard-chardonnay-russi...,\n There is a beautiful m...,Virginie Boone,Rochioli Vineyard,Chardonnay,"Russian River Valley, Sonoma, California, US",Ramey,14.5%,750 ml,White,,7/1/2022,,"[395716, 395714, 395712]"
1,395716,Ramey 2019 Hyde Vineyard Chardonnay (Carneros),98,75.0,/ramey-2019-hyde-vineyard-chardonnay-carneros/,\n From the great site—a ...,Virginie Boone,Hyde Vineyard,Chardonnay,"Carneros, Napa-Sonoma, California, US",Ramey,14.5%,750 ml,White,,7/1/2022,,"[395717, 395714, 395712]"
2,395714,Ramey 2019 Ritchie Vineyard Chardonnay (Russia...,98,75.0,/ramey-2019-ritchie-vineyard-chardonnay-russia...,\n This is an outstanding...,Virginie Boone,Ritchie Vineyard,Chardonnay,"Russian River Valley, Sonoma, California, US",Ramey,13.5%,750 ml,White,,7/1/2022,,"[395717, 395716, 395712]"
3,392668,Tenuta San Guido 2019 Sassicaia (Bolgheri Sas...,99,255.0,/tenuta-san-guido-2019-sassicaia-bolgheri-sass...,\n Enticing aromas of blu...,Kerin O’Keefe,Sassicaia,"Red Blends, Red Blends","Bolgheri Sassicaia, Tuscany, Italy",Tenuta San Guido,14%,750 ml,Red,Kobrand,5/1/2022,,"[369577, 342191, 333217]"
4,381708,Domaine Pierre Usseglio et Fils 2018 Cuvée de ...,99,125.0,/domaine-pierre-usseglio-et-fils-2018-cuvee-de...,\n Streaks of black fig a...,Anna Lee C. Iijima,Cuvée de mon Aïeul,Rhône-style Red Blend,"Châteauneuf-du-Pape, Rhône Valley, France",Domaine Pierre Usseglio et Fils,15.5%,750 ml,Red,Maison Barriere USA,5/1/2022,,"[363800, 319588, 17247]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
624,379442,Château Latour 2020 Barrel Sample (Pauillac),98,,/chateau-latour-2020-barrel-sample-pauillac/,\n 97–99. Barrel Sample. ...,Roger Voss,Barrel Sample,Bordeaux-style Red Blend,"Pauillac, Bordeaux, France",Château Latour,,750 ml,Red,Château Latour,1/1/1970,,"[384511, 395903, 396227]"
625,350785,Château Pavie 2019 Barrel Sample (Saint-Émilion),98,,/chateau-pavie-2019-barrel-sample-saint-emilion/,\n 97–99. Barrel Sample. ...,Roger Voss,Barrel Sample,Bordeaux-style Red Blend,"Saint-Émilion, Bordeaux, France",Château Pavie,,750 ml,Red,Vignobles Perse,1/1/1970,,"[384511, 395903, 396227]"
626,350786,Château Pichon Baron 2019 Barrel Sample (Paui...,98,,/chateau-pichon-baron-2019-barrel-sample-pauil...,\n 97–99. Barrel Sample. ...,Roger Voss,Barrel Sample,Bordeaux-style Red Blend,"Pauillac, Bordeaux, France",Château Pichon Baron,,750 ml,Red,Millésima USA,1/1/1970,,"[384511, 395903, 396227]"
627,350788,Château Pontet-Canet 2019 Barrel Sample (Paui...,98,,/chateau-pontet-canet-2019-barrel-sample-pauil...,\n 97–99. Barrel Sample. ...,Roger Voss,Barrel Sample,Bordeaux-style Red Blend,"Pauillac, Bordeaux, France",Château Pontet-Canet,,750 ml,Red,Château Pontet-Canet,1/1/1970,,"[384511, 395903, 396227]"
