## Scraping Key Information of Wine Products

In [1]:
# Required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
import yaml
import io

In [2]:
# The online store's address is kept out of the notebook: it is read from
# web_links.yaml, and the listing URL is stored under the 'base_link' key.
with open("web_links.yaml", 'r') as stream:
    web_link = yaml.safe_load(stream)
    base_link = web_link['base_link']

In [3]:
# Shared request settings: the scraping loop reads util_dict['headers'] and
# sends it with every HTTP request so the store sees a browser-like User-Agent.
util_dict = dict(
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    },
)

### Main Loop that Scrapes all Necessary Information

In [4]:
# --- Scraper configuration ---------------------------------------------------
pages = 2  # how many listing pages to crawl, starting at page 1
headers = util_dict['headers']
REQUEST_TIMEOUT_S = 30  # abort a request instead of hanging on a stalled connection
REQUEST_DELAY_S = 4     # pause after every product page to avoid hammering the store


def _price_from_span(span):
    """Return the dollar amount shown in a price <span> as a float.

    The span text may carry extra words around the amount, so the first
    '$<number>' occurrence is extracted; thousands separators are dropped.
    Returns NaN when `span` is None or contains no dollar amount.
    """
    if span is None:
        return float('nan')
    match = re.search(r'\$\s*(\d[\d,]*(?:\.\d+)?)', span.get_text(strip=True))
    if match is None:
        return float('nan')
    return float(match.group(1).replace(',', ''))


def _spec_pairs(product_soup):
    """Return (label, value) text pairs from the first <table> of a product page.

    Rows with fewer than two <td> cells (e.g. header rows) are ignored.
    Returns an empty list when the page has no table.
    """
    table = product_soup.find('table')
    if table is None:
        return []
    pairs = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) >= 2:
            pairs.append((cells[0].get_text(strip=True), cells[1].get_text(strip=True)))
    return pairs


# One dict per product; the DataFrame is built once at the end instead of being
# grown with pd.concat inside the loops.
records = []

for pg in range(1, pages + 1):
    page_link = base_link + '?page=' + str(pg)
    result = requests.get(page_link, headers=headers, timeout=REQUEST_TIMEOUT_S)
    result.raise_for_status()  # stop early on an error page rather than parsing it
    soup = BeautifulSoup(result.content, 'html.parser')

    main_tab = soup.find('div', attrs={'class': 'boost-pfs-filter-products product-list product-list--collection'})
    # A Tag is always truthy, so compare against None explicitly.
    prod_list = main_tab.find_all('div', attrs={'class': 'product-item'}) if main_tab is not None else []
    print('Page {} has {} products'.format(pg, len(prod_list)))

    for item in prod_list:
        prod = item.find('div', attrs={'class': 'product-item__info-inner'})
        title = None
        if prod is not None:
            title = prod.find('a', attrs={'class': 'product-item__title text--strong link'})
        if title is None:
            # Without the title link there is no product page to follow.
            print('Skipping a product on page {}: title link not found'.format(pg))
            continue

        short_url = title['href']

        # Every price span carries the 'price' class; the compare-at (pre-discount)
        # price additionally carries 'price--compare'.
        price_box = prod.find('div', attrs={'class': 'product-item__price-list price-list'})
        price_spans = price_box.find_all('span', attrs={'class': 'price'}) if price_box is not None else []
        listed_span = next((s for s in price_spans if 'price--compare' not in s.get('class', [])), None)
        compare_span = next((s for s in price_spans if 'price--compare' in s.get('class', [])), None)

        listed_price = _price_from_span(listed_span)
        real_price = _price_from_span(compare_span)
        if pd.isna(real_price):
            # No compare-at price on the card: the product is not discounted.
            real_price = listed_price

        # product page scraping
        prod_pg_link = base_link + short_url
        prod_result = requests.get(prod_pg_link, headers=headers, timeout=REQUEST_TIMEOUT_S)
        prod_result.raise_for_status()
        prod_soup = BeautifulSoup(prod_result.content, 'html.parser')

        # extracting the long description (empty string when the section is absent)
        descr = prod_soup.find('div', attrs={'class': 'rte text--pull'})
        paragraphs = descr.find_all('p') if descr is not None else []
        comb_txt = ''.join(p.text for p in paragraphs if p.text not in ['', '\xa0'])

        record = {
            'wine_name': prod.find('a').text,  # text of the first link in the info block
            'long_name': title.text,
            'listed_price': listed_price,
            'real_price': real_price,
            'short_url': short_url,
            'long_description': comb_txt,
        }
        # Spec-table labels become extra columns; setdefault keeps the first value
        # for a repeated label and never overwrites the fields scraped above.
        for label, value in _spec_pairs(prod_soup):
            record.setdefault(label, value)
        records.append(record)

        time.sleep(REQUEST_DELAY_S)

# Columns appear in first-seen order; products lacking a spec get NaN there.
df_fin = pd.DataFrame(records)

Page 1 has 36 products
Page 2 has 36 products


In [5]:
# Preview the first rows of the scraped table
df_fin.head()

Unnamed: 0,wine_name,long_name,listed_price,real_price,short_url,long_description,Winery,Country,Region,Year,Grape Varietal,Size,ABV,Wine Style
0,Caymus Vineyards,Bonanza Wine Cabernet Sauvignon Lot 6 (750 ml),18.95,18.95,/products/bonanza-wine-cabernet-sauvignon-lot-...,Embark on a journey through the Wild West with...,Chuck Wagner -Caymus Vineyards,United States,California,No Vintage,Cabernet Sauvignon,750 ML,14.6%,Red Wine
0,Bogle,Juggernaut Hillside Cabernet Sauvignon 2021 (7...,17.95,17.95,/products/juggernaut-hillside-cabernet-sauvign...,Juggernaut Hillside Cabernet Sauvignon 2021 is...,Bogle Wine,United States,California,2021,Cabernet Sauvignon,750 ML,14.5%,Red Wine
0,Tasting Sets,12 Bottle World Tour Mixed Wine Tasting Set (7...,74.95,74.95,/products/12-bottle-world-tour-mixed-wine-tast...,Our experts and wine tasting panel put togethe...,,,,,,,,
0,Duckhorn Vineyards,Decoy Sonoma Cabernet Sauvignon 2021 (750 ml),19.5,19.5,/products/decoy-sonoma-cabernet-sauvignon-2021...,Discover the decadent taste of Decoy Sonoma Ca...,Duckhorn Vineyards- Decoy,United States,California,2021,Cabernet Sauvignon,750 ML,13.9%,Red Wine
0,Barefoot,Barefoot Pink Moscato (750 ml),5.95,5.95,/products/barefoot-pink-moscato-750-ml,Experience the joy of a luscious and juicy win...,Barefoot,United States,California,No Vintage,Pink Moscato,750 ML,9%,White Wine


In [None]:
# (rows, columns) of the scraped table
df_fin.shape

In [None]:
# Write the scraped table to disk; the DataFrame index is left out of the file
output_csv = 'wine_pg2.csv'
df_fin.to_csv(output_csv, index=False)

In [None]:
# Show the full description text of the first scraped product
df_fin['long_description'].iloc[0:1].values