In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

from tqdm import tqdm_notebook as tqdm

In [2]:
# Let wide frames show up to 500 columns before pandas starts eliding them.
pd.options.display.max_columns = 500

In [3]:
url_home = 'https://www.brewersfriend.com'

# Fetch the first recipe-listing page so its markup can be inspected before
# writing the scraper. The query string asks for `sort=breweddown`
# (presumably "most brewed first" -- confirm against the site).
page_num = 1
r = requests.get(
    url_home + '/homebrew-recipes/page/' + str(page_num) + '/?sort=breweddown',
    headers={'User-Agent': 'Mozilla/5.0'},
    timeout=30,  # without a timeout requests can wait on a stalled server forever
)
r.raise_for_status()  # stop here on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(r.text, 'html.parser')

# Only the beginning of the document is shown: the full page is far too long
# to be useful as cell output and bloats the saved notebook.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html ng-app="bfApp">
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
   <script crossorigin="anonymous" src="https://browser.sentry-cdn.com/4.6.4/bundle.min.js">
   </script>
   <script type="text/javascript">
    var sentryErrorsToIgnore = [
    // Ignore Ads errors that aren't hurting anything
    'Can\'t execute code from a freed script'
];

// Ignore certain errors on Edge
if ( window.navigator.userAgent.indexOf("Edge") > -1 ) {
    sentryErrorsToIgnore.push( "Unable to get property 'now' of undefined or null reference" );
    sentryErrorsToIgnore.push( "Unable to get property 'getBoundingClientRect' of undefined or null reference" );
    sentryErrorsToIgnore.push( "TypeError: Permission denied" );
    sentryErrorsToIgnore.push( "TypeError: Object expected" );
}

Sentry.init({
	dsn: 'https://1bf3f44ed68e4195ad7c6c3209f82a6a@sentry.io/1176007',
	environment: "www.brewersfriend.com",
	release: "6884337dad8deb43e

In [4]:
# Fields taken from the <td> cells of a recipe's listing row; the scraper
# pairs the i-th cell with the i-th name, so the order here is significant.
header = [
    'title', 'style', 'size', 'ABV', 'IBU', 'OG',
    'FG', 'color', 'method', 'views', 'brewed', 'rating',
]

# Fields derived from the listing row's text or from the recipe's detail table.
additional_info = [
    'size_unit',
    'author', 'author_name',
    'boil_size', 'boil_time', 'boil_grav',
    'pitch_rate',
    'temp_prim', 'temp_unit',
    'prim_method', 'prim_amount', 'prim_amount_unit',
    'creation_date',
]

# Link columns, followed by three more recipe parameters
# (efficiency, mash_thick, sugar_scale) that are kept in this list.
url_columns = [
    'url_beer', 'url_style', 'url_author',
    'efficiency', 'mash_thick', 'sugar_scale',
]

# Complete column order of the resulting frame / CSV file.
columns = [*header, *additional_info, *url_columns]

In [12]:
# Scrape the recipe listing page by page into `beer_df`.
#
# Every recipe is described by two consecutive <tr> elements of the listing
# table: a summary row (no `id`, one of two background colours) followed by a
# hidden detail row whose `id` contains "expand". A recipe is stored once its
# detail row has been parsed. Scraping stops at the first recipe whose
# "brewed" count is below MIN_BREWED, or after MAX_PAGES pages.

MAX_PAGES = 500                                # upper bound on listing pages to visit
MIN_BREWED = 2                                 # stop at the first recipe brewed fewer times
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
REQUEST_TIMEOUT = 30                           # seconds; requests never times out by default
SUMMARY_ROW_COLORS = ('#fcfcfc', '#eeeeee')    # bgcolor values that mark a summary row
OUTPUT_CSV = './data/beer_basic_data.csv'      # NOTE: the ./data directory must already exist
NUMBER_RE = r'[0-9]+(\.[0-9]+)?'               # integer or decimal number


def _first_match(pattern, text):
    """Return the text of the first match of `pattern` in `text`, or None."""
    found = re.search(pattern, text)
    return found[0] if found else None


def _field(label, value_pattern, text):
    """Return the value that follows '<label>: ' in `text`, or None if absent."""
    return _first_match(r'(?<=' + label + r': )' + value_pattern, text)


def _not_na(value):
    """Map a missing value or the placeholder 'N/A' to None; pass anything else through."""
    if value is None or value.strip() == 'N/A':
        return None
    return value


def _to_int(text):
    """Return the digits contained in `text` as an int, or None if there are none."""
    digits = re.sub(r'[^0-9]', '', '' if text is None else str(text))
    return int(digits) if digits else None


def _parse_rating(cell):
    """Return the rating shown in `cell`.

    The integer part is the number of `span.active` elements; unless that is
    5, the first number in the `style` attribute of `span.last` is added as
    hundredths.
    """
    rating = len(cell.find_all('span', {'class': 'active'}))
    if rating != 5:
        partial = cell.find('span', {'class': 'last'})
        percent = None
        if partial is not None:
            percent = _first_match(r'[0-9]+', partial.get('style') or '')
        if percent is not None:
            rating += int(percent) / 100
    return rating


def _parse_summary_row(row):
    """Return a new recipe dict (one key per entry of `columns`) filled from a summary row."""
    beer = dict.fromkeys(columns)
    # zip() stops at the shorter input, so an unexpected extra <td> cannot
    # run past the end of `header`.
    for feature, cell in zip(header, row.find_all('td')):
        text = cell.text
        beer[feature] = text

        if feature == 'title':
            # The recipe URL is taken from the second link in the title cell.
            links = cell.find_all('a', href=True)
            if len(links) > 1:
                beer['url_beer'] = url_home + links[1]['href']
        elif feature == 'style':
            links = cell.find_all('a', href=True)
            if links:
                beer['url_style'] = url_home + links[0]['href']
        elif feature == 'size':
            beer['size'] = _first_match(NUMBER_RE, text)
            # [A-Za-z], not [A-z]: the latter also matches [ \ ] ^ _ and `
            beer['size_unit'] = _first_match(r'[A-Za-z]+', text)
        elif feature == 'ABV':
            beer['ABV'] = text.replace('%', '')
        elif feature == 'rating':
            beer['rating'] = _parse_rating(cell)
        elif feature == 'color':
            beer['color'] = _first_match(NUMBER_RE, text)
    return beer


def _parse_author(tr, beer):
    """Set author_name / url_author / author on `beer` from the author row `tr`."""
    beer['author_name'] = beer['url_author'] = beer['author'] = None
    fonts = tr.find_all('font')
    if not fonts:
        return
    beer['author_name'] = fonts[0].text
    links = tr.find_all('a', href=True)
    if links:
        href = links[0]['href']
        beer['url_author'] = href
        # Numeric brewer id, when the link has the form .../brewer/<digits>
        beer['author'] = _first_match(r'(?<=brewer\/)[0-9]+', href)


def _parse_detail_row(row, beer):
    """Fill `beer` in place from the detail ("expand...") row that follows its summary row.

    The direct <tr> children of the first nested table are read by position:
    0 = title, 1 = author, 2 = boil/mash figures, 3 = fermentation/priming
    figures and creation date. Fields that are not found are set to None.
    """
    tables = row.find_all('table')
    if not tables:
        return

    for i, tr in enumerate(tables[0].find_all('tr', recursive=False)):
        if i == 0:
            title_cell = tr.find('td')
            if title_cell is not None:
                beer['title'] = title_cell.text.replace('Title:', '').strip()

        elif i == 1:
            _parse_author(tr, beer)

        elif i == 2:
            text = tr.text
            beer['boil_size'] = _field('Boil Size', NUMBER_RE, text)
            beer['boil_time'] = _field('Boil Time', r'[0-9]+', text)
            beer['boil_grav'] = _field('Boil Gravity', NUMBER_RE, text)
            beer['efficiency'] = _field('Efficiency', r'[0-9]+', text)
            beer['mash_thick'] = _field('Mash Thickness', NUMBER_RE, text)
            beer['sugar_scale'] = _field('Sugar Scale', r'[A-Za-z ]+', text)

        elif i == 3:
            text = tr.text
            beer['pitch_rate'] = _field('Pitch Rate', NUMBER_RE, text)

            # "<value>°<unit>"; the None check happens before the value is
            # touched, and a value without '°' simply has no unit.
            temp = _not_na(_field('Primary Temp', r'[A-Za-z0-9°/. ]+', text))
            if temp is None:
                beer['temp_prim'] = None
                beer['temp_unit'] = None
            else:
                parts = temp.split('°')
                beer['temp_prim'] = parts[0]
                beer['temp_unit'] = parts[1] if len(parts) > 1 else None

            beer['prim_method'] = _not_na(_field('Priming Method', r'[A-Za-z/. ]+', text))

            # "<amount> <unit>", the unit being optional
            amount = _not_na(_field('Priming Amount', r'[A-Za-z0-9.,/~ ]+', text))
            if amount is None:
                beer['prim_amount'] = None
                beer['prim_amount_unit'] = None
            else:
                parts = amount.split(' ')
                beer['prim_amount'] = parts[0]
                beer['prim_amount_unit'] = parts[1] if len(parts) > 1 else None

            beer['creation_date'] = _field('Creation Date', r'[A-Za-z0-9/: ]+', text)


beer_df = pd.DataFrame(columns=columns)
records = []                  # one dict per stored recipe; turned into a frame once per page
beer = None                   # recipe whose summary row was seen but not yet stored
idx = 0                       # number of recipes stored so far (= next row index)
reached_min_brewed = False

for page_num in range(1, MAX_PAGES + 1):

    r = requests.get(
        url_home + '/homebrew-recipes/page/' + str(page_num) + '/?sort=breweddown',
        headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    beers_table = soup.find('table')
    if beers_table is None:
        print('No table found on page', page_num, '- stopping.')
        break

    beers_collected = 0
    for row in beers_table.find_all('tr'):
        row_id = row.get('id')

        if row_id is None:
            # Rows nested inside a detail table also end up here; they carry
            # no matching bgcolor and are skipped.
            if row.get('bgcolor') in SUMMARY_ROW_COLORS:
                beer = _parse_summary_row(row)

        elif beer is not None and re.search(r'expand.*', row_id):
            _parse_detail_row(row, beer)

            # Compare as a number: the scraped value is text.
            brewed = _to_int(beer['brewed'])
            if brewed is not None and brewed < MIN_BREWED:
                reached_min_brewed = True
                break

            records.append(beer)
            beer = None       # a stray second detail row must not store the recipe twice
            beers_collected += 1
            idx += 1

    # Checkpoint after every page so an interrupted run keeps what it has.
    beer_df = pd.DataFrame(records, columns=columns)
    beer_df.to_csv(OUTPUT_CSV)
    print('Scraping page:', page_num, '\t Beers collected:', beers_collected)

    if reached_min_brewed:
        break

beer_df

Scraping page: 2 	 Beers collected: 20


KeyboardInterrupt: 

In [13]:
# Recipes whose boil_size, read as a float, is greater than 500.
beer_df.loc[beer_df['boil_size'].astype(float).gt(500)]

Unnamed: 0,title,style,size,ABV,IBU,OG,FG,color,method,views,brewed,rating,size_unit,author,author_name,boil_size,boil_time,boil_grav,pitch_rate,temp_prim,temp_unit,prim_method,prim_amount,prim_amount_unit,creation_date,url_beer,url_style,url_author,efficiency,mash_thick,sugar_scale
15,Hopped and Loaded 16 BBL,American IPA,496,7.22,113.97,16.408,3.12005,6.57,All Grain,7628,47,5.0,Gal,52796,105WestBrewing,531,75,15.4,0.75,68,F,,,,10/27/2015 07:07 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,80,1.3,Plato


In [171]:
# Debugging aid: display whatever <tr> is currently bound to `row`, the loop
# variable left behind by the scraping cell.
row

<tr id="expand382205" style="display:none;overflow:hidden;"><td colspan="12" style="border:1px solid #eeefff;"> <table bgcolor="#eeefff" border="0" cellspacing="1" height="100" style="margin:0px;border-radius: 2px;" width="100%">
<tr>
<td bgcolor="white" colspan="6" style="vertical-align: top;" valign="top">
<b>Title:</b> Munchen Grosses Bier - Jamil	  </td>
</tr>
<tr>
<td bgcolor="white" colspan="6" style="vertical-align: top;" valign="top">
<table cellpadding="0" cellspacing="0" style="margin:0px;width:1px;">
<tr>
<td align="left" valign="middle">
<b>Author:</b>
</td>
<td style="white-space: nowrap;" valign="middle">
<a href="https://www.brewersfriend.com/homebrew/brewer/45056"><font style="font-weight:700;color:#777;font-size:13px;line-height:1;">TZ</font></a> </td>
</tr>
</table>
</td>
</tr>
<tr>
<td bgcolor="white"><b>Boil Size:</b> 31.2  L</td>
<td bgcolor="white"><b>Boil Time:</b> 90</td>
<td bgcolor="white"><b>Boil Gravity:</b> 1.04</td>
<td bgcolor="white"><b>Efficiency:</b> 6

In [78]:
# Scratch check: print the first entry of `header` and compare it with 'Title '.
# NOTE(review): the recorded output prints "Title " while the comparison is
# False, and `header` as defined in this notebook starts with 'title' -- this
# looks like stale kernel state; confirm or drop the cell.
title = header[0]
print(title)
title == 'Title '

Title 


False

In [73]:
# Scratch check: a plain string compares equal to an identical literal.
tmp = 'asd'
tmp == 'asd'

True