In [141]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

from tqdm import tqdm_notebook as tqdm

In [68]:
pd.set_option('display.max_columns', 500)

In [61]:
url_home = 'https://www.brewersfriend.com'

page_num = 1
r = requests.get(url_home + '/homebrew-recipes/page/' + str(page_num) + '/?sort=breweddown', headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html ng-app="bfApp">
 <head>
  <script crossorigin="anonymous" src="https://browser.sentry-cdn.com/4.6.4/bundle.min.js">
  </script>
  <script type="text/javascript">
   Sentry.init({
	dsn: 'https://1bf3f44ed68e4195ad7c6c3209f82a6a@sentry.io/1176007',
	environment: "www.brewersfriend.com",
	release: "9517c21e46f20f07a55a7a1d8d1adcc4564d987f",
	blacklistUrls: [
        // Ignore file:// URLs when people save pages they shouldn't
        /file:\/\//
	],
	whitelistUrls: [
        'www.brewersfriend.com',
        'beta.brewersfriend.com',
        'josh.brewersfriend.com'
    ]
});

Sentry.configureScope(function(scope) {
	scope.setUser({"ip_address":"94.21.233.128"});
});
  </script>
  <script>
   dataLayer = (typeof dataLayer === 'undefined') ? [] : dataLayer;

dataLayer.push({'bf_email': ""});

dataLayer.push({'bf_homepage_photo': ""});
  </script>
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getEl




In [71]:
header = [
    'title',
    'style',
    'size',
    'ABV',
    'IBU',
    'OG',
    'FG',
    'color',
    'method',
    'views',
    'brewed',
    'rating'
]

additional_info = [
    'size_unit',
    'author',
    'author_name',
    'boil_size',
    'boil_time',
    'boil_grav',
    'pitch_rate',
    'temp_prim',
    'temp_unit',
    'prim_method',
    'prim_amount',
    'prim_amount_unit',
    'creation_date'
]

url_columns = [
    'url_beer',
    'url_style',
    'url_author',
    'efficiency',
    'mash_thick',
    'sugar_scale'
]
columns = header + additional_info + url_columns

In [173]:
beer_df = pd.DataFrame(columns = columns)
idx = 0

page_num = 1
beers_collected = 0
while page_num <= 325:
    
    r = requests.get(url_home + '/homebrew-recipes/page/' + str(page_num) + '/?sort=breweddown', headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(r.text, 'html.parser')
    beers_table = soup.find('table')

    for row in beers_table.find_all('tr'):

        if (row.get('id') == None) & ((row.get('bgcolor') == '#fcfcfc') | (row.get('bgcolor') == '#eeeeee')):

            beer = pd.DataFrame(index=[idx], columns=columns)
            for i,cell in enumerate(row.find_all('td')):

                feature = header[i]
                beer[feature] = cell.text              

                if feature == 'title':
                    #get url
                    beer['url_beer'] = url_home + cell.find_all('a', href=True)[1]['href']

                elif feature == 'style':
                    #get url
                    beer['url_style'] = url_home + cell.find_all('a', href=True)[0]['href'] 

                elif feature == 'size':
                    txt = cell.text
                    beer['size'] = re.search(r'[0-9](\.[0-9]+)?', txt)[0]
                    beer['size_unit'] = re.search(r'[A-z]+', txt)[0]

                elif feature == 'ABV':
                    beer['ABV'] = re.sub(r'%', '', cell.text)

                elif feature == 'rating':
                    #rating integer part
                    beer['rating'] = len(cell.find_all('span', {'class':'active'})) 
                    #rating fractional part
                    if beer['rating'].values[0] != 5:
                        beer['rating'] += int(re.search(r'[0-9]+', cell.find('span', {'class':'last'})['style'])[0]) / 100

                elif feature == 'color':
                    beer['color'] = re.search(r'[0-9]+', cell.text)[0]

        elif (row.get('id') != None):
            if re.search(r'expand*', row.get('id')):

                info_table = row.find_all('table')[0]

                for i,tr in enumerate(info_table.findChildren('tr', recursive=False)):

                    #Title
                    if i == 0:
                        beer['title'] = tr.findChild('td').text.replace('Title:', '').strip()

                    #Author
                    elif i == 1:
                        if len(tr.find_all('font')) > 0:
                            beer['author_name'] = tr.find_all('font')[0].text
                            url = tr.find_all('a', href=True)
                            if len(url) > 0:
                                beer['url_author'] = url[0]['href']
                                if re.search(r'(?<=brewer\/)[0-9]+', url[0]['href']):
                                    beer['author'] = re.search(r'(?<=brewer\/)[0-9]+', url[0]['href'])[0]
                                else:
                                    beer['author'] = None
                            else: 
                                beer['url_author']  = None 
                                beer['author']      = None
                        else:
                            beer['author_name'] = None
                            beer['url_author']  = None 
                            beer['author']      = None

                   #Info row 1 
                    elif i == 2:
                        all_info_text = tr.text

                        tmp = re.search(r'(?<=Boil Size: )[0-9]+(\.[0-9]+)?', all_info_text)
                        if tmp != None:
                            beer['boil_size'] = tmp[0]
                        else:
                            beer['boil_size'] = None

                        tmp = re.search(r'(?<=Boil Time: )[0-9]+', all_info_text)
                        if tmp != None:
                            beer['boil_time'] = tmp[0]
                        else:
                            beer['boil_time'] = None

                        tmp = re.search(r'(?<=Boil Gravity: )[0-9]+(\.[0-9]+)?', all_info_text)
                        if tmp != None:
                            beer['boil_grav'] = tmp[0]
                        else:
                            beer['boil_grav'] = None

                        tmp = re.search(r'(?<=Efficiency: )[0-9]+', all_info_text)
                        if tmp != None:
                            beer['efficiency'] = tmp[0]
                        else:
                            beer['efficiency'] = None

                        tmp = re.search(r'(?<=Mash Thickness: )[0-9]+(\.[0-9]+)?', all_info_text)
                        if tmp != None:
                            beer['mash_thick'] = tmp[0]
                        else:
                            beer['mash_thick'] = None

                        tmp = re.search(r'(?<=Sugar Scale: )[A-z ]+', all_info_text)
                        if tmp != None:
                            beer['sugar_scale'] = tmp[0]
                        else:
                            beer['sugar_scale'] = None

                    #Info row 2
                    elif i == 3:
                        all_info_text = tr.text

                        tmp = re.search(r'(?<=Pitch Rate: )[0-9]+(\.[0-9]+)?', all_info_text)
                        if tmp != None:
                            beer['pitch_rate'] = tmp[0]
                        else:
                            beer['pitch_rate'] = None

                        tmp = re.search(r'(?<=Primary Temp: )[A-z0-9°\/\. ]+', all_info_text)
                        if (tmp[0].strip() != 'N/A') & (tmp != None):
                            beer['temp_prim'] = tmp[0].split('°')[0]
                            beer['temp_unit'] = tmp[0].split('°')[1]
                        else:
                            beer['temp_prim'] = None
                            beer['temp_unit'] = None

                        tmp = re.search(r'(?<=Priming Method: )[A-z\/\. ]+', all_info_text)
                        if (tmp != None):
                            if (tmp[0].strip() != 'N/A'):
                                beer['prim_method'] = tmp[0]
                            else:
                                beer['prim_method'] = None
                        else:
                            beer['prim_method'] = None

                        tmp = re.search(r'(?<=Priming Amount: )[A-z0-9\.\,\/\~ ]+', all_info_text)
                        if (tmp != None):
                            if (tmp[0].strip() != 'N/A'):
                                beer['prim_amount'] = tmp[0].split(' ')[0]
                                if len(tmp[0].split(' ')) > 1:
                                    beer['prim_amount_unit'] = tmp[0].split(' ')[1]
                                else:
                                    beer['prim_amount_unit'] = None
                            else:
                                beer['prim_amount'] = None
                                beer['prim_amount_unit'] = None
                        else:
                            beer['prim_amount'] = None
                            beer['prim_amount_unit'] = None

                        tmp = re.search(r'(?<=Creation Date: )[A-z0-9\/: ]+', all_info_text)
                        if tmp != None:
                            beer['creation_date'] = tmp[0]
                        else:
                            beer['creation_date'] = None


                beer_df = beer_df.append(beer,  verify_integrity=True, sort=False)
                beers_collected += 1
                idx += 1
                
    beer_df.to_csv('./data/beer_basic_data.csv')
    page_num += 1
    print('Scraping page:', page_num, '\t Beers collected:', beers_collected)
    beers_collected = 0

beer_df

Scraping page: 2 	 Beers collected: 20
Scraping page: 3 	 Beers collected: 20
Scraping page: 4 	 Beers collected: 20
Scraping page: 5 	 Beers collected: 20
Scraping page: 6 	 Beers collected: 20
Scraping page: 7 	 Beers collected: 20
Scraping page: 8 	 Beers collected: 20
Scraping page: 9 	 Beers collected: 20
Scraping page: 10 	 Beers collected: 20
Scraping page: 11 	 Beers collected: 20
Scraping page: 12 	 Beers collected: 20
Scraping page: 13 	 Beers collected: 20
Scraping page: 14 	 Beers collected: 20
Scraping page: 15 	 Beers collected: 20
Scraping page: 16 	 Beers collected: 20
Scraping page: 17 	 Beers collected: 20
Scraping page: 18 	 Beers collected: 20
Scraping page: 19 	 Beers collected: 20
Scraping page: 20 	 Beers collected: 20
Scraping page: 21 	 Beers collected: 20
Scraping page: 22 	 Beers collected: 20
Scraping page: 23 	 Beers collected: 20
Scraping page: 24 	 Beers collected: 20
Scraping page: 25 	 Beers collected: 20
Scraping page: 26 	 Beers collected: 20
Scraping

Scraping page: 204 	 Beers collected: 20
Scraping page: 205 	 Beers collected: 20
Scraping page: 206 	 Beers collected: 20
Scraping page: 207 	 Beers collected: 20
Scraping page: 208 	 Beers collected: 20
Scraping page: 209 	 Beers collected: 20
Scraping page: 210 	 Beers collected: 20
Scraping page: 211 	 Beers collected: 20
Scraping page: 212 	 Beers collected: 20
Scraping page: 213 	 Beers collected: 20
Scraping page: 214 	 Beers collected: 20
Scraping page: 215 	 Beers collected: 20
Scraping page: 216 	 Beers collected: 20
Scraping page: 217 	 Beers collected: 20
Scraping page: 218 	 Beers collected: 20
Scraping page: 219 	 Beers collected: 20
Scraping page: 220 	 Beers collected: 20
Scraping page: 221 	 Beers collected: 20
Scraping page: 222 	 Beers collected: 20
Scraping page: 223 	 Beers collected: 20
Scraping page: 224 	 Beers collected: 20
Scraping page: 225 	 Beers collected: 20
Scraping page: 226 	 Beers collected: 20
Scraping page: 227 	 Beers collected: 20
Scraping page: 2

Unnamed: 0,title,style,size,ABV,IBU,OG,FG,color,method,views,brewed,rating,size_unit,author,author_name,boil_size,boil_time,boil_grav,pitch_rate,temp_prim,temp_unit,prim_method,prim_amount,prim_amount_unit,creation_date,url_beer,url_style,url_author,efficiency,mash_thick,sugar_scale
0,Sierra Nevada Pale Ale Clone,American Pale Ale,6.5,5.58,40.12,1.055,1.013,8,All Grain,193152,481,4.73,Gal,5889,Ozarks Mountain Brew,7.75,70,1.047,1.0,,,,,,12/27/2012 09:03 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,79,,Specific Gravity
1,Avg. Perfect Northeast IPA (NEIPA),Specialty IPA: New England IPA,5.75,6.5,50.6,1.062,1.013,5,All Grain,197026,224,4.66,Gal,49801,kcq101,7.5,60,1.048,,,,Keg,,,05/31/2016 02:52 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/specialty...,https://www.brewersfriend.com/homebrew/brewer/...,70,1.25,Specific Gravity
2,Zombie Dust Clone - ALL GRAIN,American IPA,6,5.94,54.57,1.061,1.016,8,All Grain,177768,169,4.57,Gal,,Players Only Brewing,7,60,1.053,,,,,,,03/29/2012 02:27 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,,70,,Specific Gravity
3,Zombie Dust Clone - EXTRACT,American IPA,5,6.16,64.77,1.063,1.016,8,Extract,170363,145,4.57,Gal,,Players Only Brewing,6,60,1.052,,,,,,,03/29/2012 02:52 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,,70,,Specific Gravity
4,Simple Citra IPA - One Gallon,American IPA,1,6.1,37.06,1.058,1.011,11,All Grain,36061,103,5.00,Gal,45128,enrique,1.5,60,1.039,0.35,65,F,,,,04/02/2015 06:03 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,55,1.5,Specific Gravity
5,Vanilla Cream Ale,Cream Ale,5.75,5.48,17.65,1.055,1.013,4,All Grain,235906,86,4.70,Gal,116,stikks,7.5,75,1.038,,64,F,corn sugar,4.5,oz,01/31/2012 01:35 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/cream-ale/,https://www.brewersfriend.com/homebrew/brewer/116,70,,Specific Gravity
6,Southern Tier Pumking clone,Holiday/Winter Special Spiced Beer,5.5,8.16,60.65,1.083,1.021,15,All Grain,168781,77,4.58,Gal,955,mackeydj,6.5,60,1.07,,,,,,,08/20/2012 03:48 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/holiday-w...,https://www.brewersfriend.com/homebrew/brewer/955,70,,Specific Gravity
7,Great Lakes Christmas Ale Clone,Winter Seasonal Beer,6,7.31,28.98,1.072,1.017,10,BIAB,20313,54,5.00,Gal,39433,Cameron,8.11,60,1.053,0.35,70,F,,,,09/30/2015 08:44 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/winter-se...,https://www.brewersfriend.com/homebrew/brewer/...,70,,Specific Gravity
8,Bells two hearted clone,American IPA,5.5,7.14,61.83,1.073,1.018,8,All Grain,148134,53,3.33,Gal,955,mackeydj,6.5,60,1.061,0.75,65,F,,,,03/02/2012 02:44 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/955,75,,Specific Gravity
9,Goose Island IPA Clone,English IPA,5.5,6.63,55.52,1.065,1.015,9,All Grain,55877,50,5.00,Gal,5889,Ozarks Mountain Brew,7.5,60,1.048,,64,F,,,,12/25/2012 03:04 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/english-ipa/,https://www.brewersfriend.com/homebrew/brewer/...,75,1.5,Specific Gravity


In [167]:
tmp

In [171]:
row

<tr id="expand382205" style="display:none;overflow:hidden;"><td colspan="12" style="border:1px solid #eeefff;"> <table bgcolor="#eeefff" border="0" cellspacing="1" height="100" style="margin:0px;border-radius: 2px;" width="100%">
<tr>
<td bgcolor="white" colspan="6" style="vertical-align: top;" valign="top">
<b>Title:</b> Munchen Grosses Bier - Jamil	  </td>
</tr>
<tr>
<td bgcolor="white" colspan="6" style="vertical-align: top;" valign="top">
<table cellpadding="0" cellspacing="0" style="margin:0px;width:1px;">
<tr>
<td align="left" valign="middle">
<b>Author:</b>
</td>
<td style="white-space: nowrap;" valign="middle">
<a href="https://www.brewersfriend.com/homebrew/brewer/45056"><font style="font-weight:700;color:#777;font-size:13px;line-height:1;">TZ</font></a> </td>
</tr>
</table>
</td>
</tr>
<tr>
<td bgcolor="white"><b>Boil Size:</b> 31.2  L</td>
<td bgcolor="white"><b>Boil Time:</b> 90</td>
<td bgcolor="white"><b>Boil Gravity:</b> 1.04</td>
<td bgcolor="white"><b>Efficiency:</b> 6

In [78]:
title = header[0]
print(title)
title == 'Title '

Title 


False

In [73]:
tmp = 'asd'
tmp == 'asd'

True