In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup
import re

from tqdm import tqdm_notebook as tqdm

In [2]:
# Show up to 500 columns when a DataFrame is displayed, so wide frames are not elided.
pd.options.display.max_columns = 500

In [21]:
# Base URL of the site; the scraper cells below build their request URLs from it.
url_home = 'https://www.brewersfriend.com'

page_num = 1

# Exploratory request: fetch a single recipe page and print its "description"
# block(s) to see which fields the detail scraper can read.
recipe_url = url_home + '/homebrew/recipe/view/290559/hopped-and-loaded-16-bbl'
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(recipe_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
soup = BeautifulSoup(r.text, 'html.parser')
print(soup.find_all('div', {'class': 'description'}))

[<div class="description" style="overflow: hidden;">
<span class="viewStats">
<span class="firstLabel">Method:</span>
<strong>All Grain</strong>
</span>
<br/>
<span class="viewStats">
<span class="firstLabel">Style:</span>
<strong><span itemprop="recipeCategory">
<a href="https://www.brewersfriend.com/styles/american-ipa-pre-2015/">American IPA</a> </span></strong>
</span>
<br/>
<span class="viewStats">
<span class="firstLabel">Boil Time:</span>
<strong>75 min</strong>
</span>
<br/>
<span class="viewStats">
<span class="firstLabel">Batch Size:</span>
<strong> <span itemprop="recipeYield">496 gallons</span> </strong>
<span style="font-size: 0.9em; font-style: italic;">
						(fermentor volume)					</span>
</span>
<br/>
<span class="viewStats">
<span class="firstLabel">Pre Boil Size:</span>
<strong>531 gallons </strong>
</span>
<br/>
<span class="viewStats">
<span class="firstLabel">Post Boil Size:</span>
<strong>504.4 gallons </strong>
</span>
<br/>
<span class="viewStats">
<span class=

# Basic data scraper

In [4]:
# Columns filled from the cells of a recipe row in the listing table; the
# scraper maps the i-th cell of a row to the i-th name in this list.
header = [
    'title', 'style', 'size', 'ABV', 'IBU', 'OG',
    'FG', 'color', 'method', 'views', 'brewed', 'rating',
]

# Columns filled from the size cell and from the expandable detail row.
additional_info = [
    'size_unit',
    'author', 'author_name',
    'boil_size', 'boil_time', 'boil_grav',
    'pitch_rate',
    'temp_prim', 'temp_unit',
    'prim_method', 'prim_amount', 'prim_amount_unit',
    'creation_date',
]

# Link columns, followed by three more detail-row fields. The order is kept
# as-is because it determines the column order of the saved CSV.
url_columns = [
    'url_beer', 'url_style', 'url_author',
    'efficiency', 'mash_thick', 'sugar_scale',
]

# Full column layout of the basic-data frame.
columns = [*header, *additional_info, *url_columns]

In [18]:
MAX_PAGES = 500        # upper bound on the number of listing pages to visit
MIN_BREWED = 2         # scraping stops at the first recipe brewed fewer times than this
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection hangs the cell
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}

# (column, pattern) pairs for the first info row of the expanded details.
# Letter classes are spelled [A-Za-z]: the range [A-z] also matches [ \ ] ^ _ `.
INFO_ROW_1_PATTERNS = [
    ('boil_size',   r'(?<=Boil Size: )[0-9]+(\.[0-9]+)?'),
    ('boil_time',   r'(?<=Boil Time: )[0-9]+'),
    ('boil_grav',   r'(?<=Boil Gravity: )[0-9]+(\.[0-9]+)?'),
    ('efficiency',  r'(?<=Efficiency: )[0-9]+'),
    ('mash_thick',  r'(?<=Mash Thickness: )[0-9]+(\.[0-9]+)?'),
    ('sugar_scale', r'(?<=Sugar Scale: )[A-Za-z ]+'),
]


def _first_match(pattern, text):
    """Return the text of the first match of `pattern` in `text`, or None if there is none."""
    found = re.search(pattern, text)
    return found[0] if found is not None else None


def _none_if_not_available(value):
    """Return None for a missing value or the 'N/A' placeholder, otherwise `value` unchanged."""
    if value is None or value.strip() == 'N/A':
        return None
    return value


def _parse_summary_row(row, idx):
    """Build a one-row DataFrame (index `idx`) from a recipe row of the listing table.

    The i-th <td> of `row` is stored under the i-th name in `header`; some
    columns are cleaned up further and the recipe/style links are recorded.
    """
    beer = pd.DataFrame(index=[idx], columns=columns)
    # zip() stops at the shorter input, so a row with extra cells cannot index past `header`.
    for feature, cell in zip(header, row.find_all('td')):
        beer[feature] = cell.text

        if feature == 'title':
            # The recipe link is the second anchor in the title cell.
            beer['url_beer'] = url_home + cell.find_all('a', href=True)[1]['href']

        elif feature == 'style':
            beer['url_style'] = url_home + cell.find_all('a', href=True)[0]['href']

        elif feature == 'size':
            # Split the cell text into a number and a unit.
            beer['size'] = _first_match(r'[0-9]+(\.[0-9]+)?', cell.text)
            beer['size_unit'] = _first_match(r'[A-Za-z]+', cell.text)

        elif feature == 'ABV':
            beer['ABV'] = cell.text.replace('%', '')

        elif feature == 'rating':
            # Integer part: number of 'active' spans.
            rating = len(cell.find_all('span', {'class': 'active'}))
            # Fractional part: the number in the 'style' attribute of the 'last' span, as hundredths.
            if rating != 5:
                last_star = cell.find('span', {'class': 'last'})
                if last_star is not None:
                    fraction = _first_match(r'[0-9]+', last_star.get('style', ''))
                    if fraction is not None:
                        rating += int(fraction) / 100
            beer['rating'] = rating

        elif feature == 'color':
            beer['color'] = _first_match(r'[0-9]+(\.[0-9]+)?', cell.text)

    return beer


def _parse_detail_row(beer, row):
    """Fill `beer` in place from the expandable detail row that follows a recipe row."""
    info_table = row.find_all('table')[0]

    for i, tr in enumerate(info_table.findChildren('tr', recursive=False)):

        # Title
        if i == 0:
            beer['title'] = tr.findChild('td').text.replace('Title:', '').strip()

        # Author: name from the first <font>, profile link from the first <a>, id from the link.
        elif i == 1:
            author_name = url_author = author = None
            fonts = tr.find_all('font')
            if fonts:
                author_name = fonts[0].text
                links = tr.find_all('a', href=True)
                if links:
                    url_author = links[0]['href']
                    author = _first_match(r'(?<=brewer\/)[0-9]+', url_author)
            beer['author_name'] = author_name
            beer['url_author'] = url_author
            beer['author'] = author

        # Info row 1
        elif i == 2:
            all_info_text = tr.text
            for column, pattern in INFO_ROW_1_PATTERNS:
                beer[column] = _first_match(pattern, all_info_text)

        # Info row 2
        elif i == 3:
            all_info_text = tr.text

            beer['pitch_rate'] = _first_match(r'(?<=Pitch Rate: )[0-9]+(\.[0-9]+)?', all_info_text)

            # The "no match" case is checked before the text is used; the
            # original test read tmp[0] first and raised on a missing field.
            temp = _none_if_not_available(
                _first_match(r'(?<=Primary Temp: )[A-Za-z0-9°\/\. ]+', all_info_text))
            temp_parts = temp.split('°') if temp is not None else []
            beer['temp_prim'] = temp_parts[0] if temp_parts else None
            beer['temp_unit'] = temp_parts[1] if len(temp_parts) > 1 else None

            beer['prim_method'] = _none_if_not_available(
                _first_match(r'(?<=Priming Method: )[A-Za-z\/\. ]+', all_info_text))

            amount = _none_if_not_available(
                _first_match(r'(?<=Priming Amount: )[A-Za-z0-9\.\,\/\~ ]+', all_info_text))
            amount_parts = amount.split(' ') if amount is not None else []
            beer['prim_amount'] = amount_parts[0] if amount_parts else None
            beer['prim_amount_unit'] = amount_parts[1] if len(amount_parts) > 1 else None

            beer['creation_date'] = _first_match(r'(?<=Creation Date: )[A-Za-z0-9\/: ]+', all_info_text)


# Walk the recipe listing page by page. Each recipe is a summary row followed
# by an expandable detail row; the pair becomes one row of `beer_df`. The
# listing is requested with sort=breweddown, so scraping stops at the first
# recipe brewed fewer than MIN_BREWED times. `url_home`, `header` and `columns`
# come from earlier cells.
beer_df = pd.DataFrame(columns = columns)
idx = 0
beer = None  # most recent summary row; completed by the detail row that follows it

page_num = 1
beers_collected = 0
while page_num <= MAX_PAGES:

    r = requests.get(url_home + '/homebrew-recipes/page/' + str(page_num) + '/?sort=breweddown',
                     headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, 'html.parser')
    beers_table = soup.find('table')
    if beers_table is None:
        # No listing on this page (error page or past the last page): stop cleanly.
        print('No recipe table found on page', page_num, '- stopping.')
        break

    page_rows = []
    for row in beers_table.find_all('tr'):
        row_id = row.get('id')

        if row_id is None:
            # Summary rows carry no id and one of the two alternating background colours.
            if row.get('bgcolor') in ('#fcfcfc', '#eeeeee'):
                beer = _parse_summary_row(row, idx)
            continue

        if beer is None or not re.search(r'expand.*', row_id):
            continue

        _parse_detail_row(beer, row)

        # .iloc[0] reads the single cell explicitly; int() on a Series is deprecated.
        if int(beer['brewed'].iloc[0]) < MIN_BREWED:
            break

        page_rows.append(beer)
        beers_collected += 1
        idx += 1

    # One concat per page replaces the row-by-row DataFrame.append, which is
    # quadratic and no longer exists in pandas 2.
    if page_rows:
        frames = page_rows if beer_df.empty else [beer_df] + page_rows
        beer_df = pd.concat(frames, verify_integrity=True, sort=False)

    # Checkpoint after every page so an interrupted run keeps what it has collected.
    beer_df.to_csv('./data/beer_basic_data.csv')
    page_num += 1
    # NOTE: page_num has already been advanced, so this line shows the next
    # page number next to the count for the page just scraped.
    print('Scraping page:', page_num, '\t Beers collected:', beers_collected)
    beers_collected = 0

    if beer is not None and int(beer['brewed'].iloc[0]) < MIN_BREWED:
        break

beer_df

Scraping page: 2 	 Beers collected: 20
Scraping page: 3 	 Beers collected: 20
Scraping page: 4 	 Beers collected: 20
Scraping page: 5 	 Beers collected: 20
Scraping page: 6 	 Beers collected: 20
Scraping page: 7 	 Beers collected: 20
Scraping page: 8 	 Beers collected: 20
Scraping page: 9 	 Beers collected: 20
Scraping page: 10 	 Beers collected: 20
Scraping page: 11 	 Beers collected: 20
Scraping page: 12 	 Beers collected: 20
Scraping page: 13 	 Beers collected: 20
Scraping page: 14 	 Beers collected: 20
Scraping page: 15 	 Beers collected: 20
Scraping page: 16 	 Beers collected: 20
Scraping page: 17 	 Beers collected: 20
Scraping page: 18 	 Beers collected: 20
Scraping page: 19 	 Beers collected: 20
Scraping page: 20 	 Beers collected: 20
Scraping page: 21 	 Beers collected: 20
Scraping page: 22 	 Beers collected: 20
Scraping page: 23 	 Beers collected: 20
Scraping page: 24 	 Beers collected: 20
Scraping page: 25 	 Beers collected: 20
Scraping page: 26 	 Beers collected: 20
Scraping

Scraping page: 205 	 Beers collected: 20
Scraping page: 206 	 Beers collected: 20
Scraping page: 207 	 Beers collected: 20
Scraping page: 208 	 Beers collected: 20
Scraping page: 209 	 Beers collected: 20
Scraping page: 210 	 Beers collected: 20
Scraping page: 211 	 Beers collected: 20
Scraping page: 212 	 Beers collected: 20
Scraping page: 213 	 Beers collected: 20
Scraping page: 214 	 Beers collected: 20
Scraping page: 215 	 Beers collected: 20
Scraping page: 216 	 Beers collected: 20
Scraping page: 217 	 Beers collected: 20
Scraping page: 218 	 Beers collected: 20
Scraping page: 219 	 Beers collected: 20
Scraping page: 220 	 Beers collected: 20
Scraping page: 221 	 Beers collected: 20
Scraping page: 222 	 Beers collected: 20
Scraping page: 223 	 Beers collected: 20
Scraping page: 224 	 Beers collected: 20
Scraping page: 225 	 Beers collected: 20
Scraping page: 226 	 Beers collected: 20
Scraping page: 227 	 Beers collected: 20
Scraping page: 228 	 Beers collected: 20
Scraping page: 2

Unnamed: 0,title,style,size,ABV,IBU,OG,FG,color,method,views,brewed,rating,size_unit,author,author_name,boil_size,boil_time,boil_grav,pitch_rate,temp_prim,temp_unit,prim_method,prim_amount,prim_amount_unit,creation_date,url_beer,url_style,url_author,efficiency,mash_thick,sugar_scale
0,Sierra Nevada Pale Ale Clone,American Pale Ale,6.5,5.58,40.12,1.055,1.013,8,All Grain,224412,556,4.79,Gal,5889,Ozarks Mountain Brew,7.75,70,1.047,1.0,,,,,,12/27/2012 09:03 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,79,,Specific Gravity
1,Avg. Perfect Northeast IPA (NEIPA),Specialty IPA: New England IPA,5.75,6.5,50.6,1.062,1.013,5.2,All Grain,231660,280,4.65,Gal,49801,kcq101,7.5,60,1.048,,,,Keg,,,05/31/2016 02:52 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/specialty...,https://www.brewersfriend.com/homebrew/brewer/...,70,1.25,Specific Gravity
2,Zombie Dust Clone - ALL GRAIN,American IPA,6,5.94,54.57,1.061,1.016,8.5,All Grain,191290,191,4.66,Gal,,Players Only Brewing,7,60,1.053,,,,,,,03/29/2012 02:27 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,,70,,Specific Gravity
3,Zombie Dust Clone - EXTRACT,American IPA,5,6.16,64.77,1.063,1.016,8.98,Extract,176309,155,4.62,Gal,,Players Only Brewing,6,60,1.052,,,,,,,03/29/2012 02:52 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,,70,,Specific Gravity
4,Simple Citra IPA - One Gallon,American IPA,1,6.1,37.06,1.058,1.011,11.45,All Grain,44521,123,4.79,Gal,45128,enrique,1.5,60,1.039,0.35,65,F,,,,04/02/2015 06:03 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,55,1.5,Specific Gravity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7174,American Pale Ale FWH,American Pale Ale,2.5,5.65,49.37,1.056,1.013,9.93,All Grain,1984,2,0.00,Gal,217,grosbeak,4,75,,,,,,,,01/03/2012 03:23 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/217,70,,Specific Gravity
7175,Pliney and Friends,Imperial IPA,5.5,8.34,149.04,1.077,1.014,7.66,BIAB,1478,2,0.00,Gal,,buckeye2011,7,75,,,,,,,,12/29/2011 04:53 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/imperial-...,,75,,Specific Gravity
7176,Conrad's Oatmeal Milk Stout,Sweet Stout,6,5.26,33.33,1.054,1.014,39.33,All Grain,2065,2,0.00,Gal,,BRNZ42,7.25,60,,,,,,,,12/26/2011 08:45 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/sweet-stout/,,65,,Specific Gravity
7177,Double IPA,Imperial IPA,5.5,10.52,100.28,1.093,1.019,7.53,All Grain,3172,2,0.00,Gal,,turnerbrau,8,120,1.064,,68,F,,,,12/15/2011 04:00 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/imperial-...,,69,1.2,Specific Gravity


In [17]:
# Show the recipes whose boil size is above 500, comparing the values as floats.
beer_df.loc[beer_df['boil_size'].astype(float) > 500]

Unnamed: 0,title,style,size,ABV,IBU,OG,FG,color,method,views,brewed,rating,size_unit,author,author_name,boil_size,boil_time,boil_grav,pitch_rate,temp_prim,temp_unit,prim_method,prim_amount,prim_amount_unit,creation_date,url_beer,url_style,url_author,efficiency,mash_thick,sugar_scale
15,Hopped and Loaded 16 BBL,American IPA,496,7.22,113.97,16.408,3.12005,6.57,All Grain,7629,47,5.0,Gal,52796,105WestBrewing,531,75,15.4,0.75,68.0,F,,,,10/27/2015 07:07 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/american-...,https://www.brewersfriend.com/homebrew/brewer/...,80,1.3,Plato
82,American Blonde 17 BBL,Blonde Ale,527,4.75,22.62,11.1357,2.19826,4.44,All Grain,2096,14,0.0,Gal,52796,105WestBrewing,551,60,10.7,0.75,68.0,F,,,,10/11/2015 11:17 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/blonde-ale/,https://www.brewersfriend.com/homebrew/brewer/...,86,1.5,Plato
97,UbikEklektik herbal IPA - Guri,English IPA,813,6.5,74.4,14.5895,2.89606,5.94,All Grain,955,13,0.0,L,14075,H�bris,915,70,13.0,1.0,20.0,C,,,,10/09/2015 12:03 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/english-ipa/,https://www.brewersfriend.com/homebrew/brewer/...,74,,Plato
186,Tasty Bev,No Profile Selected,600,4.88,33.93,1.049,1.012,5.0,All Grain,36,9,0.0,Gal,240069,Nkauffman,630,60,1.046,0.35,,,,,,09/30/2019 11:39 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/no-profil...,https://www.brewersfriend.com/homebrew/brewer/...,79,1.33,Specific Gravity
401,Bressane blanche,Weizen/Weissbier,2200,4.97,15.03,12.9468,3.69622,3.36,All Grain,571,6,0.0,L,73073,Chimayx,2600,60,12.0,0.35,21.0,C,,,,07/31/2017 09:26 AM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/weizen-we...,https://www.brewersfriend.com/homebrew/brewer/...,78,3.5,Plato
420,Oktoberfest 17 BBL,Märzen,527,5.72,30.97,13.4784,2.81747,9.91,All Grain,798,6,0.0,Gal,52796,105WestBrewing,562,90,12.7,1.5,52.0,F,,,,08/16/2016 05:11 PM,https://www.brewersfriend.com/homebrew/recipe/...,https://www.brewersfriend.com/styles/marzen/,https://www.brewersfriend.com/homebrew/brewer/...,85,1.4,Plato


# Detail scraper

In [None]:
# Columns collected by the detail scraper.
# NOTE: this rebinds `columns` and `beer_df`, names the basic scraper above also uses.
columns = [
    'reviews',
    'calories', 'calories_unit',
    'carbs', 'carbs_unit',
    'mash_ph',
]
# Recipe URLs saved by the basic scraper, keyed by the same index as the basic data.
urls = pd.read_csv('data/beer_basic_data.csv', index_col='Unnamed: 0')['url_beer']
# Empty frame with one row per recipe; the detail scraper fills it in.
beer_df = pd.DataFrame(columns=columns, index=urls.index)

In [92]:
# Indices of recipes whose detail row is still entirely empty (not scraped yet).
set(urls.index).difference(beer_df.index[~beer_df.isna().all(axis=1)])

{1340,
 1341,
 1342,
 1343,
 1344,
 1345,
 1346,
 1347,
 1348,
 1349,
 1350,
 1351,
 1352,
 1353,
 1354,
 1355,
 1356,
 1357,
 1358,
 1359,
 1360,
 1361,
 1362,
 1363,
 1364,
 1365,
 1366,
 1367,
 1368,
 1369,
 1370,
 1371,
 1372,
 1373,
 1374,
 1375,
 1376,
 1377,
 1378,
 1379,
 1380,
 1381,
 1382,
 1383,
 1384,
 1385,
 1386,
 1387,
 1388,
 1389,
 1390,
 1391,
 1392,
 1393,
 1394,
 1395,
 1396,
 1397,
 1398,
 1399,
 1400,
 1401,
 1402,
 1403,
 1404,
 1405,
 1406,
 1407,
 1408,
 1409,
 1410,
 1411,
 1412,
 1413,
 1414,
 1415,
 1416,
 1417,
 1418,
 1419,
 1420,
 1421,
 1422,
 1423,
 1424,
 1425,
 1426,
 1427,
 1428,
 1429,
 1430,
 1431,
 1432,
 1433,
 1434,
 1435,
 1436,
 1437,
 1438,
 1439,
 1440,
 1441,
 1442,
 1443,
 1444,
 1445,
 1446,
 1447,
 1448,
 1449,
 1450,
 1451,
 1452,
 1453,
 1454,
 1455,
 1456,
 1457,
 1458,
 1459,
 1460,
 1461,
 1462,
 1463,
 1464,
 1465,
 1466,
 1467,
 1468,
 1469,
 1470,
 1471,
 1472,
 1473,
 1474,
 1475,
 1476,
 1477,
 1478,
 1479,
 1480,
 1481,
 1482,

In [94]:
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection would hang the loop

# Label shown on the recipe page -> name of the value column it fills
# (the matching unit goes to '<name>_unit').
STAT_COLUMNS = {'Calories:': 'calories', 'Carbs:': 'carbs'}

# Only visit recipes whose detail row is still completely empty, so re-running
# this cell in the same kernel continues where an interrupted run stopped.
indices = set(urls.index) - set(beer_df[beer_df.isna().all(axis=1) == False].index)
# sorted() gives a reproducible visiting order; iterating the set directly does not.
for idx in tqdm(sorted(indices)):

    url = urls.loc[idx]
    r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(r.text, 'html.parser')

    # description
    description = soup.find('div', {'class': 'description'})
    if description is None:
        # Page without a description block: leave the row empty so a later run retries it.
        continue

    for stats in description.find_all('span', {'class': 'viewStats'}):
        label_tag = stats.find('span', {'class': 'firstLabel'})
        value_tag = stats.find('strong')
        if label_tag is None or value_tag is None:
            continue
        column = STAT_COLUMNS.get(label_tag.text.strip())
        if column is None:
            continue
        # Whitespace split: tolerates padding around the text and a missing unit.
        parts = value_tag.text.split()
        beer_df.loc[idx, column] = parts[0] if parts else None
        beer_df.loc[idx, column + '_unit'] = parts[1] if len(parts) > 1 else None

    ## review count (0 when the page has no reviewCount element)
    review_tag = description.find('span', {'itemprop': 'reviewCount'})
    beer_df.loc[idx, 'reviews'] = review_tag.text if review_tag is not None else 0

    # mash ph (left empty when the page has no such element)
    ph_tag = soup.find('div', {'class': 'value phMin'})
    if ph_tag is not None:
        beer_df.loc[idx, 'mash_ph'] = ph_tag.text.replace('\t','').replace('\n','')

    # Checkpoint after every recipe so an interrupted run keeps what it has collected.
    beer_df.to_csv('./data/beer_detail_info.csv')

beer_df

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=5839), HTML(value='')))




Unnamed: 0,reviews,calories,calories_unit,carbs,carbs_unit,mash_ph
0,25,182,calories,18,g,5.67
1,20,204,calories,19,g,5.49
2,9,203,calories,22,g,5.81
3,8,208,calories,22,g,5.41
4,5,190,calories,17,g,
...,...,...,...,...,...,...
7174,0,185,calories,18,g,
7175,0,254,calories,22,g,
7176,0,178,calories,19,g,
7177,0,308,calories,28,g,


In [78]:
# Scratch check: compare the first entry of `header` with the literal 'Title '
# (note the trailing space).
# NOTE(review): `header` as defined in this notebook starts with 'title', so the
# saved output of this cell presumably comes from an earlier version of
# `header` — confirm, or remove this leftover debugging cell.
title = header[0]
print(title)
title == 'Title '

Title 


False

In [73]:
# Scratch check of string equality on a throwaway value.
# NOTE(review): looks like leftover debugging — consider removing this cell.
tmp = 'asd'
tmp == 'asd'

True