In [None]:
# last beer: https://www.polskikraft.pl/piwo/9740
# last brewery: https://www.polskikraft.pl/browar/411
# last user: https://www.polskikraft.pl/profil/28348

# imports

In [2]:
import os
# Folder holding the scraped zip archives; the exported CSV files are copied back here.
# NOTE(review): the path looks like a Google Drive mount in Colab — confirm before running elsewhere.
dumpdir = '/content/drive/MyDrive/datasets/PolskiKraft'

In [3]:
import pandas as pd
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

In [9]:
# Copy every zip archive from the dump folder into the working directory,
# then quietly unpack the beer ("piwa") and brewery ("browary") archives.
# Later cells read 'piwa/<id>.html' and 'browary/<id>.html', so the archives
# presumably unpack into those two folders — TODO confirm.
!cp {dumpdir}/*.zip .
!unzip -q piwa.zip
!unzip -q browary.zip

# Extracting the data

## Extractors (defining the functions)

### Brewery

In [12]:
# Load one sample brewery page (id 411) to try the brewery extractors on.
# The encoding is passed explicitly so the Polish diacritics on the page decode
# the same way regardless of the platform's default locale.
# NOTE(review): UTF-8 is assumed from the correctly decoded sample outputs — confirm.
with open('browary/411.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

In [10]:
def extract_brewery_beers_from_brewery_html(html_content):
    """List the beers shown on a brewery page.

    Args:
        html_content: string containing the HTML of a brewery page.

    Returns:
        A list with one dict per beer entry, each with the keys
        'brewery_name', 'name', 'rating' (float), 'type' and
        'thumbnail_link'. An empty list when the page has no beer entries.

    Raises:
        IndexError / AttributeError / ValueError: when the page markup does
        not have the expected structure or a rating is not a number.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    beer_entries = soup.find_all("div", {"class": "pk-search-entry-link"})
    if not beer_entries:
        return []

    # The brewery name comes from the page breadcrumb (second <li>) and is the
    # same for every beer, so it is looked up once instead of once per entry.
    breadcrumb = soup.find_all("ol", {"class": "breadcrumb pk-breadcrumb"})[0]
    brewery_name = breadcrumb.find_all('li')[1].text

    results = []
    for html_entry in beer_entries:
        # Children of the entry row are addressed by position: index 1 is the
        # thumbnail, 3 the beer name and 6 the beer type. This is tied to the
        # exact markup of the scraped pages.
        row_children = html_entry.find_all("div", {"class": "row pk-search-entry"})[0].contents
        results.append({
            'brewery_name': brewery_name,
            'name': row_children[3].text,
            'rating': float(html_entry.find_all("div", {"class": "row rating"})[0].text),
            'type': row_children[6].text,
            'thumbnail_link': row_children[1].get('src'),
        })
    return results

In [13]:
# Preview the beers extracted from the sample brewery page as a table.
pd.DataFrame(extract_brewery_beers_from_brewery_html(html_content))

Unnamed: 0,brewery_name,name,rating,type,thumbnail_link
0,Monkey Browar,Wróg U Bram,3.6,American Pale Ale,https://www.polskikraft.pl/img/photos/16019238...
1,Monkey Browar,Born To Be Hoppy,4.1,DDH DIPA,https://www.polskikraft.pl/img/photos/16019237...
2,Monkey Browar,Born To Be Hoppy,4.4,DDH Double IPA,https://www.polskikraft.pl/img/photos/16019229...


In [14]:
def extract_brewery_rating_from_brewery_html(html_content):
    """Extract the brewery summary tile from a brewery page.

    Args:
        html_content: string containing the HTML of a brewery page.

    Returns:
        A dict with the keys 'name', 'beers' (text such as '3 piwa'),
        'rating' (float) and 'logo_link'.

    Raises:
        IndexError / AttributeError / ValueError: when the summary tile or
        one of its parts is missing, or the rating is not a number.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Everything is read from the first photo tile; it is located once and
    # reused instead of searching the whole document again for the logo.
    the_div = soup.find_all("div", {"class": "panel-body pk-panel-photo-tile"})[0]
    results = {
        # On brewery pages the "beer-name" paragraph carries the brewery name
        # and the "brewery-name" paragraph carries the beer count.
        'name': the_div.find("p", {"class": "beer-name"}).text,
        'beers': the_div.find("p", {"class": "brewery-name"}).text,
        'rating': float(the_div.find("span", {"class": "rating"}).text),
        'logo_link': the_div.find('img').get('src'),
    }
    return results

In [15]:
# Show the summary extracted from the sample brewery page.
extract_brewery_rating_from_brewery_html(html_content)

{'beers': '3 piwa',
 'logo_link': 'https://www.polskikraft.pl/img/photos/1601922522.jpg',
 'name': 'Monkey Browar',
 'rating': 4.0}

### Beers

In [47]:
# Load one sample beer page (id 9740) to try the beer extractors on.
# This rebinds `html_content`, which previously held the sample brewery page.
# The encoding is passed explicitly so the Polish diacritics on the page decode
# the same way regardless of the platform's default locale.
# NOTE(review): UTF-8 is assumed from the correctly decoded sample outputs — confirm.
with open('piwa/9740.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

In [58]:
def extract_similar_beers_from_beer_html_soup(soup, panel_index=4):
    """List the beers linked from the "similar beers" panel of a beer page.

    Args:
        soup: parsed beer page (an object offering ``find_all``, e.g. a
            BeautifulSoup document) — not the raw HTML string.
        panel_index: position of the similar-beers panel among the page's
            "panel-body pk-panel-tile pk-panel-tile-list" divs. Defaults to 4,
            the position this notebook has always used.

    Returns:
        A list of dicts, one per link, with the keys 'similar' (the text after
        the last underscore of the link, kept as a string) and 'similar_link'
        (the full href).

    Raises:
        IndexError: when the page has fewer list panels than ``panel_index + 1``.
    """
    list_panels = soup.find_all("div", {"class": "panel-body pk-panel-tile pk-panel-tile-list"})
    similar_div = list_panels[panel_index]
    hrefs = [link.get("href") for link in similar_div.find_all("a", {"class": "big-link"})]
    return [{'similar': href.split('_')[-1], 'similar_link': href} for href in hrefs]

def extract_similar_beers_from_beer_html(html_content):
    """Parse a beer page given as an HTML string and list its similar beers.

    in:  string containing the HTML of a beer page
    out: list of dicts, exactly as returned by
         extract_similar_beers_from_beer_html_soup for the parsed page
    """
    return extract_similar_beers_from_beer_html_soup(
        BeautifulSoup(html_content, 'html.parser')
    )

In [49]:
# Preview the similar-beer links extracted from the sample beer page as a table.
pd.DataFrame(extract_similar_beers_from_beer_html(html_content))

Unnamed: 0,similar,similar_link
0,1748,https://www.polskikraft.pl/piwo/pinta-vermont-...
1,2330,https://www.polskikraft.pl/piwo/browar-ziemia-...
2,2476,https://www.polskikraft.pl/piwo/browar-bazylis...
3,2485,https://www.polskikraft.pl/piwo/browar-deer-be...
4,2568,https://www.polskikraft.pl/piwo/piwne-podziemi...
...,...,...
294,9700,https://www.polskikraft.pl/piwo/piwojad-test-%...
295,9704,https://www.polskikraft.pl/piwo/browar-stu-mos...
296,9705,https://www.polskikraft.pl/piwo/piwojad-ddh-ne...
297,9723,https://www.polskikraft.pl/piwo/alebrowar-new-...


In [98]:
def extract_beer_data_from_beer_html_soup(soup):
    """Extract a beer's metadata from a parsed beer page.

    Args:
        soup: parsed beer page (an object offering ``find``, e.g. a
            BeautifulSoup document) — not the raw HTML string.

    Returns:
        A dict with the keys:
          'name', 'brewery', 'logo_link': strings taken from the photo tile;
          'rating': rating text with surrounding whitespace removed (left as a
              string; it is '' in the sample output for beer 9740);
          'features': {label: value} built from each "col-xs-4" cell
              (label from its <p>, value from its <h1>);
          'descriptive_features': {heading: text} built from each
              "col-xs-12 pk-details-category" cell (<h3> heading, stripped <p> text).

    Raises:
        AttributeError: when an expected element is missing from the page.
    """
    # The photo tile is located once and reused for the logo as well, instead
    # of searching the whole document a second time.
    the_div = soup.find("div", {"class": "panel-body pk-panel-photo-tile"})
    description_div = soup.find("div", {"class": "panel-body pk-panel-tile"})
    results = {
        'name': the_div.find("p", {"class": "beer-name"}).text,
        'brewery': the_div.find("p", {"class": "brewery-name"}).text,
        'rating': the_div.find("span", {"class": "rating"}).text.strip(),
        'logo_link': the_div.find('img').get('src'),
        'features': {
            cell.find("p").text: cell.find("h1").text
            for cell in description_div.find_all("div", {"class": "col-xs-4"})
        },
        'descriptive_features': {
            cell.find("h3").text: cell.find("p").text.strip()
            for cell in description_div.find_all("div", {"class": "col-xs-12 pk-details-category"})
        },
    }
    return results

def extract_beer_data_from_beer_html(html_content):
    """Parse a beer page given as an HTML string and extract its metadata.

    in:  string containing the HTML of a beer page
    out: dict of extracted properties, exactly as returned by
         extract_beer_data_from_beer_html_soup for the parsed page
    """
    return extract_beer_data_from_beer_html_soup(
        BeautifulSoup(html_content, 'html.parser')
    )

In [99]:
# Show the metadata extracted from the sample beer page.
extract_beer_data_from_beer_html(html_content)

{'brewery': 'PINTA',
 'descriptive_features': {'CHMIELE': 'Tradition (DE)',
  'DROŻDŻE': 'Safale US-05',
  'STYL': 'Lekkie Wędzone',
  'SŁODY': 'Wędzony Wayermann, Monachijski II  Weyermann, Caramunich®  II, Carafa® Special  I, Słód PINTA Orzechowy Wędzony'},
 'features': {'alc': '3', 'blg': '9', 'ibu': '-'},
 'logo_link': 'https://www.polskikraft.pl/img/photos/1442392353.jpg',
 'name': 'Podymek',
 'rating': ''}

In [61]:
def extract_user_ratings_from_beer_html_soup(soup):
    """Collect the user ratings listed on a parsed beer page.

    in:  parsed beer page (soup object offering ``find``)
    out: list of dicts, one per rating entry, with the keys 'rating', 'user',
         'date', 'user_id', 'user_profile', 'user_avatar_link', 'comment'
         and 'BJCP' (a dict of title -> rating text, possibly empty)
    """

    def build_record(entry):
        # The second anchor of an entry carries the link to the user's profile.
        profile_url = entry.find_all("a")[1].get('href')
        name_tag = entry.find("p", {"class": "username"})
        bjcp_rows = entry.find_all("div", {"class": "row pk-comments-bjcp-bar"})

        record = {}
        # The rating is the number of star icons found in the entry.
        record['rating'] = len(entry.find_all("i", {"class": "fa fa-star fa-lg"}))
        # The user name is the first child of the username paragraph; the date
        # sits in a <sub> element inside the same paragraph.
        record['user'] = list(name_tag.children)[0].strip()
        record['date'] = name_tag.find("sub").text
        record['user_id'] = profile_url.split('/')[-1]
        record['user_profile'] = profile_url
        record['user_avatar_link'] = entry.find("img").get("src")
        record['comment'] = entry.find("p", {"class": "content"}).text.strip()
        record['BJCP'] = dict(
            (row.find('p', {"class": "title"}).text, row.find('p', {"class": "rating"}).text)
            for row in bjcp_rows
        )
        return record

    ratings_panel = soup.find("div", {"class": "panel panel-default hidden-xs"})
    entries = ratings_panel.find_all("div", {"class": "row pk-comments-item pk-comment-link"})
    return [build_record(entry) for entry in entries]

def extract_user_ratings_from_beer_html(html_content):
    """Parse a beer page given as an HTML string and collect its user ratings.

    in:  string containing the HTML of a beer page
    out: list of dicts of extracted properties, exactly as returned by
         extract_user_ratings_from_beer_html_soup for the parsed page
    """
    return extract_user_ratings_from_beer_html_soup(
        BeautifulSoup(html_content, 'html.parser')
    )

In [62]:
# Preview the user ratings extracted from the sample beer page as a table.
pd.DataFrame(extract_user_ratings_from_beer_html(html_content))

Unnamed: 0,rating,user,date,user_id,user_profile,user_avatar_link,comment,BJCP
0,4,Grzegorz Marć,20/12/2017,5539,https://www.polskikraft.pl/profil/5539,https://lh4.googleusercontent.com/-H_P1fUtZ-XM...,,{}
1,3,Łukasz Sławiński,02/01/2016,1332,https://www.polskikraft.pl/profil/1332,https://scontent.xx.fbcdn.net/v/t1.0-1/p200x20...,,{}


## Creating the dataset

### Beers

In [None]:
# Highest beer id in the scraped dump; pages are read as piwa/1.html .. piwa/<LAST_BEER_ID>.html.
LAST_BEER_ID = 9740

beers_metadata = []                # one flat dict per beer
PK_ratings = []                    # one flat dict per user rating
beer_network_according_to_PK = []  # (beer_id, similar_beer_id) edges
bad_files = []                     # ids of pages that could not be read or parsed
bad_file_errors = {}               # id -> repr of the exception, for diagnosing failures

for i in tqdm(range(1, LAST_BEER_ID + 1)):
    path = f'piwa/{i}.html'
    try:
        # Reading happens inside the try so a missing or unreadable page is
        # recorded as a bad file instead of aborting the whole run.
        with open(path, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'html.parser')
        metadata = extract_beer_data_from_beer_html_soup(soup)
        ratings = extract_user_ratings_from_beer_html_soup(soup)
        similar_to = extract_similar_beers_from_beer_html_soup(soup)
        edges = [(i, x['similar']) for x in similar_to]

        # Tag the metadata with the beer id, then flatten the nested feature
        # dicts into top-level keys so each becomes its own column.
        metadata['beer_id'] = i
        metadata.update(metadata.pop('features'))
        metadata.update(metadata.pop('descriptive_features'))

        # Same for every rating: add the beer id and flatten the BJCP scores.
        for r in ratings:
            r['beer_id'] = i
            r.update(r.pop('BJCP'))

        # Append only once the whole page has been processed successfully.
        beers_metadata.append(metadata)
        PK_ratings += ratings
        beer_network_according_to_PK += edges
    except Exception as exc:
        # `Exception` (not a bare except) keeps KeyboardInterrupt working, so
        # the long loop can still be stopped by hand.
        bad_files.append(i)
        bad_file_errors[i] = repr(exc)

beers_metadata = pd.DataFrame(beers_metadata)
PK_ratings = pd.DataFrame(PK_ratings)
print('bad_files:', bad_files)

HBox(children=(FloatProgress(value=0.0, max=9740.0), HTML(value='')))

In [None]:
# Preview the first rows of the beer metadata table.
beers_metadata.head()

In [None]:
# Preview the first rows of the user ratings table.
PK_ratings.head()

Save the results to CSV files.

In [None]:
# Write the beer metadata table to CSV without the DataFrame index column.
beers_metadata.to_csv('beers_metadata.csv', index=False)

In [86]:
# Write the user ratings table to CSV without the DataFrame index column.
PK_ratings.to_csv('ratings.csv', index=False)

In [87]:
# Write the similarity edges, one "<beer_id>, <similar_beer_id>" pair per line
# (no header row; the separator is a comma followed by a space).
with open('beer_network.csv', 'w') as network_file:
    network_file.writelines(
        f"{edge[0]}, {edge[1]}\n" for edge in beer_network_according_to_PK
    )

In [88]:
# Copy the three result files into the dump folder.
!cp beers_metadata.csv ratings.csv beer_network.csv {dumpdir}

In [89]:
# Copy the same three result files into a second, hard-coded Drive folder.
# NOTE(review): this absolute path is specific to the author's Drive — adjust before reuse.
!cp beers_metadata.csv ratings.csv beer_network.csv /content/drive/MyDrive/DANsem3/SR