In [1]:
import os
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Basic data structure

In [29]:
class User:
    """Plain record holding the fields scraped for one user profile.

    Every constructor argument is stored unchanged on an attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(self, user_id, name, num_checkins, num_beers, num_friends, friends, num_badges, is_supporter, facebook, twitter, foursquare, location, profile_picture, profile_banner):
        # Identity
        self.user_id, self.name = user_id, name
        # Activity counters and the friend list
        self.num_checkins, self.num_beers = num_checkins, num_beers
        self.num_friends, self.friends = num_friends, friends
        self.num_badges, self.is_supporter = num_badges, is_supporter
        # Linked social accounts (each may be None)
        self.facebook, self.twitter, self.foursquare = facebook, twitter, foursquare
        # Profile details
        self.location = location
        self.profile_picture, self.profile_banner = profile_picture, profile_banner

    def __repr__(self):
        """Return a one-line summary: name, id, the four counters and supporter status."""
        if self.is_supporter:
            support_prefix = "is a"
        else:
            support_prefix = "not a"
        return f"{self.name} ({self.user_id}), {self.num_beers} beers, {self.num_checkins} checkins, {self.num_friends} friends, {self.num_badges} badges, {support_prefix} supporter"

In [4]:
class CheckIn:
    """Plain record describing a single check-in.

    Parameters
    ----------
    checkin_id, user_id, beer_id
        Identifiers of the check-in, its author and the beer.
    rating, location_id, comment
        Optional details; None when absent.
    tagged_friends
        List of tagged user ids. When omitted (or None) each instance gets
        its own fresh empty list, so mutating one check-in's list never
        leaks into another.
    cheers
        Number of cheers, 0 by default.
    """

    def __init__(self, checkin_id, user_id, beer_id, rating = None, location_id = None, comment = None, tagged_friends = None, cheers = 0):
        self.checkin_id = checkin_id
        self.user_id = user_id
        self.beer_id = beer_id
        self.rating = rating
        self.location_id = location_id
        self.comment = comment
        # A literal [] default would be created once and shared by every instance.
        self.tagged_friends = [] if tagged_friends is None else tagged_friends
        self.cheers = cheers

    def __repr__(self):
        """Return all fields as a parenthesised, comma-separated tuple-like string."""
        return f"({self.checkin_id}, {self.user_id}, {self.beer_id}, {self.rating}, {self.location_id}, {self.comment}, {self.tagged_friends}, {self.cheers})"

In [5]:
class Beer:
    """Plain record holding the fields describing one beer.

    Every constructor argument is stored unchanged on an attribute of the
    same name.
    """

    def __init__(self, beer_id, name, brewery_id, style, abv, ibu, avg_rating, total_ratings, total_checkins, unique_users):
        # Identity
        self.beer_id, self.name, self.brewery_id = beer_id, name, brewery_id
        # Characteristics
        self.style, self.abv, self.ibu = style, abv, ibu
        # Popularity figures
        self.avg_rating, self.total_ratings = avg_rating, total_ratings
        self.total_checkins, self.unique_users = total_checkins, unique_users

    def __repr__(self):
        """Return all fields as a parenthesised, comma-separated string (ABV/IBU labelled)."""
        summary = f"({self.beer_id}, {self.name}, {self.brewery_id}, {self.style}, {self.abv} ABV, {self.ibu} IBU, {self.avg_rating}, {self.total_ratings}, {self.total_checkins}, {self.unique_users})"
        return summary

# Scraping helpers

In [64]:
# Baseline HTTP headers used by the scraping helpers in this notebook.
default_headers = {
    # Browser-like identity (a desktop Firefox 80 user-agent string).
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    # Conventional marker for AJAX calls; presumably expected by the paginated endpoints — TODO confirm.
    "X-Requested-With": "XMLHttpRequest",
    # Ask for uncached responses.
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

def referer_headers(referer):
    """Return a new header dict: a copy of `default_headers` plus a 'referer' entry.

    `default_headers` itself is left untouched.
    """
    return {**default_headers, 'referer': referer}

# Session cookie sent with every request. Its value is a credential, so it is read from the
# UNTAPPD_USER_V3_E environment variable instead of being committed with the notebook
# (export it before starting the kernel). Any value that was previously committed should be
# treated as leaked and invalidated.
_untappd_session = os.environ.get("UNTAPPD_USER_V3_E")
cookies = {'untappd_user_v3_e': _untappd_session} if _untappd_session else {}
if not cookies:
    print("WARNING: UNTAPPD_USER_V3_E is not set; requests will be sent without a session cookie")

In [65]:
def find_stats_from_div(div, user_id):
    """Read the four profile counters out of the stats container.

    Each counter is located through the anchor whose href is the user's
    profile path (optionally followed by /beers, /friends or /badges); the
    number is the text of the nested span with class "stat", with thousands
    separators removed.

    Returns the tuple (num_checkins, num_beers, num_friends, num_badges).
    """
    hrefs = (
        f"/user/{user_id}",
        f"/user/{user_id}/beers",
        f"/user/{user_id}/friends",
        f"/user/{user_id}/badges",
    )

    def read_counter(href):
        anchor = div.find("a", {"href": href})
        raw_value = anchor.find("span", {"class": "stat"}).text
        return int(raw_value.replace(",", ""))

    return tuple(read_counter(href) for href in hrefs)
    

def find_details_from_div(div):
    """Extract the location and social links from the user-details container.

    Returns the tuple (location, facebook, twitter, foursquare). `location`
    is None when the "location" paragraph is missing or empty. Each social
    entry is the href of the link whose text is exactly "Facebook",
    "Twitter" or "Foursquare", or None when no such link exists (including
    when the "social" container itself is absent).
    """
    location_p = div.find("p", {"class": "location"})
    # Empty text and a missing element are both reported as "no location".
    location = None if location_p is None else (location_p.text or None)

    hrefs_by_label = {}
    socials_div = div.find("div", {"class": "social"})
    if socials_div is not None:
        for link in socials_div.find_all("a"):
            # A later link with the same label overrides an earlier one.
            hrefs_by_label[link.text] = link["href"]

    return (
        location,
        hrefs_by_label.get("Facebook"),
        hrefs_by_label.get("Twitter"),
        hrefs_by_label.get("Foursquare"),
    )
    

def scrap_user_stats(user_id):
    """Download a user's profile page and turn it into a `User`.

    The page is requested repeatedly (2 s pause between attempts, no upper
    bound) until the server answers with HTTP 200. Returns None when the
    page contains a "private_user" div; otherwise returns a `User` whose
    `friends` attribute is None.
    """
    user_url = f"https://untappd.com/user/{user_id}"

    # Keep polling until the profile page is served successfully.
    while True:
        request = requests.get(user_url, headers=default_headers, cookies=cookies)
        if request.status_code == 200:
            break
        print(f"GET ERROR {request.status_code} for {user_id}'s stats", end="\r")
        sleep(2)

    soup = BeautifulSoup(request.text, 'html.parser')

    private_marker = soup.find("div", {"class": "private_user"})
    if private_marker is not None:
        return None

    info_div = soup.find("div", {"class": "info"})
    name = info_div.find("h1").text.strip()

    supporter_badge = soup.find("div", {"class": "user-info"}).find("span", {"class": "supporter"})
    is_supporter = supporter_badge is not None

    num_checkins, num_beers, num_friends, num_badges = find_stats_from_div(
        soup.find("div", {"class": "stats"}), user_id
    )
    location, facebook, twitter, foursquare = find_details_from_div(
        soup.find("div", {"class": "user-details"})
    )

    avatar_img = soup.find("div", {"class": "avatar-holder"}).find("img")
    profile_picture = avatar_img["src"]
    profile_banner = soup.find("div", {"class": "profile_header"})["data-image-url"]

    return User(
        user_id=user_id,
        name=name,
        num_checkins=num_checkins,
        num_beers=num_beers,
        num_friends=num_friends,
        friends=None,  # filled in separately by the friends scraper
        num_badges=num_badges,
        is_supporter=is_supporter,
        facebook=facebook,
        twitter=twitter,
        foursquare=foursquare,
        location=location,
        profile_picture=profile_picture,
        profile_banner=profile_banner,
    )

In [None]:
def scrap_user_friends(user_id, number_of_friends=2147483647):
    """Collect the usernames listed on a user's friends pages.

    Parameters
    ----------
    user_id : str
        Profile identifier used in the URL.
    number_of_friends : int
        Expected friend count. 0 returns [] without any request; pagination
        stops as soon as exactly this many names have been collected.

    Returns
    -------
    list of str
        Usernames in the order they were served.

    Every request is retried until it returns HTTP 200 (no upper bound),
    sleeping 30 s after a 429 on the paginated endpoint and 2 s otherwise.
    """
    if number_of_friends == 0:
        return []

    friends_url = f"https://untappd.com/user/{user_id}/friends"
    request = requests.get(friends_url, headers=default_headers, cookies=cookies)
    while request.status_code != 200:
        print(f"GET ERROR {request.status_code} for {user_id}'s friends", end="\r")
        sleep(2)
        # Retry the friends page itself (the previous code referenced an undefined `user_url`).
        request = requests.get(friends_url, headers=default_headers, cookies=cookies)

    soup = BeautifulSoup(request.text, 'html.parser')
    first_page_spans = soup.find("div", {"class": "current"}).find_all("span", {"class": "username"})
    friends = [span.text for span in first_page_spans]

    old_number_of_friends = 0
    headers = referer_headers(friends_url)
    # Stop when the expected count is reached or when a page adds nothing new.
    while number_of_friends != len(friends) and old_number_of_friends != len(friends):
        offset = len(friends)
        more_friends_url = f"https://untappd.com/friend/more_friends/{user_id}/{offset}?sort="

        request = requests.get(more_friends_url, headers=headers, cookies=cookies)
        while request.status_code != 200:
            print(f"GET ERROR {request.status_code} for {user_id} with {offset}th friends", end="\r")
            # Always pause before retrying so a failing endpoint is not hammered in a tight loop;
            # back off longer when the server reports rate limiting.
            sleep(30 if request.status_code == 429 else 2)
            request = requests.get(more_friends_url, headers=headers, cookies=cookies)

        soup = BeautifulSoup(request.text, 'html.parser')
        old_number_of_friends = len(friends)
        friends = friends + [span.text for span in soup.find_all("span", {"class": "username"})]
    return friends

def scrap_user(user_id):
    """Scrape a profile together with its friend list.

    Returns None when `scrap_user_stats` returns None; otherwise returns the
    object from `scrap_user_stats` with its `friends` attribute replaced by
    the result of `scrap_user_friends`.
    """
    user = scrap_user_stats(user_id)
    if user is not None:
        # The friend count from the profile tells the friends scraper when to stop paginating.
        user.friends = scrap_user_friends(user_id, user.num_friends)
    return user

In [18]:
def parse_checkin_from_html(checkin):
    """Build a `CheckIn` from one check-in element of the activity feed.

    Reads the check-in id from the element's "data-checkin-id" attribute,
    the user id from the "user" link, and the beer / venue ids from the
    links that directly follow the " is drinking a(n) " and " at " text
    fragments of the description paragraph. Rating, comment, tagged friends
    and cheers are optional and default to None, None, [] and 0.
    """
    def linked_id_after(nodes, markers):
        # Return the numeric id at the end of the href of the node that follows
        # the first marker present in `nodes`, or None when no marker is present.
        for marker in markers:
            if marker in nodes:
                return int(nodes[nodes.index(marker) + 1]['href'].split("/")[-1])
        return None

    checkin_id = int(checkin["data-checkin-id"])
    checkin_div = checkin.find("p", {"class": "text"})
    user_id = checkin_div.find("a", {"class": "user"})["href"].split("/")[-1]
    checkin_description = checkin_div.contents

    # NOTE(review): assumes the feed also renders " is drinking an " for some beer names;
    # accepting it is harmless if it never occurs — confirm against live markup.
    beer_id = linked_id_after(checkin_description, (' is drinking a ', ' is drinking an '))
    location_id = linked_id_after(checkin_description, (' at ',))

    rating_div = checkin.find("div", {"class": "caps"})
    rating = None if rating_div is None else float(rating_div['data-rating'])

    comment_div = checkin.find("p", {"class": "comment-text"})
    comment = None if comment_div is None else comment_div.text.strip()

    tagged_friends_div = checkin.find("div", {"class": "tagged-friends"})
    if tagged_friends_div is None:
        tagged_friends = []
    else:
        tagged_friends = [a["href"].split("/")[-1] for a in tagged_friends_div.find_all("a")]

    cheers_div = checkin.find("div", {"class": "cheers"})
    cheers = 0 if cheers_div is None else int(cheers_div.find("span", {"class": "count"}).find("span").text)

    return CheckIn(checkin_id, user_id, beer_id, rating, location_id, comment, tagged_friends, cheers)


def scrap_user_checkins(user_id, max_checkins=100):
    """Collect a user's most recent check-ins from the profile feed.

    The profile page supplies the first batch; further batches are requested
    from the "more_feed" endpoint, keyed by the id of the last check-in
    already collected.

    Parameters
    ----------
    user_id : str
        Profile identifier used in the URL.
    max_checkins : int
        Paging stops once at least this many check-ins have been collected
        (the final batch is kept whole, so the result may be longer).

    Returns
    -------
    list
        Objects produced by `parse_checkin_from_html`, in feed order.
    """
    profile_url = f"https://untappd.com/user/{user_id}"
    request = requests.get(profile_url, headers=default_headers, cookies=cookies)
    checkins_div = BeautifulSoup(request.text, 'html.parser').select('div[id*="checkin_"]')
    checkins = [parse_checkin_from_html(checkin) for checkin in checkins_div]

    # The header value is just the URL; the header name is added by referer_headers.
    more_checkins_headers = referer_headers(profile_url)

    previous_len = 0
    while len(checkins) != previous_len and len(checkins) < max_checkins:
        # Remember the size before fetching so an empty batch ends the loop
        # instead of requesting the same page forever.
        previous_len = len(checkins)
        last_checkin_id = checkins[-1].checkin_id
        more_checkins_url = f"https://untappd.com/profile/more_feed/{user_id}/{last_checkin_id}?v2=true"
        request = requests.get(more_checkins_url, headers=more_checkins_headers, cookies=cookies)
        checkins_div = BeautifulSoup(request.text, 'html.parser').select('div[id*="checkin_"]')
        checkins += [parse_checkin_from_html(checkin) for checkin in checkins_div]
        print(f"Found {len(checkins)}")

    return checkins

In [8]:
# Exploratory fetch of a single beer page; the parsed document is kept in `soup`
# for the next cell.
beer_id = 96597
beer_url = f"https://untappd.com/b/a/{beer_id}"
request = requests.get(
    beer_url,
    headers=default_headers,
    cookies=cookies,
)
soup = BeautifulSoup(request.text, 'html.parser')

In [9]:
# Header section: beer name, brewery identifier (last path segment of the first link) and style.
name_div = soup.find("div", {"class": "name"})
name = name_div.find("h1").text
brewery_id = name_div.find("a")['href'].rsplit("/", 1)[-1]
style = name_div.find("p", {"class":"style"}).text

# Details section: strength, bitterness and rating figures.
details_div = soup.find("div", {"class": "details"})
abv = float(details_div.find("p", {"class": "abv"}).text.partition("%")[0])  # text before the "%" sign
ibu_text = details_div.find("p", {"class": "ibu"}).text.strip()
if ibu_text == "No IBU":
    ibu = None
else:
    ibu = int(ibu_text.split(" ")[0])
avg_rating = float(details_div.find("div", {"class": "caps"})["data-rating"])
raters_text = details_div.find("p", {"class": "raters"}).text
total_ratings = int(raters_text.split(" ")[0].replace(",", ""))
del raters_text  # keep the notebook namespace identical to before

# Display the parsed fields as the cell output.
name, brewery_id, style, abv, ibu, avg_rating, total_ratings

('Gulden Draak 9000 Quadruple',
 'BrouwerijVanSteenberge',
 'Belgian Quadrupel',
 10.5,
 25,
 3.88228,
 105603)

# Crawling

In [19]:
def user_ids_to_df(user_ids):
    """Create the crawl-state frame for a batch of not-yet-visited users.

    Parameters
    ----------
    user_ids : iterable of str
        Identifiers used as the index (named "user_id").

    Returns
    -------
    pandas.DataFrame
        One row per id with the columns is_visited, name, checkins, beers,
        number_or_friends, friends, badges and is_supporter, filled with
        "nothing known yet" defaults (False / None / 0). `checkins`,
        `number_or_friends` and `badges` are int32 and `is_supporter` is
        bool; `beers` keeps pandas' default integer dtype.
    """
    df = pd.DataFrame(index=pd.Index(user_ids, name="user_id"))
    df["is_visited"] = False
    df["name"] = None
    df["checkins"] = 0
    df["beers"] = 0
    # The "number_or_friends" spelling is kept: it must match frames that were already saved.
    df["number_or_friends"] = 0
    df["friends"] = None
    df["badges"] = 0
    df["is_supporter"] = False

    # The builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
    return df.astype({
        "is_supporter": bool,
        "badges": np.int32,
        "checkins": np.int32,
        "number_or_friends": np.int32,
    })

In [9]:
# Seed accounts from which the friend-graph crawl starts.
start_users = [
    "Dobby67",
    "Sourtats",
    "pasvaiste",
    "TombiLion",
    "errau",
    "timm3h",
    "Sheehan",
    "Jonnyhead",
]
# Fresh crawl state: every seed is present and marked as not visited.
users_df = user_ids_to_df(start_users)

In [20]:
# Resume from the saved crawl state when one exists. On a first run the file is absent and
# whatever `users_df` already holds (e.g. the seed frame) is kept instead of crashing.
# NOTE: unpickling can execute arbitrary code — only load a file written by this notebook.
try:
    users_df = pd.read_pickle("untappd/crawled_users.pkl")
except FileNotFoundError:
    print("No saved crawl found at untappd/crawled_users.pkl; keeping the current users_df")

In [None]:
# Breadth-first crawl of the friend graph: visit the first unvisited user, record the profile,
# queue any friends not seen before, and checkpoint the frame after every user.
# At most 1000 users are processed per run of this cell.
i = 0
while i < 1000:
    # Cast to bool first: row-wise updates can leave the column as object dtype,
    # on which `~` would not behave as a logical negation.
    pending_ids = users_df.index[~users_df.is_visited.astype(bool)]
    if len(pending_ids) == 0:
        break
    user_id = pending_ids[0]
    print(f"Scrapping {user_id}:")

    user_stats = scrap_user(user_id)
    sleep(2)

    if user_stats is not None:
        print(f"    -> public profile ({user_stats.num_friends} friends)")
        # dict.fromkeys drops repeated names while keeping order, so a friend listed twice
        # cannot create duplicate index rows.
        unseen_users = [friend for friend in dict.fromkeys(user_stats.friends) if friend not in users_df.index]
        print(f"    -> with {len(unseen_users)} unseen ids")
        if unseen_users:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
            users_df = pd.concat([users_df, user_ids_to_df(unseen_users)])
        users_df.loc[user_id] = (True, user_stats.name, user_stats.num_checkins, user_stats.num_beers, user_stats.num_friends, user_stats.friends, user_stats.num_badges, user_stats.is_supporter)
    else:
        print("    -> private profile")
        users_df.loc[user_id] = (True, None, 0, 0, 0, None, 0, False)

    print(f"    -> {users_df.shape[0]} entries in total")
    # Checkpoint after every user so an interrupted crawl can be resumed.
    users_df.to_pickle("untappd/crawled_users.pkl")
    i+=1

Scrapping Issegoz:
GET ERROR 429 for Issegoz with 23000th friends