In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from time import sleep
import os
import gzip

# Basic data structure

In [5]:
class CheckIn:
    """One Untappd check-in, as extracted from a feed page.

    A plain value holder: every constructor argument is stored on the
    attribute of the same name.
    """

    def __init__(self, checkin_id, user_id, beer_id, rating=None, location_id=None, comment=None, tagged_friends=None, cheers=0):
        """Store the check-in fields.

        Args:
            checkin_id: Identifier of the check-in.
            user_id: Identifier of the user who checked in.
            beer_id: Identifier of the beer, or None when unknown.
            rating: Rating given to the beer, or None when absent.
            location_id: Identifier of the venue, or None when absent.
            comment: Free-text comment, or None when absent.
            tagged_friends: List of tagged user ids. When omitted (None), a
                fresh empty list is created for this instance.
            cheers: Number of cheers (toasts) the check-in received.
        """
        self.checkin_id = checkin_id
        self.user_id = user_id
        self.beer_id = beer_id
        self.rating = rating
        self.location_id = location_id
        self.comment = comment
        # A literal `[]` default is evaluated once and would be shared by every
        # instance built without `tagged_friends`; mutating one instance's list
        # would then silently change all the others.
        self.tagged_friends = [] if tagged_friends is None else tagged_friends
        self.cheers = cheers

    def __repr__(self):
        """Return all fields, in constructor order, as a parenthesised tuple-like string."""
        return f"({self.checkin_id}, {self.user_id}, {self.beer_id}, {self.rating}, {self.location_id}, {self.comment}, {self.tagged_friends}, {self.cheers})"

# Scraping helpers

In [1]:
# HTTP headers sent with every request: a desktop Firefox User-Agent, an
# XMLHttpRequest marker, and cache-bypass directives.
default_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    # Conventional marker for AJAX calls.
    "X-Requested-With": "XMLHttpRequest",
    # Ask intermediaries not to serve cached responses.
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}

def referer_headers(referer):
    """Return a new header dict: ``default_headers`` plus a ``referer`` entry.

    ``default_headers`` itself is left untouched.

    Args:
        referer: Value to send under the ``referer`` header key.

    Returns:
        dict: A shallow copy of ``default_headers`` with ``'referer'`` set.
    """
    return {**default_headers, 'referer': referer}

# Cookie sent with every request (presumably the untappd.com login session).
# It is a credential, so it is read from the UNTAPPD_USER_V3_E environment
# variable instead of being committed with the notebook. The value that used
# to be hard-coded here should be considered leaked and be revoked.
_untappd_session = os.environ.get("UNTAPPD_USER_V3_E", "")
if not _untappd_session:
    print("WARNING: UNTAPPD_USER_V3_E is not set; requests will be sent without a session cookie")
cookies = {'untappd_user_v3_e': _untappd_session} if _untappd_session else {}

In [9]:
def parse_checkin_from_html(checkin):
    """Build a ``CheckIn`` from one check-in element of a parsed feed page.

    Reads, from the given element:
      * ``data-checkin-id``: the check-in id (int);
      * ``p.text``: the user link (``a.user``), and the links that directly
        follow the text nodes ``' is drinking a '`` (beer) and ``' at '``
        (location);
      * ``div.caps[data-rating]``: the rating (float);
      * ``p.comment-text``: the stripped comment text;
      * ``div.tagged-friends``: one user id per contained link;
      * ``div.cheers span.count span``: the cheers count (int).

    Every id is the last ``/``-separated segment of the link's ``href``.
    Optional parts that are missing yield ``None`` (``[]`` for tagged friends,
    ``0`` for cheers).

    NOTE(review): only the exact text node ``' is drinking a '`` is recognised;
    if the site also renders ``' is drinking an '``, ``beer_id`` stays None for
    those check-ins -- confirm against real pages.
    """
    def last_path_segment(link):
        # "/user/foo" -> "foo", "/b/some-beer/123" -> "123"
        return link["href"].split("/")[-1]

    def linked_id_after(marker, nodes):
        # Integer id of the link immediately following the text node `marker`.
        if marker not in nodes:
            return None
        return int(last_path_segment(nodes[nodes.index(marker) + 1]))

    checkin_id = int(checkin["data-checkin-id"])
    text_block = checkin.find("p", {"class": "text"})
    user_id = last_path_segment(text_block.find("a", {"class": "user"}))
    sentence_nodes = text_block.contents

    beer_id = linked_id_after(' is drinking a ', sentence_nodes)
    location_id = linked_id_after(' at ', sentence_nodes)

    caps = checkin.find("div", {"class": "caps"})
    rating = float(caps['data-rating']) if caps is not None else None

    comment_block = checkin.find("p", {"class": "comment-text"})
    comment = comment_block.text.strip() if comment_block is not None else None

    friends_block = checkin.find("div", {"class": "tagged-friends"})
    if friends_block is None:
        tagged_friends = []
    else:
        tagged_friends = [last_path_segment(link) for link in friends_block.findAll("a")]

    cheers_block = checkin.find("div", {"class": "cheers"})
    if cheers_block is None:
        cheers = 0
    else:
        cheers = int(cheers_block.find("span", {"class": "count"}).find("span").text)

    return CheckIn(checkin_id, user_id, beer_id, rating, location_id, comment, tagged_friends, cheers)


def scrap_user_checkins(user_id, max_checkins=100):
    """Collect the most recent check-ins shown on a user's profile feed.

    Fetches ``https://untappd.com/user/<user_id>``, then keeps requesting the
    ``profile/more_feed`` endpoint, passing the id of the last check-in seen,
    until either at least `max_checkins` check-ins were gathered or a page
    comes back with no check-ins.

    Args:
        user_id: Profile name as it appears in the profile URL.
        max_checkins: Stop paging once this many check-ins were collected.
            Pages are appended whole, so the result may be slightly longer.

    Returns:
        list[CheckIn]: Check-ins in the order they were served; empty when the
        profile page contains none.
    """
    profile_url = f"https://untappd.com/user/{user_id}"
    response = requests.get(profile_url, headers=default_headers, cookies=cookies)
    checkin_nodes = BeautifulSoup(response.text, 'html.parser').select('div[id*="checkin_"]')
    checkins = [parse_checkin_from_html(node) for node in checkin_nodes]

    # The header value must be the bare URL; the header name is supplied by
    # referer_headers, so prefixing the value with "Referer: " is wrong.
    more_checkins_headers = referer_headers(profile_url)

    while checkins and len(checkins) < max_checkins:
        last_checkin_id = checkins[-1].checkin_id
        more_checkins_url = f"https://untappd.com/profile/more_feed/{user_id}/{last_checkin_id}?v2=true"
        response = requests.get(more_checkins_url, headers=more_checkins_headers, cookies=cookies)
        checkin_nodes = BeautifulSoup(response.text, 'html.parser').select('div[id*="checkin_"]')
        new_checkins = [parse_checkin_from_html(node) for node in checkin_nodes]
        if not new_checkins:
            # Feed exhausted: without this exit the same URL would be
            # requested forever for users with fewer than max_checkins.
            break
        checkins += new_checkins
        print(f"Found {len(checkins)}")

    return checkins

# Crawling

In [10]:
def user_ids_to_df(user_ids):
    """Create the crawl table for a batch of not-yet-visited users.

    Args:
        user_ids: Iterable of user ids; they become the ``user_id`` index.

    Returns:
        pandas.DataFrame: One row per id, with ``is_visited`` False, counters
        set to -1 (int32), ``is_supporter`` False (bool) and every other
        profile field set to None.
    """
    # Column name -> placeholder value; insertion order is the column order.
    placeholders = {
        "is_visited": False,
        "name": None,
        "num_checkins": -1,
        "num_beers": -1,
        "num_badges": -1,
        "num_friends": -1,
        "friends": None,
        "is_supporter": False,
        "facebook": None,
        "twitter": None,
        "foursquare": None,
        "location": None,
        "profile_picture": None,
        "profile_banner": None,
    }

    df = pd.DataFrame(index=pd.Index(user_ids, name="user_id"))
    for column, placeholder in placeholders.items():
        df[column] = placeholder

    # The `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `bool` is the supported spelling of the same dtype.
    return df.astype({
        "is_supporter": bool,
        "num_checkins": np.int32,
        "num_beers": np.int32,
        "num_badges": np.int32,
        "num_friends": np.int32,
    })

In [11]:
# Row written for a user that was visited but yielded no profile data:
# is_visited is True and every other column keeps its placeholder value.
default_visited_row = (
    True,   # is_visited
    None,   # name
    -1,     # num_checkins
    -1,     # num_beers
    -1,     # num_badges
    -1,     # num_friends
    None,   # friends
    False,  # is_supporter
    None,   # facebook
    None,   # twitter
    None,   # foursquare
    None,   # location
    None,   # profile_picture
    None,   # profile_banner
)

def visited_user_to_row(user):
    """Flatten a scraped user object into a crawl-table row.

    Args:
        user: Object exposing the attributes ``name``, ``num_checkins``,
            ``num_beers``, ``num_badges``, ``num_friends``, ``friends``,
            ``is_supporter``, ``facebook``, ``twitter``, ``foursquare``,
            ``location``, ``profile_picture`` and ``profile_banner``.

    Returns:
        tuple: ``True`` (the visited flag) followed by those attributes, in
        that order.
    """
    profile_fields = (
        "name",
        "num_checkins",
        "num_beers",
        "num_badges",
        "num_friends",
        "friends",
        "is_supporter",
        "facebook",
        "twitter",
        "foursquare",
        "location",
        "profile_picture",
        "profile_banner",
    )
    return (True, *(getattr(user, field) for field in profile_fields))

In [12]:
# Seed profiles from which the crawl starts.
start_users = [
    "Dobby67",
    "Sourtats",
    "pasvaiste",
    "TombiLion",
    "errau",
    "timm3h",
    "Sheehan",
    "Jonnyhead",
]
# Fresh crawl table holding only the (unvisited) seeds.
users_df = user_ids_to_df(start_users)

In [13]:
# Resume from the checkpoint written by the crawl loop when one exists.
# On a first run there is no file yet, and the table built in an earlier cell
# is kept instead of failing with FileNotFoundError.
# NOTE: unpickling executes arbitrary code -- only load a checkpoint file that
# this notebook produced itself.
if os.path.exists("crawled_users.pkl"):
    users_df = pd.read_pickle("crawled_users.pkl")

In [1]:
# Crawl the friendship graph: repeatedly take the first unvisited user of the
# table, scrape the profile, append any friend not yet in the table, mark the
# user as visited, and checkpoint the table to disk. Stops when no unvisited
# user is left or after 1000 profiles in this run.
i = 0
while i < 1000:
    # One predicate for both "is there work left" and "who is next".
    unvisited_ids = users_df.index[users_df.is_visited == False]  # noqa: E712 (element-wise comparison)
    if len(unvisited_ids) == 0:
        break
    user_id = unvisited_ids[0]
    print(f"Scrapping {user_id}:")

    # NOTE(review): scrap_user is not defined in any cell visible here; it is
    # assumed to return None for a private profile, and otherwise an object
    # with the attributes read by visited_user_to_row -- confirm.
    user_stats = scrap_user(user_id)
    sleep(3)  # pause between profile requests

    if user_stats is not None:
        print(f"    -> public profile ({user_stats.num_friends} friends)")
        # dict.fromkeys drops repeated ids while keeping their order, so one
        # friend can never be appended as two rows with the same index label.
        unseen_users = [friend for friend in dict.fromkeys(user_stats.friends) if friend not in users_df.index]
        print(f"    -> with {len(unseen_users)} unseen ids")
        if unseen_users:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement.
            users_df = pd.concat([users_df, user_ids_to_df(unseen_users)])
        users_df.loc[user_id] = visited_user_to_row(user_stats)
    else:
        print("    -> private profile")
        users_df.loc[user_id] = default_visited_row

    print(f"    -> {users_df.shape[0]} entries in total")
    # Checkpoint after every profile so that an interrupted run can resume.
    users_df.to_pickle("crawled_users.pkl")
    i += 1

NameError: name 'users_df' is not defined

12000 at 13:01  
50400 at 14:34  
84812 at 17:15  
94332 at 18:14  
104519 at 19:41  
128542 at 23:28  
136000 at 00:00

restarted at 10h15
156551 at 12:50
