In [1]:
import threading
import time
import random
import requests
import pandas as pd
import numpy as np

from user_stats_scrapper import UserStatsScrapper
from user_friends_scrapper import UserFriendsScrapper

In [2]:
def user_ids_to_df(user_ids):
    """Build a frame of not-yet-visited users, one row per id in ``user_ids``.

    The frame is indexed by the ids (index name ``"user_id"``) and has the same
    column order that ``default_visited_row`` / ``visited_user_to_row`` produce,
    filled with placeholder values: ``False`` for the flags, ``-1`` for the
    counters and ``None`` for everything else.

    Args:
        user_ids: sequence of user identifiers; may be empty.

    Returns:
        pandas.DataFrame with bool flags, int32 counters and object columns.
    """
    df = pd.DataFrame(index=pd.Index(user_ids, name="user_id"))
    df["is_visited"] = False
    df["name"] = None
    df["num_checkins"] = -1
    df["num_beers"] = -1
    df["num_badges"] = -1
    df["num_friends"] = -1
    df["friends"] = None
    df["is_supporter"] = False
    df["facebook"] = None
    df["twitter"] = None
    df["foursquare"] = None
    df["location"] = None
    df["profile_picture"] = None
    df["profile_banner"] = None

    # Pin dtypes explicitly so an empty id list still yields real bool/int32
    # columns. The builtin ``bool`` replaces the ``np.bool`` alias, which was
    # deprecated in NumPy 1.20 and removed in 1.24.
    return df.astype(
        {
            "is_visited": bool,
            "is_supporter": bool,
            "num_checkins": np.int32,
            "num_beers": np.int32,
            "num_badges": np.int32,
            "num_friends": np.int32,
        }
    )

In [3]:
# Row stored for a user that was visited but yielded no profile data.
# Field order follows the columns created by user_ids_to_df.
default_visited_row = (
    True,   # is_visited
    None,   # name
    -1,     # num_checkins
    -1,     # num_beers
    -1,     # num_badges
    -1,     # num_friends
    None,   # friends
    False,  # is_supporter
    None,   # facebook
    None,   # twitter
    None,   # foursquare
    None,   # location
    None,   # profile_picture
    None,   # profile_banner
)

def visited_user_to_row(user):
    """Flatten a scraped ``user`` object into a row tuple marked as visited.

    The first element is always ``True`` (the ``is_visited`` flag); the rest
    are the user's attributes, read in the order listed below.
    """
    attribute_order = (
        "name",
        "num_checkins",
        "num_beers",
        "num_badges",
        "num_friends",
        "friends",
        "is_supporter",
        "facebook",
        "twitter",
        "foursquare",
        "location",
        "profile_picture",
        "profile_banner",
    )
    return (True, *(getattr(user, attribute) for attribute in attribute_order))

In [4]:
# Lock held by user_scrapping_thread around every update of the shared `users_df`.
df_lock = threading.Lock()
# Crawl state, restored from the checkpoint that user_scrapping_thread writes.
# NOTE: unpickling can execute arbitrary code -- only load a file this notebook produced.
users_df = pd.read_pickle("crawled_users.pkl")

In [5]:
def log(thread_id, msg, is_local):
    """Append ``msg`` to the per-thread log file and optionally echo it.

    Args:
        thread_id: identifier used as the log file name (``log/<thread_id>.txt``).
        msg: text to record; a newline is appended when writing to the file.
        is_local: when true, the message is also printed to stdout.
    """
    import os  # local import keeps this cell runnable on its own

    # Create the log directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs("log", exist_ok=True)
    with open(f"log/{thread_id}.txt", "a") as f:
        f.write(msg + "\n")
    if is_local:
        print(msg)

In [6]:
def user_scrapping_thread(thread_id, user_ids, proxies, is_local=False):
    """Scrape every user in ``user_ids`` and record the result in ``users_df``.

    For each id the stats scraper is called first. A ``None`` result is logged
    as a private profile and stored as ``default_visited_row``. Otherwise the
    friend list is scraped as well, friends not yet present in ``users_df`` are
    added as unvisited rows, and the user's own row is overwritten with the
    scraped data.

    All reads and writes of the shared ``users_df`` happen while holding
    ``df_lock``, including the "is this friend already known?" check, so two
    threads cannot both insert the same friend id.

    Args:
        thread_id: identifier used for the per-thread log file.
        user_ids: iterable of user ids to scrape.
        proxies: passed through unchanged to both scrapers' ``scrap`` calls.
        is_local: when true, log messages are also printed and ``users_df`` is
            pickled to ``crawled_users.pkl`` after every user.
    """
    import os  # local import: only needed for the optional cookie override

    global users_df
    # NOTE(security): a session cookie is hardcoded here. Prefer supplying it
    # through the UNTAPPD_AUTH_COOKIE environment variable; the literal is
    # kept only as a fallback so existing runs keep working.
    auth_cookie = {
        'untappd_user_v3_e': os.environ.get(
            "UNTAPPD_AUTH_COOKIE",
            '59884cc5903a2ad0d4a2707a8caf891d9ac17e0c016977b66432c1e7ae6b2d5667ed6a177cccf18861870eb1c0d6b333888d6d0c01ae69b45e5dcd0c5bb00d1edReLZMP%2Fi3XSY3q3FUNdC6FMVPkz3hUGk%2FFPBfVStfaamglZ0wJMZczAFofaAewWTdWi%2BCC260FZ1uGrzfRWGg%3D%3D',
        )
    }

    for user_id in user_ids:
        log(thread_id, f"Scrapping {user_id}:", is_local)

        user_stats = UserStatsScrapper(user_id).scrap(auth_cookie, proxies)
        time.sleep(2)

        if user_stats is None:
            log(thread_id, "    -> private profile", is_local)
            friend_ids = []
            row = default_visited_row
        else:
            log(thread_id, f"    -> public profile ({user_stats.num_friends} friends)", is_local)
            user_friends = UserFriendsScrapper(user_id, user_stats.num_friends).scrap(auth_cookie, proxies)
            user_stats.friends = user_friends
            # De-duplicate (order-preserving) so a repeated friend id cannot
            # produce duplicate index entries in users_df.
            friend_ids = [] if user_friends is None else list(dict.fromkeys(user_friends))
            row = visited_user_to_row(user_stats)

        with df_lock:
            try:
                if user_stats is not None:
                    # Membership is tested under the lock: another thread may
                    # have added the same friend since the scrape started.
                    unseen_users = [friend for friend in friend_ids if friend not in users_df.index]
                    log(thread_id, f"    -> with {len(unseen_users)} unseen ids", is_local)
                    if unseen_users:
                        # DataFrame.append was removed in pandas 2.0.
                        users_df = pd.concat([users_df, user_ids_to_df(unseen_users)])
                users_df.loc[user_id] = row
                if is_local:
                    users_df.to_pickle("crawled_users.pkl")
            finally:
                # Logged even when the update fails, as before.
                log(thread_id, f"    -> {users_df.shape[0]} entries in total", is_local)

In [7]:
# How many workers the unvisited users are divided between.
number_of_threads = 1
# One proxies value per worker; None is handed to the scrapers as-is.
# Alternative when a proxy list is available:
#   [a.tolist() for a in np.array_split(txt_proxies.copy(), number_of_threads)]
proxies_split_lists = [None for _ in range(number_of_threads)]
# Ids of users not yet visited, cut into `number_of_threads` near-equal chunks.
user_ids_split_lists = [
    chunk.to_list()
    for chunk in np.array_split(users_df[~users_df.is_visited].index, number_of_threads)
]

In [None]:
# Start one scraping thread per chunk; the threads are started but not joined here.
for i in range(number_of_threads):
    x = threading.Thread(
        target=user_scrapping_thread,
        args=(i, user_ids_split_lists[i], proxies_split_lists[i]),
        kwargs={"is_local": True},
    )
    x.start()

In [None]:
# Run the scraper in the foreground on the first chunk (no worker thread).
user_scrapping_thread(
    0,
    user_ids_split_lists[0],
    proxies_split_lists[0],
    is_local=True,
)

**17/09:**  
12000 at 13:01  
50400 at 14:34  
84812 at 17:15  
94332 at 18:14  
104519 at 19:41  
128542 at 23:28  
136000 at 00:00  
  
**18/09:**  
restarted at 10:15  
156551 at 12:50 (threading was unsuccessful)  
180000 at 16:53  
201587 at 19:45  
224700 at 23:45