In [1]:
import os
import random
import threading
import time

import numpy as np
import pandas as pd
import requests

from scrapper.user_stats_scrapper import UserStatsScrapper
from scrapper.user_friends_scrapper import UserFriendsScrapper
from scrapper.network.tor_proxy import TorProxy
from scrapper.network.list_proxy import ListProxy
from scrapper.scrapping_status import Status

In [2]:
def log(thread_id, msg):
    """Append ``msg`` as one line to the log file ``logs/<thread_id>.txt``.

    The ``logs`` directory is created if it does not exist yet, so the first
    call on a fresh checkout no longer fails with ``FileNotFoundError``.

    Args:
        thread_id: Value used as the base name of the log file.
        msg: Text to append; a newline is added after it.
    """
    os.makedirs("logs", exist_ok=True)
    with open(f"logs/{thread_id}.txt", "a") as f:
        f.write(msg + "\n")

In [3]:
def saving_thread(df, df_lock, df_save_path, saving_interval=300):
    """Pickle ``df`` to ``df_save_path`` every ``saving_interval`` seconds, forever.

    Each snapshot is first written to a temporary file in the same directory
    and then moved over ``df_save_path`` with ``os.replace``. If the process
    dies in the middle of a write, the previous snapshot is therefore still
    intact instead of being left half-written.

    The loop never returns; it is meant to be the target of a background
    thread. An exception raised while saving propagates and ends the loop.

    Args:
        df: Object exposing ``to_pickle(path)`` and ``shape`` (a DataFrame).
        df_lock: Lock held while the snapshot is written, so writers that use
            the same lock cannot modify ``df`` mid-save.
        df_save_path: Destination file of the snapshot.
        saving_interval: Seconds to sleep before each save.
    """
    # The original file name is kept as the suffix of the temporary name so
    # that pandas infers the same compression for both files.
    save_dir, save_name = os.path.split(df_save_path)
    tmp_path = os.path.join(save_dir, f"tmp-{save_name}")

    i = 1
    while True:
        time.sleep(saving_interval)
        with df_lock:
            df.to_pickle(tmp_path)
            # Swap the finished snapshot in place of the previous one.
            os.replace(tmp_path, df_save_path)
            print("DF saved at {:03d}: {} entries".format(i, df.shape[0]))
        i += 1

In [4]:
def get_tor_proxies(num_threads, starting_port):
    """Create ``num_threads`` ``TorProxy`` objects on consecutive port pairs.

    Worker ``k`` receives ports ``starting_port + 2*k`` and
    ``starting_port + 2*k + 1``, so no two workers share a port.

    Args:
        num_threads: Number of proxies to create.
        starting_port: First port of the first pair.

    Returns:
        List of ``TorProxy`` instances, one per worker, in worker order.
    """
    tor_proxies = []
    for worker in range(num_threads):
        # Presumably the SOCKS and control ports of one Tor instance;
        # confirm against TorProxy's constructor.
        first_port = starting_port + 2 * worker
        tor_proxies.append(TorProxy(first_port, first_port + 1))
    return tor_proxies

def get_txt_proxies(list_path, proxy_type, num_threads):
    """Build one ``ListProxy`` per thread from a text file of proxy addresses.

    The file is read one address per line. Surrounding whitespace is stripped
    and blank lines are skipped, so no worker is handed an empty address.
    The addresses are shuffled (using NumPy's global random state) and split
    into ``num_threads`` groups of nearly equal size.

    Args:
        list_path: Path of the text file listing the proxies.
        proxy_type: Passed unchanged to every ``ListProxy`` (e.g. "socks5").
        num_threads: Number of groups / ``ListProxy`` objects to create.

    Returns:
        List of ``num_threads`` ``ListProxy`` instances. When the file holds
        fewer addresses than ``num_threads``, some receive an empty group.
    """
    with open(list_path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        proxy_ips = [ip for ip in stripped_lines if ip]
    np.random.shuffle(proxy_ips)
    ip_groups = np.array_split(proxy_ips, num_threads)
    return [ListProxy(ips, proxy_type) for ips in ip_groups]

In [5]:
def find_unvisited_users(seen_users, df_path="user_friends.pkl"):
    """Return the user ids found in friend lists that are not in ``seen_users``.

    The pickled table at ``df_path`` is loaded, restricted to rows whose
    ``status`` equals ``Status.VISITED`` and whose ``friends`` entry is
    non-empty, and every id appearing in those ``friends`` entries is
    collected. Ids already present in ``seen_users`` are removed.

    Args:
        seen_users: Iterable of user ids to exclude from the result.
        df_path: Pickle file holding the table with ``status`` and ``friends``
            columns. Defaults to the file the scraper saves to.

    Returns:
        NumPy array of the remaining ids. The order is arbitrary because the
        ids pass through a ``set``.
    """
    # Pickle files can run arbitrary code when loaded: only load files that
    # were produced by this project.
    scraped = pd.read_pickle(df_path)
    visited = scraped[scraped["status"] == Status.VISITED]
    with_friends = visited[visited["friends"].apply(len) > 0]
    discovered = set(with_friends["friends"].explode().unique())
    return np.array(list(discovered - set(seen_users)))

In [6]:
def scrap_user_friends(num_threads, user_ids_thread_limit = 200, auth_cookie=None):
    """Start worker threads that scrape the friend lists of unvisited users.

    The saved table ``user_friends.pkl`` is loaded, the ids returned by
    ``find_unvisited_users`` are capped at
    ``user_ids_thread_limit * num_threads`` and split between ``num_threads``
    workers, each with its own proxy. A further thread running
    ``saving_thread`` periodically writes the table back to disk.

    The function only starts the threads and returns immediately; the threads
    are not joined, and the saving thread loops forever.

    Args:
        num_threads: Number of scraping threads (and proxies) to start.
        user_ids_thread_limit: Maximum number of user ids handed to one thread.
        auth_cookie: Cookie dict passed to ``UserFriendsScrapper.scrap``. When
            omitted, it is built from the ``UNTAPPD_USER_V3_E`` environment
            variable so that the credential is not stored in the notebook.

    Raises:
        RuntimeError: If ``auth_cookie`` is omitted and ``UNTAPPD_USER_V3_E``
            is unset or empty.
    """
    # Resolve the credential first so a misconfiguration fails before any
    # file is read or any thread is started.
    if auth_cookie is None:
        cookie_value = os.environ.get("UNTAPPD_USER_V3_E")
        if not cookie_value:
            raise RuntimeError(
                "Session cookie missing: set the UNTAPPD_USER_V3_E environment "
                "variable or pass auth_cookie explicitly."
            )
        auth_cookie = {"untappd_user_v3_e": cookie_value}

    user_friends_lock = threading.Lock()
    df_save_path = "user_friends.pkl"
    user_friends_df = pd.read_pickle(df_save_path)

    def scrap_user_friends_thread(thread_id, user_ids, proxy):
        """Scrape ``user_ids`` in order through ``proxy`` and store each result."""
        # Random delay of up to 2 s so the workers do not all start at once.
        time.sleep(2 * random.random())

        for i, user_id in enumerate(user_ids):
            log(thread_id, f"Scrapping {i}th user: {user_id}:")
            status, friends = UserFriendsScrapper(user_id).scrap(auth_cookie, proxy)
            new_entry = {"status": status, "friends": friends}

            # The table is shared with the other workers and the saving thread.
            with user_friends_lock:
                user_friends_df.loc[user_id] = new_entry
            log(thread_id, f"    done with status: {status}")

    new_users = find_unvisited_users(user_friends_df.index.unique())
    # Cap the batch so that no thread receives more than the per-thread limit.
    max_users = user_ids_thread_limit * num_threads
    if new_users.size > max_users:
        new_users = new_users[:max_users]

    user_splits = np.array_split(new_users, num_threads)
    proxies = get_txt_proxies("proxies/fineproxy_socks5.txt", "socks5", num_threads)
    # Alternative proxy source:
    #proxies = get_tor_proxies(num_threads, starting_port=10000)

    for i in range(num_threads):
        threading.Thread(target=scrap_user_friends_thread, args=(i, user_splits[i], proxies[i])).start()
    threading.Thread(target=saving_thread, args=(user_friends_df, user_friends_lock, df_save_path)).start()

In [7]:
# Start the crawl with 100 worker threads, each handed at most 5000 user ids.
scrap_user_friends(num_threads=100, user_ids_thread_limit=5000)
# The call above only starts the threads, so this prints while they are still running.
print("lets go")

lets go




DF saved at 001: 457341 entries


**17/09:**  
12000 at 13:01  
50400 at 14:34  
84812 at 17:15  
94332 at 18:14  
104519 at 19:41  
128542 at 23:28  
136000 at 00:00  
  
**18/09:**  
restarted at 10:15  
156551 at 12:50 <- threading was unsuccessful  
180000 at 16:53  
201587 at 19:45  
224700 at 23:45

**19/09** (tor will be tried today):  
restarted at 9:56  
232000 at 11:45, paused until the Tor implementation is ready  
restarted at 13:50 with 50 Tor workers (4 workers crashed in 10 min; will stop the experiment at 10 crashes)  
272000 at 14:00 (YAAAYYY!!!)  
283600 at 14:06  
317500 at 14:20 stopped the experiment  
restarted at 14:30 with 100 workers but a bigger delay to avoid IP bans  
328000 at 14:35  
474000 at 16:15  
600000 at 18:20, stopped the experiment after 11 IP bans and 20 worker crashes  
  
**20/09**:  
restarted at 0:00  
913000 at 10:00 stopped (10% found!!!!)  
restarted at 12:00  
1010687 at 13:10  
  
**24/09**:  
restarted at 12:00 with 1500000  
1600000 at 14:00 stopped  
restarted at 15:15  
1670000 at 18:12 stopped  

**25/09**:  
restarted at 12:20 with fineproxy lists (100 simultaneous requests)  
1710000 at start  
1787000 at 13:45, restarted with 200 simultaneous requests (1300 users / 5 min)  
pause at 16:25, going with 500 simultaneous requests  
1963000 at 21:20 stopped  
restarted at 22:10  
1992000 at 0:00  
