In [34]:
import os
import time
import requests
import pandas as pd
from dotenv import load_dotenv

In [33]:
# Notebook-wide configuration for talking to the GitHub REST API.
load_dotenv()  # load variables from a local .env file (if present) into the environment
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Headers sent with every request. The Authorization header is added only when
# a token is actually configured: formatting a missing token into the f-string
# would send the literal value "token None" and make every call fail
# authentication without any hint about the real cause.
HEADERS = {"User-Agent": "TDS Project 1"}
if GITHUB_TOKEN:
    HEADERS["Authorization"] = f"token {GITHUB_TOKEN}"
else:
    print("Warning: GITHUB_TOKEN is not set - requests will be sent unauthenticated")

BASE_URL = "https://api.github.com"

#### Scraping all users in the city of `Beijing` with over `500` followers

In [54]:
def get_user_details(user, **request_params):
    """Fetch the full public profile of a single GitHub user.

    Args:
        user: Mapping with at least a ``"login"`` key (for example one entry
            of the ``items`` list returned by the user-search endpoint).
        **request_params: Extra keyword arguments forwarded to
            ``requests.get`` (``headers``, ``timeout``, ...). If no
            ``timeout`` is supplied, 30 seconds is used so that a stalled
            connection cannot block the notebook forever.

    Returns:
        The decoded JSON body of ``GET /users/{login}`` as a dict, or ``None``
        when the request fails, times out, or answers with a non-200 status.
    """
    # **request_params is a fresh dict for this call, so setting a default
    # here does not leak into the caller's arguments.
    request_params.setdefault("timeout", 30)
    url = f"{BASE_URL}/users/{user['login']}"

    try:
        response = requests.get(url, **request_params)
    except requests.RequestException as exc:
        # Report network-level failures the same way as HTTP errors (None)
        # so one bad call does not abort a scrape of hundreds of users.
        print(f"Error fetching user details: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching user details: {response.status_code}")
        return None

    return response.json()

In [60]:
def get_users_in_city(city, min_followers=500, **request_params):
    """Collect the full profiles of GitHub users located in ``city``.

    Pages through the user-search endpoint (100 hits per page) and fetches the
    complete profile of every hit with ``get_user_details``. A progress line
    is printed after each page and the loop pauses two seconds between pages.

    Args:
        city: Value placed after the ``location:`` search qualifier.
        min_followers: Lower bound used in the ``followers:>N`` qualifier, so
            only users with strictly more followers are matched.
        **request_params: Extra keyword arguments forwarded to every
            ``requests.get`` call (``headers``, ``timeout``, ...). If no
            ``timeout`` is supplied, 30 seconds is used.

    Returns:
        A ``pandas.DataFrame`` with one row per successfully fetched user and
        one column per profile field. When a search page cannot be fetched the
        users gathered so far are returned; when nothing was gathered the
        frame is empty rather than ``None``, so ``.shape`` / ``.to_csv`` on
        the result keep working.
    """
    request_params.setdefault("timeout", 30)

    records = []        # one profile dict per user; turned into a frame once at the end
    total_pages = None  # filled in from the first successful search response
    page = 1

    while True:
        url = f"{BASE_URL}/search/users?q=location:{city}+followers:>{min_followers}&per_page=100&page={page}"
        try:
            response = requests.get(url, **request_params)
        except requests.RequestException as exc:
            print(f"Error fetching users: {exc}")
            break
        if response.status_code != 200:
            print(f"Error fetching users: {response.status_code}")
            break

        items = response.json().get("items", [])

        if total_pages is None:
            # Read the page count once, from the first response's "last"
            # pagination link; the link is absent when there is only one page.
            if "last" in response.links:
                total_pages = int(response.links["last"]["url"].split("=")[-1])
            else:
                total_pages = 1

        for user in items:
            details = get_user_details(user, **request_params)
            if details is not None:
                records.append(details)

        print(f"Fetched page {page}/{total_pages} - {len(items)} users")

        if "next" not in response.links:
            break

        page += 1
        time.sleep(2)  # brief pause between search pages

    # Building the frame from dicts aligns values by key, so a profile with
    # missing or reordered fields cannot shift values into the wrong columns.
    return pd.DataFrame(records)

In [61]:
# Scrape all Beijing users above the default follower threshold
# (min_followers=500), sending HEADERS with every request.
user_details = get_users_in_city("Beijing", headers=HEADERS)

Fetched page 1/4 - 100 users
Fetched page 2/4 - 100 users
Fetched page 3/4 - 100 users
Fetched page 4/4 - 60 users


In [62]:
# (rows, columns) of the scraped table: one row per user, one column per profile field.
user_details.shape

(360, 33)

In [63]:
# Preview the first rows of the scraped table.
# NOTE(review): the rendered output of this cell shows users' real e-mail
# addresses; consider clearing the output before sharing or committing.
user_details.head()

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,michaelliao,470058,MDQ6VXNlcjQ3MDA1OA==,https://avatars.githubusercontent.com/u/470058...,,https://api.github.com/users/michaelliao,https://github.com/michaelliao,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/gists...,...,askxuefeng@gmail.com,,Crypto developer.,liaoxuefeng,99,0,37235,3,2010-11-06T12:21:35Z,2024-10-11T08:36:39Z
1,daimajia,2503423,MDQ6VXNlcjI1MDM0MjM=,https://avatars.githubusercontent.com/u/250342...,,https://api.github.com/users/daimajia,https://github.com/daimajia,https://api.github.com/users/daimajia/followers,https://api.github.com/users/daimajia/followin...,https://api.github.com/users/daimajia/gists{/g...,...,daimajia@gmail.com,,Zhenfund VP of Investment.,daimajia,89,9,24632,271,2012-10-07T02:40:06Z,2024-10-02T20:13:18Z
2,xiaolai,152970,MDQ6VXNlcjE1Mjk3MA==,https://avatars.githubusercontent.com/u/152970...,,https://api.github.com/users/xiaolai,https://github.com/xiaolai,https://api.github.com/users/xiaolai/followers,https://api.github.com/users/xiaolai/following...,https://api.github.com/users/xiaolai/gists{/gi...,...,lixiaolai@gmail.com,,A lifelong student.,xiaolai,54,49,19238,37,2009-11-13T18:29:42Z,2024-10-04T02:13:28Z
3,draveness,6493255,MDQ6VXNlcjY0OTMyNTU=,https://avatars.githubusercontent.com/u/649325...,,https://api.github.com/users/draveness,https://github.com/draveness,https://api.github.com/users/draveness/followers,https://api.github.com/users/draveness/followi...,https://api.github.com/users/draveness/gists{/...,...,i@draven.co,True,HFT / C++ / Go,draven907,50,27,13009,28,2014-01-24T16:22:01Z,2024-10-11T06:43:11Z
4,hongyangAndroid,10704521,MDQ6VXNlcjEwNzA0NTIx,https://avatars.githubusercontent.com/u/107045...,,https://api.github.com/users/hongyangAndroid,https://github.com/hongyangAndroid,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/g...,...,623565791@qq.com,True,学习ing,,102,9,12985,35,2015-01-26T07:05:45Z,2024-07-03T06:14:21Z


In [65]:
# Per-column non-null counts and dtypes of the scraped table.
user_details.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360 entries, 0 to 359
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   login                360 non-null    object
 1   id                   360 non-null    int64 
 2   node_id              360 non-null    object
 3   avatar_url           360 non-null    object
 4   gravatar_id          360 non-null    object
 5   url                  360 non-null    object
 6   html_url             360 non-null    object
 7   followers_url        360 non-null    object
 8   following_url        360 non-null    object
 9   gists_url            360 non-null    object
 10  starred_url          360 non-null    object
 11  subscriptions_url    360 non-null    object
 12  organizations_url    360 non-null    object
 13  repos_url            360 non-null    object
 14  events_url           360 non-null    object
 15  received_events_url  360 non-null    object
 16  type         

In [66]:
# Save the scraped table as beijing_users.csv (relative path), leaving out the DataFrame index.
user_details.to_csv("beijing_users.csv", index=False)