In [1]:
import os
import time
import requests
import pandas as pd
from dotenv import load_dotenv
from multiprocessing import Pool

In [2]:
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
HEADERS = {
    "User-Agent": "TDS Project 1",
    "Authorization": f"token {GITHUB_TOKEN}"
}
BASE_URL = "https://api.github.com"

## Scraping all users in the in the city of `Beijing` with over `500` followers

#### `users.csv` has following information about each user in Beijing with over 500 followers, with fields:

- `login`: Their Github user ID
- `name`: Their full name
- `company`: The company they work at. Clean up company names. At least make sure:
- - They're trimmed of whitespace
- - Leading `@` symbols are stripped
- - They are converted to UPPERCASE
- `location`: The city they are in
- `email`: Their email address
- `hireable`: Whether they are open to being hired
- `bio`: A short bio about them
- `public_repos`: The number of public repositories they have
- `followers`: The number of followers they have
- `following`: The number of people they are following
- `created_at`: When they joined Github


In [54]:
def get_user_details(user, **request_params):
    url = f"{BASE_URL}/users/{user['login']}"
    response = requests.get(url, **request_params)
    if response.status_code != 200:
        print(f"Error fetching user details: {response.status_code}")
        return None

    data = response.json()
    return data

In [60]:
def get_users_in_city(city, min_followers=500,  **request_params):
    page = 1
    users = pd.DataFrame([
        "login", "name", "company", "location", "email", "hireable", "bio", "public_repos", "followers", "following", "created_at"
    ])
    users = None
    while True:

        url = f"{BASE_URL}/search/users?q=location:{city}+followers:>{min_followers}&per_page=100&page={page}"
        response = requests.get(url, **request_params)
        if response.status_code != 200:
            print(f"Error fetching users: {response.status_code}")
            break

        data = response.json()
        items = data.get("items", [])

        if users is None:
            total_pages = int(response.links["last"]["url"].split(
                "=")[-1]) if "last" in response.links else 1

        for user in items:
            user_details = get_user_details(user, **request_params)
            if user_details is None:
                continue
            if users is None:
                users = pd.DataFrame(columns=user_details.keys())

            users.loc[len(users)] = user_details.values()

        print(f"Fetched page {page}/{total_pages} - {len(items)} users")

        if "next" not in response.links:
            break

        page += 1
        time.sleep(2)

    return users

In [61]:
user_details = get_users_in_city("Beijing", headers=HEADERS)

Fetched page 1/4 - 100 users
Fetched page 2/4 - 100 users
Fetched page 3/4 - 100 users
Fetched page 4/4 - 60 users


In [6]:
user_details = pd.read_csv("./beijing_users.csv")

In [7]:
user_details.shape

(360, 33)

In [8]:
user_details.head()

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,michaelliao,470058,MDQ6VXNlcjQ3MDA1OA==,https://avatars.githubusercontent.com/u/470058...,,https://api.github.com/users/michaelliao,https://github.com/michaelliao,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/follo...,https://api.github.com/users/michaelliao/gists...,...,askxuefeng@gmail.com,,Crypto developer.,liaoxuefeng,99,0,37235,3,2010-11-06T12:21:35Z,2024-10-11T08:36:39Z
1,daimajia,2503423,MDQ6VXNlcjI1MDM0MjM=,https://avatars.githubusercontent.com/u/250342...,,https://api.github.com/users/daimajia,https://github.com/daimajia,https://api.github.com/users/daimajia/followers,https://api.github.com/users/daimajia/followin...,https://api.github.com/users/daimajia/gists{/g...,...,daimajia@gmail.com,,Zhenfund VP of Investment.,daimajia,89,9,24632,271,2012-10-07T02:40:06Z,2024-10-02T20:13:18Z
2,xiaolai,152970,MDQ6VXNlcjE1Mjk3MA==,https://avatars.githubusercontent.com/u/152970...,,https://api.github.com/users/xiaolai,https://github.com/xiaolai,https://api.github.com/users/xiaolai/followers,https://api.github.com/users/xiaolai/following...,https://api.github.com/users/xiaolai/gists{/gi...,...,lixiaolai@gmail.com,,A lifelong student.,xiaolai,54,49,19238,37,2009-11-13T18:29:42Z,2024-10-04T02:13:28Z
3,draveness,6493255,MDQ6VXNlcjY0OTMyNTU=,https://avatars.githubusercontent.com/u/649325...,,https://api.github.com/users/draveness,https://github.com/draveness,https://api.github.com/users/draveness/followers,https://api.github.com/users/draveness/followi...,https://api.github.com/users/draveness/gists{/...,...,i@draven.co,True,HFT / C++ / Go,draven907,50,27,13009,28,2014-01-24T16:22:01Z,2024-10-11T06:43:11Z
4,hongyangAndroid,10704521,MDQ6VXNlcjEwNzA0NTIx,https://avatars.githubusercontent.com/u/107045...,,https://api.github.com/users/hongyangAndroid,https://github.com/hongyangAndroid,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/f...,https://api.github.com/users/hongyangAndroid/g...,...,623565791@qq.com,True,学习ing,,102,9,12985,35,2015-01-26T07:05:45Z,2024-07-03T06:14:21Z


In [9]:
user_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   login                360 non-null    object 
 1   id                   360 non-null    int64  
 2   node_id              360 non-null    object 
 3   avatar_url           360 non-null    object 
 4   gravatar_id          0 non-null      float64
 5   url                  360 non-null    object 
 6   html_url             360 non-null    object 
 7   followers_url        360 non-null    object 
 8   following_url        360 non-null    object 
 9   gists_url            360 non-null    object 
 10  starred_url          360 non-null    object 
 11  subscriptions_url    360 non-null    object 
 12  organizations_url    360 non-null    object 
 13  repos_url            360 non-null    object 
 14  events_url           360 non-null    object 
 15  received_events_url  360 non-null    obj

In [10]:
# user_details.to_csv("beijing_users.csv", index=False)

In [11]:
user_details["company"].sample(20)

217    Beijing Institute of Technology
175                  CBSR, NLPR, CASIA
42                            @Tencent
346                                NaN
80                                 NaN
69                                 NaN
58                                 美团网
78                                 NaN
195                     Game Framework
160                          @juzibot 
94                                 NaN
211                  Peking University
206                           PerfXLab
163                                NaN
105                                NaN
152                                NaN
307                            zhufeng
146                               青萌数海
277                                NaN
111                                NaN
Name: company, dtype: object

In [12]:
def fix_company_name(company):
    if company == "null":
        return company
    return company.strip().lstrip("@").upper()

In [13]:
cols = ["login", "name", "company", "location", "email",
        "hireable", "bio", "followers", "following", "created_at"]

In [14]:
user_df = user_details[cols].copy(deep=True)
user_df.replace({None: "null", True: "true", "False": "false"}, inplace=True)
user_df["company"] = user_df["company"].apply(fix_company_name)
user_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,followers,following,created_at
0,michaelliao,Crypto Michael,,"Beijing, China",askxuefeng@gmail.com,,Crypto developer.,37235,3,2010-11-06T12:21:35Z
1,daimajia,代码家,ZHENFUND BEIJING,"Beijing, China",daimajia@gmail.com,,Zhenfund VP of Investment.,24632,271,2012-10-07T02:40:06Z
2,xiaolai,xiaolai,INBLOCKCHAIN,beijing,lixiaolai@gmail.com,,A lifelong student.,19238,37,2009-11-13T18:29:42Z
3,draveness,Draven,SPECTRA-FUND,"Beijing, China",i@draven.co,True,HFT / C++ / Go,13009,28,2014-01-24T16:22:01Z
4,hongyangAndroid,张鸿洋,WANANDROID.COM,"Beijing,China",623565791@qq.com,True,学习ing,12985,35,2015-01-26T07:05:45Z


In [15]:
user_df.shape

(360, 10)

In [21]:
user_df.to_csv("users.csv", index=False)

#### `repositories.csv` has these users' public repositories. For each user in users.csv, fetch up to the 500 most recently pushed repositories, with fields:

- `login`: The Github user ID (login) of the owner, which, BTW, is not directly in the API response.
- `full_name`: Full name of the repository
- `created_at`: When the repository was created
- `stargazers_count`: Number of stars the repository has
- `watchers_count`: Number of watchers the repository has
- `language`: The programming language the repository is written in
- `has_projects`: Whether the repository has projects enabled
- `has_wiki`: Whether the repository has a wiki
- `license_name`: Name of the license the repository is under (This is under license.key)

In [3]:
def get_user_repos(user, **request_params):
    repos = None
    page = 1
    last_page = 5
    while page <= last_page:
        url = f"{BASE_URL}/users/{user}/repos?per_page={100}&page={page}&sort=pushed"
        response = requests.get(url, **request_params)

        if response.status_code != 200:
            print(f"Error fetching user repos: {response.status_code}")
            return repos

        data = response.json()

        if 'last' in response.links:
            last_page = int(response.links['last']
                            ['url'].split('=')[-3].split('&')[0])

        if last_page > 5:
            last_page = 5

        if repos is None:
            repos = pd.DataFrame(columns=data[0].keys())

        for repo in data:
            repos.loc[len(repos)] = repo.values()

        page += 1
        time.sleep(2)

    return repos

In [4]:
def fetch_repos(users, request_params):
    repos = None
    print(
        f"Fetching repos for {len(users)} users - [{users.index[0]} - {users.index[-1]}]")
    for user in users["login"]:
        user_repos = get_user_repos(user, **request_params)
        if user_repos is None:
            continue
        if repos is None:
            repos = pd.DataFrame(columns=user_repos.columns)

        repos = pd.concat([repos, user_repos], ignore_index=True)

    print(
        f"Finished fetching Repos for {len(users)} users - [{users.index[0]} - {users.index[-1]}]")

    return repos

In [None]:
repos = fetch_repos(user_df.iloc[:10], HEADERS)
repos.shape

In [143]:
repos.to_csv("repos.csv", index=False)

Having `360` users will take a lotttt of time to fetch their 500 repos, so let's do it in multiprocessing.

In [17]:
def fetch_repos_multi(users, processes=10, **request_params):
    user_batches = [users.iloc[i::processes] for i in range(processes)]
    with Pool(processes=processes) as pool:
        results = pool.starmap(
            fetch_repos, [(batch, request_params) for batch in user_batches])

    repos = pd.concat(results, ignore_index=True)
    return repos

In [18]:
user_df.shape

(360, 10)

In [20]:
repos = fetch_repos_multi(user_df, processes=10, headers=HEADERS)

Fetching repos for 36 users - [0 - 350]Fetching repos for 36 users - [1 - 351]Fetching repos for 36 users - [2 - 352]Fetching repos for 36 users - [3 - 353]
Fetching repos for 36 users - [4 - 354]

Fetching repos for 36 users - [6 - 356]
Fetching repos for 36 users - [7 - 357]Fetching repos for 36 users - [8 - 358]Fetching repos for 36 users - [5 - 355]




Fetching repos for 36 users - [9 - 359]
Finished fetching Repos for 36 users - [4 - 354]
Finished fetching Repos for 36 users - [9 - 359]
Finished fetching Repos for 36 users - [2 - 352]
Finished fetching Repos for 36 users - [8 - 358]
Finished fetching Repos for 36 users - [3 - 353]
Finished fetching Repos for 36 users - [6 - 356]
Finished fetching Repos for 36 users - [1 - 351]
Finished fetching Repos for 36 users - [5 - 355]
Finished fetching Repos for 36 users - [7 - 357]
Finished fetching Repos for 36 users - [0 - 350]


Let's verify the total number of repos

In [21]:
repos.shape

(29561, 80)

In [22]:
user_details["public_repos"].apply(lambda x: 500 if x > 500 else x).sum()

np.int64(29556)

So, we have some ambiguity, we are having $5$ extra repos, let's fix that.

In [23]:
repos.head()

Unnamed: 0,id,node_id,name,full_name,private,owner,html_url,description,fork,url,...,allow_forking,is_template,web_commit_signoff_required,topics,visibility,forks,open_issues,watchers,default_branch,permissions
0,713362853,R_kgDOKoUNpQ,liaoxuefeng.com,michaelliao/liaoxuefeng.com,False,"{'login': 'michaelliao', 'id': 470058, 'node_i...",https://github.com/michaelliao/liaoxuefeng.com,Source code of liaoxuefeng.com,False,https://api.github.com/repos/michaelliao/liaox...,...,True,False,False,[],public,16,0,35,main,"{'admin': False, 'maintain': False, 'push': Fa..."
1,54755095,MDEwOlJlcG9zaXRvcnk1NDc1NTA5NQ==,warpdb,michaelliao/warpdb,False,"{'login': 'michaelliao', 'id': 470058, 'node_i...",https://github.com/michaelliao/warpdb,DSL-driven RDBMS interface for Java.,False,https://api.github.com/repos/michaelliao/warpdb,...,True,False,False,"[java, jdbc, mysql, orm, spring6, warp]",public,15,1,89,master,"{'admin': False, 'maintain': False, 'push': Fa..."
2,568280927,R_kgDOId9HXw,gitignore-online-generator,michaelliao/gitignore-online-generator,False,"{'login': 'michaelliao', 'id': 470058, 'node_i...",https://github.com/michaelliao/gitignore-onlin...,A useful gitignore online generator.,True,https://api.github.com/repos/michaelliao/gitig...,...,True,False,False,"[gitignore, gitignore-generator, online]",public,6,0,54,main,"{'admin': False, 'maintain': False, 'push': Fa..."
3,133958676,MDEwOlJlcG9zaXRvcnkxMzM5NTg2NzY=,learngit,michaelliao/learngit,False,"{'login': 'michaelliao', 'id': 470058, 'node_i...",https://github.com/michaelliao/learngit,教程→ https://liaoxuefeng.com/books/git/ 推送请使用UT...,False,https://api.github.com/repos/michaelliao/learngit,...,True,False,False,[git],public,3166,0,435,master,"{'admin': False, 'maintain': False, 'push': Fa..."
4,714544405,R_kgDOKpcVFQ,gitsite-cli,michaelliao/gitsite-cli,False,"{'login': 'michaelliao', 'id': 470058, 'node_i...",https://github.com/michaelliao/gitsite-cli,GitSite Command Line Application,False,https://api.github.com/repos/michaelliao/gitsi...,...,True,False,False,[],public,0,0,4,main,"{'admin': False, 'maintain': False, 'push': Fa..."


In [24]:
repos.columns

Index(['id', 'node_id', 'name', 'full_name', 'private', 'owner', 'html_url',
       'description', 'fork', 'url', 'forks_url', 'keys_url',
       'collaborators_url', 'teams_url', 'hooks_url', 'issue_events_url',
       'events_url', 'assignees_url', 'branches_url', 'tags_url', 'blobs_url',
       'git_tags_url', 'git_refs_url', 'trees_url', 'statuses_url',
       'languages_url', 'stargazers_url', 'contributors_url',
       'subscribers_url', 'subscription_url', 'commits_url', 'git_commits_url',
       'comments_url', 'issue_comment_url', 'contents_url', 'compare_url',
       'merges_url', 'archive_url', 'downloads_url', 'issues_url', 'pulls_url',
       'milestones_url', 'notifications_url', 'labels_url', 'releases_url',
       'deployments_url', 'created_at', 'updated_at', 'pushed_at', 'git_url',
       'ssh_url', 'clone_url', 'svn_url', 'homepage', 'size',
       'stargazers_count', 'watchers_count', 'language', 'has_issues',
       'has_projects', 'has_downloads', 'has_wiki', 'has

In [25]:
repos.to_parquet("./repos.parquet", index=False)