## Imports

In [1]:
from itertools import islice

import pandas as pd
import requests
from github import Github

## Helper functions

In [2]:
def get_repos(access_token, query, keywords, max_count=1000):
    """
    Search GitHub repositories once per keyword and collect the results.

    For every keyword the search string is built as 'keyword' + ' ' + 'query'.

    Parameters:
        access_token (str): GitHub access token
        query (str): query for search without keywords
        keywords (list of strings): keywords to search, one search per keyword
        max_count (int): maximum number of repositories to collect for each
            keyword; zero or a negative value collects none

    Returns:
        pd.DataFrame with columns ['keyword', 'name', 'username', 'html_url', 'language']
            contains len(keywords) * max_count rows or less
    """

    columns = ['keyword', 'name', 'username', 'html_url', 'language']

    # Clamp the limit: comparing a 1-based counter against a non-positive
    # max_count would never match, so the loop would run through every result.
    limit = max(max_count, 0)

    g = Github(access_token)

    # one dict per repository, keyed by the column names above
    repos = []

    for keyword in keywords:
        print(f"\n\tProcessing {keyword}")
        search_results = g.search_repositories(query=keyword + " " + query)

        # islice stops after `limit` items without asking the result
        # iterator for the item that follows the last one kept
        for position, repo in enumerate(islice(search_results, limit), start=1):
            repos.append({'keyword': keyword,
                          'name': repo.name,
                          'language': repo.language,
                          'html_url': repo.html_url,
                          'username': repo.owner.login})

            print(f"{keyword} - {position}")

    print("\n\tDone with getting repos")

    # passing `columns` keeps the column set and order stable even when no
    # repository was collected
    return pd.DataFrame(repos, columns=columns)

In [3]:
def get_unique_usernames(repos_df):
    """ Return the distinct values of the 'username' column as a plain list """

    username_column = repos_df['username']
    distinct_usernames = username_column.unique()

    return distinct_usernames.tolist()

In [4]:
def get_users_table(access_token, logins):
    """
    Look up every login on GitHub and return the profiles as a DataFrame.

    Parameters:
        access_token (str): GitHub access token
        logins (iterable of strings): user logins to look up

    Returns:
        pd.DataFrame with columns ['html_url', 'name', 'company', 'location',
            'email', 'hireable', 'public_repos', 'followers'], one row per
            login in the order given
    """

    columns = ['html_url', 'name', 'company', 'location',
               'email', 'hireable', 'public_repos', 'followers']

    # one client is enough for all lookups; it does not depend on the login
    g = Github(access_token)

    # one dict per user, keyed by the column names above
    users = []

    for login in logins:
        user = g.get_user(login)
        profile_url = user.html_url

        users.append({
            # written as a =HYPERLINK(...) formula string rather than the
            # bare URL (presumably so spreadsheet software shows a link)
            'html_url': f'=HYPERLINK("{profile_url}", "{profile_url}")',
            'name': user.name,
            'company': user.company,
            'location': user.location,
            'email': user.email,
            'hireable': user.hireable,
            'public_repos': user.public_repos,
            'followers': user.followers,
        })

    # passing `columns` keeps the column set and order stable even when
    # `logins` is empty
    return pd.DataFrame(users, columns=columns)

In [5]:
def filter_users_by_location(df, locations):
    """
    Return the rows of `df` whose 'location' matches any of `locations`.

    Parameters:
        df (pd.DataFrame): users table with a 'location' column; it is
            left unmodified
        locations (list of strings): patterns to look for; they are joined
            with '|' and matched as one regular expression, so an empty list
            matches every row that has a location

    Returns:
        pd.DataFrame with the matching rows (original index labels kept);
            rows with a missing location never match
    """

    pattern = '|'.join(locations)

    # na=False counts a missing location as "no match", so nothing has to be
    # dropped from the caller's DataFrame before filtering
    matches = df['location'].str.contains(pattern, na=False)

    return df[matches]

In [6]:
def save_info(repos_df, repos_filename,
              full_users_df, full_users_filename,
              users_df, users_filename):
    """
    Write each dataframe to its excel file, without the index column.

    A dataframe whose filename is falsy (e.g. False or '') is skipped.
    """

    frames_and_targets = (
        (repos_df, repos_filename),
        (full_users_df, full_users_filename),
        (users_df, users_filename),
    )

    for frame, target in frames_and_targets:
        if not target:
            # no filename given for this table: nothing to write
            continue
        frame.to_excel(target, index=False)

# Setting parameters

In [7]:
# Path to a text file that holds the GitHub access token
access_token_path = 'data/access_token.txt'

# Open the token file. The handle `f` is deliberately left open here:
# the next cell reads the token from it and closes it.
try:
    f = open(access_token_path, 'r')
except OSError:
    print(f"Could not open/read file: {access_token_path}.")
    print("Specify the path to file, containing your GitHub access token.")
    print("To know more, visit: https://github.com/settings/tokens")
    # Stop here instead of continuing: without the file there is no `f`
    # (or only a stale one from an earlier run) for the next cell to read.
    raise

In [8]:
# Read the token and close the file handle opened in the previous cell.
with f:
    # Strip surrounding whitespace: text files commonly end with a newline,
    # which would otherwise become part of the token.
    access_token = f.read().strip()

In [9]:
# Search qualifiers shared by every search; each keyword below is put in
# front of this string (separated by a space) to form the full query.
# For the qualifier syntax see https://github.com/search/advanced
query = "language:python"

# Keywords to search for, one search per keyword,
# e.g. ['keyword1', 'keyword2', 'keyword3']
keywords_list = ['ml', 'machine learning']

# Maximum number of repositories to collect for each keyword
max_count = 5

# Locations used to filter users: a user is kept when their location
# contains any of these (they are combined into one regular expression)
locations = ['China', 'Brooklyn', 'Amsterdam']

In [10]:
# Filenames for the Excel files the three dataframes are saved to.
# Set a filename to False to skip saving that dataframe.
repos_filename = 'data/repos.xlsx'
full_users_filename = False
users_filename = 'data/users.xlsx'

# Main program

## Get information about repos

In [11]:
# Run one repository search per keyword and gather the hits in one table.
repos_df = get_repos(
    access_token=access_token,
    query=query,
    keywords=keywords_list,
    max_count=max_count,
)

print(f"Got {len(repos_df)} rows in repos_df DataFrame")
repos_df


	Processing ml
ml - 1
ml - 2
ml - 3
ml - 4
ml - 5

	Processing machine learning
machine learning - 1
machine learning - 2
machine learning - 3
machine learning - 4
machine learning - 5

	Done with getting repos
Got 10 rows in repos_df DataFrame


Unnamed: 0,keyword,name,username,html_url,language
0,ml,mlflow,mlflow,https://github.com/mlflow/mlflow,Python
1,ml,mlcourse.ai,Yorko,https://github.com/Yorko/mlcourse.ai,Python
2,ml,numpy-ml,ddbourgin,https://github.com/ddbourgin/numpy-ml,Python
3,ml,MLAlgorithms,rushter,https://github.com/rushter/MLAlgorithms,Python
4,ml,ML-From-Scratch,eriklindernoren,https://github.com/eriklindernoren/ML-From-Scr...,Python
5,machine learning,awesome-machine-learning,josephmisiti,https://github.com/josephmisiti/awesome-machin...,Python
6,machine learning,MachineLearning,wepe,https://github.com/wepe/MachineLearning,Python
7,machine learning,Machine-Learning,Jack-Cherish,https://github.com/Jack-Cherish/Machine-Learning,Python
8,machine learning,MachineLearning_Python,lawlite19,https://github.com/lawlite19/MachineLearning_P...,Python
9,machine learning,machine_learning_examples,lazyprogrammer,https://github.com/lazyprogrammer/machine_lear...,Python


In [12]:
# Owners of the repositories found above, each listed once.
usernames = get_unique_usernames(repos_df=repos_df)

print(f"Got {len(usernames)} unique users")
usernames

Got 10 unique users


['mlflow',
 'Yorko',
 'ddbourgin',
 'rushter',
 'eriklindernoren',
 'josephmisiti',
 'wepe',
 'Jack-Cherish',
 'lawlite19',
 'lazyprogrammer']

## Get information about users

In [13]:
# Fetch the profile of every repository owner.
full_users_df = get_users_table(access_token, usernames)

print(f"Got {len(full_users_df)} rows in full_users_df DataFrame")
full_users_df

Got 10 rows in full_users_df DataFrame


Unnamed: 0,html_url,name,company,location,email,hireable,public_repos,followers
0,"=HYPERLINK(""https://github.com/mlflow"", ""https...",MLflow,,,,,4,0
1,"=HYPERLINK(""https://github.com/Yorko"", ""https:...",Yury Kashnitsky,RELX,Amsterdam,,,13,1544
2,"=HYPERLINK(""https://github.com/ddbourgin"", ""ht...",David Bourgin,,"Brooklyn, NY",,True,17,570
3,"=HYPERLINK(""https://github.com/rushter"", ""http...",Artem Golubin,,Russia,gh@rushter.com,True,16,968
4,"=HYPERLINK(""https://github.com/eriklindernoren...",Erik Linder-Norén,,"Stockholm, Sweden",eriklindernoren@gmail.com,,24,4259
5,"=HYPERLINK(""https://github.com/josephmisiti"", ...",Joseph Misiti,Math & Pencil,"Brooklyn, NY",,True,233,2956
6,"=HYPERLINK(""https://github.com/wepe"", ""https:/...",wepon,Ant Group,China Hangzhou,masterwepon@163.com,,28,4709
7,"=HYPERLINK(""https://github.com/Jack-Cherish"", ...",Jack Cui,Northeastern University,China,c411184003@gmail.com,,16,6222
8,"=HYPERLINK(""https://github.com/lawlite19"", ""ht...",lawlite,Southeast University,"Nanjing, China",lawlitewang@gmail.com,True,23,704
9,"=HYPERLINK(""https://github.com/lazyprogrammer""...",LazyProgrammer.me,,,,,20,3514


In [14]:
# Keep only the users whose location matches one of the configured locations.
users_df = filter_users_by_location(df=full_users_df, locations=locations)

print(f"Got {len(users_df)} rows in users_df DataFrame")
users_df

Got 6 rows in users_df DataFrame


Unnamed: 0,html_url,name,company,location,email,hireable,public_repos,followers
1,"=HYPERLINK(""https://github.com/Yorko"", ""https:...",Yury Kashnitsky,RELX,Amsterdam,,,13,1544
2,"=HYPERLINK(""https://github.com/ddbourgin"", ""ht...",David Bourgin,,"Brooklyn, NY",,True,17,570
5,"=HYPERLINK(""https://github.com/josephmisiti"", ...",Joseph Misiti,Math & Pencil,"Brooklyn, NY",,True,233,2956
6,"=HYPERLINK(""https://github.com/wepe"", ""https:/...",wepon,Ant Group,China Hangzhou,masterwepon@163.com,,28,4709
7,"=HYPERLINK(""https://github.com/Jack-Cherish"", ...",Jack Cui,Northeastern University,China,c411184003@gmail.com,,16,6222
8,"=HYPERLINK(""https://github.com/lawlite19"", ""ht...",lawlite,Southeast University,"Nanjing, China",lawlitewang@gmail.com,True,23,704


In [15]:
# Write the tables to Excel; a table whose filename is False is skipped.
save_info(
    repos_df=repos_df,
    repos_filename=repos_filename,
    full_users_df=full_users_df,
    full_users_filename=full_users_filename,
    users_df=users_df,
    users_filename=users_filename,
)