## Imports

In [1]:
import requests
from github import Github

import pandas as pd

## GitHubUsersParser() class

In [26]:
class GitHubUsersParser():
    """
    Collect GitHub users that match a search query and own at least one
    repository whose name or description contains one of the given keywords,
    then export them to an Excel file.

    Typical usage: connect() first, then parse_users().

    Attributes (created lazily, not in __init__):
        g: Github client, set by connect() once the test query succeeds.
        users (list of dict): collected rows, reset by every parse_users() call.
    """

    # Column order of the exported table; these are also the keys of each row
    # dict built in add_to_users().
    USER_COLUMNS = ['repo_html_url', 'repo_language',
                    'user_html_url', 'name', 'company', 'location',
                    'email', 'hireable', 'public_repos', 'followers']

    def __init__(self):
        pass

    def connect(self, token_or_path, is_path=True):
        """ 
        Create Github() object to request information 

        The client is stored in 'self.g' only after a test query succeeds, so
        a failed connect() never leaves an unusable client behind.

        Parameters:
            token_or_path (str): access token or path to .txt file where it is
                specified. To know more, visit: https://github.com/settings/tokens
            is_path (bool): if true, 'token_or_path' must be path to txt file
                containing access token. If false 'token_or_path' is access token

        Returns:
            None in every case; success or failure is reported via print().
        """

        if not is_path:
            access_token = token_or_path
        else:
            try:
                # 'with' closes the file handle even if read() fails
                with open(token_or_path, 'r') as token_file:
                    # surrounding whitespace (e.g. the trailing newline most
                    # editors add) is not part of the token
                    access_token = token_file.read().strip()
            except OSError:
                print(f"Could not open/read file: {token_or_path}.")
                return None

        if not access_token:
            print("Access token is empty.")
            return None

        print("Your access token was successfully read")
        client = Github(access_token)

        # test query
        try:
            client.get_repo("PyGithub/PyGithub")
        except Exception as e:
            print("An error occurred while executing the test query.\n" +
                  "Most likely your token is incorrect or expired.")
            print("Error: " + str(e))
            return None

        self.g = client
        print("The test request was successfully executed")

    def add_to_users(self, user, repo):
        """
        Append information about 'user' and 'repo' into 'users' list

        Parameters:
            user: object exposing html_url, name, company, location, email,
                hireable, public_repos and followers attributes
            repo: object exposing html_url and language attributes
        """

        if not hasattr(self, 'users'):
            print("Calling add_to_users() function " +
                  "before defining 'users' variable\n" +
                  "Call parse_users() function instead " +
                  "or define 'users' variable explicitly")
            return None

        data = {'repo_html_url': repo.html_url,
                'repo_language': repo.language,
                'user_html_url': user.html_url,
                'name': user.name,
                'company': user.company,
                'location': user.location,
                'email': user.email,
                'hireable': user.hireable,
                'public_repos': user.public_repos,
                'followers': user.followers}

        self.users.append(data)

    def save_users_to_xlsx(self, filename):
        """ 
        Save information about 'users' into Excel file

        Parameters:
            filename (str): filename to save including extension (.xlsx)
        """

        if not hasattr(self, 'users'):
            print("Calling save_users_to_xlsx() function " +
                  "before defining 'users' variable\n" +
                  "Call parse_users() function instead " +
                  "or define 'users' variable explicitly")
            return None

        # define DataFrame from list of dict
        users_df = pd.DataFrame(self.users, columns=self.USER_COLUMNS)

        # filling missing values with space to sort Excel table
        users_df = users_df.fillna(' ')

        # save file
        users_df.to_excel(filename, index=False)

        print(f"Data about users was saved into '{filename}' " +
              f"file ({users_df.shape[0]} rows).")

    def parse_users(self, query, keywords, max_count, filename):
        """ 
        Parse GitHub users with set parameters. 
        Save information about users into Excel table.

        A user is added (once) as soon as one of their repositories has a name
        or description containing any of 'keywords'. Matching is a
        case-sensitive substring test. Whatever has been collected is always
        saved: when 'max_count' is reached, when the search results run out,
        and when a request fails part-way through.

        Parameters:
            query (str): search query passed to search_users(),
                see https://github.com/search/advanced
            keywords (list of str): substrings to look for in the repository
                name and description
            max_count (int): maximum number of users to collect; with a
                non-positive value nothing is requested and an empty table
                is saved
            filename (str): filename to save including extension (.xlsx)
        """

        if not hasattr(self, 'g'):
            print("Calling parse_users() function " +
                  "before defining 'g' variable\n" +
                  "Call connect() function first.")
            return None

        print("Start parsing with the following parameters:\n" +
              f"query = '{query}'\n" +
              f"keywords = {keywords}\n" +
              f"max_count = {max_count}\n" +
              f"filename = '{filename}'\n")

        # list of dict rows, keyed by USER_COLUMNS
        self.users = []
        count = 0

        # The whole iteration is guarded, not only the per-user repo loop:
        # advancing the search results can itself issue requests that fail,
        # and the rows gathered so far must still be written out.
        try:
            search_results = (self.g.search_users(query=query)
                              if max_count > 0 else [])

            # for all users with query
            for user in search_results:
                # at least one of repos contain keyword
                for repo in user.get_repos():
                    # form repo_string as repo name and description
                    description_str = repo.description if repo.description else ''
                    repo_string = repo.name + ' ' + description_str

                    # if any keyword is in any users repository
                    if any(keyword in repo_string for keyword in keywords):
                        # add this user in result table
                        self.add_to_users(user, repo)
                        count += 1
                        print(f"{count}/{max_count} - add {user.name}")
                        break

                # checked at the end of the iteration so that no further
                # search result is requested once the limit is reached
                if count >= max_count:
                    break

        except Exception as e:
            print("An error occurred while executing the query")
            print("Error: " + str(e))

        self.save_users_to_xlsx(filename)

## Setting parameters

In [27]:
# Path to a text file that contains the GitHub access token.
# Keep that file out of version control.
access_token_path = 'data/access_token.txt'
is_path = True
# Alternatively pass the token itself instead of a path. Never commit a real
# token to the notebook: paste it only locally or read it from an
# environment variable.
# access_token_path = "<your-access-token>"
# is_path = False

In [28]:
# GitHub user-search qualifiers only; the keywords are configured separately
# below (qualifier syntax: https://github.com/search/advanced)
query = "language:python location:Moscow"

# number of users to collect before stopping
max_count = 5

# keywords to search for, as a python list of strings:
# ['keyword1', 'keyword2', 'keyword3']
keywords_list = [
    'backend', 'golang', 'django', 'flask',
    'fastapi', 'tornado', 'aiohttp', 'asyncio',
]

# output file for the resulting dataframe (extension included)
filename_to_save = 'data/users.xlsx'

## Main

In [29]:
# create the parser instance; its constructor does nothing, so no GitHub
# connection exists until connect() is called
github_parser = GitHubUsersParser()

In [30]:
# authenticate the parser with the access token configured above
github_parser.connect(token_or_path=access_token_path, is_path=is_path)

Your access token was successfully read
The test request was successfully executed


In [31]:
# run the search with the parameters set above; the collected users are
# written to filename_to_save
github_parser.parse_users(
    query=query,
    keywords=keywords_list,
    max_count=max_count,
    filename=filename_to_save,
)

Start parsing with the following parameters:
query = 'language:python location:Moscow'
keywords = ['backend', 'golang', 'django', 'flask', 'fastapi', 'tornado', 'aiohttp', 'asyncio']
max_count = 5
filename = 'data/users.xlsx'

1/5 - add Grigory Bakunov
2/5 - add Daniil Okhlopkov
3/5 - add Anton
4/5 - add Kirill Klenov
5/5 - add Alexey Goloburdin
Data about users was saved into 'data/users.xlsx' file (5 rows).
