This script is used to initially merge the user profiles collected by the different strategies. It can also be used to:
* update the users after the results have been updated. For example, if a collection strategy now has more results, this script identifies the difference and adds the new users to the Excel file.
* update the data of all users. Set **UPDATE_EVERYTHING** to True for this.

In [71]:
# Update all GitHub user information? Set to True if yes.
# When False, an update run only queries the GitHub API for users that are not yet
# listed in the annotated Excel file.
UPDATE_EVERYTHING = False

In [72]:
import glob
import time

import pandas as pd

## Merge the result files

In [73]:
# Collect the CSV result file(s) of every collection strategy. glob returns paths in
# arbitrary order, so sort them to make the merged table reproducible between runs.
data_files = sorted(glob.glob("*/results/*.csv"))
if not data_files:
    # pd.concat would otherwise fail with the much less helpful "No objects to concatenate"
    raise FileNotFoundError("No result files found matching '*/results/*.csv'.")

# Stack all files on top of each other; the path of the originating file is kept
# in the "source" column and the per-file row numbers are discarded.
df_github_names_long = (
    pd.concat(
        [pd.read_csv(fp) for fp in data_files],
        axis=0,
        keys=data_files,
        names=["source", "row"],
    )
    .reset_index("source")
    .reset_index(drop=True)
)
# lowercase to remove duplicates correctly
df_github_names_long["github_user_id"] = df_github_names_long["github_user_id"].str.lower()
df_github_names_long

Unnamed: 0,source,github_user_id,uu_user_id
0,github_search\results\ids_search_repos.csv,jonasmoons,
1,github_search\results\ids_search_repos.csv,gglucass,
2,github_search\results\ids_search_repos.csv,marcelrobeer,
3,github_search\results\ids_search_repos.csv,ekatrukha,
4,github_search\results\ids_search_repos.csv,utrechtuniversity,
...,...,...,...
542,university_profile_pages\results\employees_git...,j535d165,JdeBruin1
543,university_profile_pages\results\employees_git...,asreview,JdeBruin1
544,university_profile_pages\results\employees_git...,zievathustra,JASieverink
545,university_profile_pages\results\employees_git...,southparkfan,FTufan


## Create unique table of users and corresponding SOLIS-ID

In [74]:
# One row per GitHub account. Sorting on the SOLIS-ID first places rows without a
# uu_user_id (NaN) at the end, so that drop_duplicates (which keeps the first
# occurrence) prefers a row that carries a SOLIS-ID when one exists.
df_users = (
    df_github_names_long[["github_user_id", "uu_user_id"]]
    .sort_values("uu_user_id")
    .drop_duplicates("github_user_id")
    .reset_index(drop=True)
)
df_users

Unnamed: 0,github_user_id,uu_user_id
0,ajinkyakadu125,AAKadu
1,rel=,AAKadu
2,msdslab,AGJvandeSchoot
3,alexandrosstergiou,AGStergiou
4,annawegmann,AMWegmann
...,...,...
419,keesmulder,
420,rianneschouten,
421,ydluo,
422,marnixnaber,


## Update the pipeline with additional users in later runs

In [5]:
# Default for the very first run: without it the name would be undefined when the
# Excel file does not exist yet and the last line of this cell would raise a NameError.
df_users_annotated = None
try:
    # If this block is successfully executed it is an update of users
    df_users_annotated = pd.read_excel("unique_users_annotated.xlsx", engine='openpyxl')
except FileNotFoundError:
    print("No file with annotated user data yet available.")
else:
    # A user is new when its (lowercased) id does not occur in the annotated file.
    # The annotated ids are lowercased for the comparison because df_users holds lowercased ids.
    df_users["new_user"] = ~df_users["github_user_id"].isin(
        df_users_annotated["github_user_id"].str.lower()
    )

df_users_annotated

Unnamed: 0,github_user_id,uu_user_id,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,...,followers,following,created_at,updated_at,is_student,is_employee,is_currently_employed,is_research_group,final_decision,note
0,ReinierMaas,,9269254.0,MDQ6VXNlcjkyNjkyNTQ=,https://avatars.githubusercontent.com/u/926925...,,https://api.github.com/users/ReinierMaas,https://github.com/ReinierMaas,https://api.github.com/users/ReinierMaas/follo...,https://api.github.com/users/ReinierMaas/follo...,...,15.0,20.0,2014-10-16T14:31:59Z,2021-05-01T22:21:19Z,False,0.0,0.0,0.0,0,
1,rel=,AAKadu,,,,,,,,,...,,,,,False,,,,0,duplicate
2,jhellingman,,8410715.0,MDQ6VXNlcjg0MTA3MTU=,https://avatars.githubusercontent.com/u/841071...,,https://api.github.com/users/jhellingman,https://github.com/jhellingman,https://api.github.com/users/jhellingman/follo...,https://api.github.com/users/jhellingman/follo...,...,14.0,2.0,2014-08-10T20:45:59Z,2021-05-02T17:20:11Z,False,,,,0,
3,LucaScorpion,,5592716.0,MDQ6VXNlcjU1OTI3MTY=,https://avatars.githubusercontent.com/u/559271...,,https://api.github.com/users/LucaScorpion,https://github.com/LucaScorpion,https://api.github.com/users/LucaScorpion/foll...,https://api.github.com/users/LucaScorpion/foll...,...,14.0,21.0,2013-10-02T08:32:38Z,2021-04-21T08:35:39Z,False,,,,0,
4,FloatingPoint,,1568174.0,MDQ6VXNlcjE1NjgxNzQ=,https://avatars.githubusercontent.com/u/156817...,,https://api.github.com/users/FloatingPoint,https://github.com/FloatingPoint,https://api.github.com/users/FloatingPoint/fol...,https://api.github.com/users/FloatingPoint/fol...,...,2.0,0.0,2012-03-23T13:31:36Z,2016-12-22T10:54:32Z,False,,,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,keesmulder,,7806295.0,MDQ6VXNlcjc4MDYyOTU=,https://avatars.githubusercontent.com/u/780629...,,https://api.github.com/users/keesmulder,https://github.com/keesmulder,https://api.github.com/users/keesmulder/followers,https://api.github.com/users/keesmulder/follow...,...,15.0,22.0,2014-06-05T14:26:12Z,2021-03-05T21:53:26Z,False,1.0,0.0,,0,
419,hanstimm,,5745215.0,MDQ6VXNlcjU3NDUyMTU=,https://avatars.githubusercontent.com/u/574521...,,https://api.github.com/users/hanstimm,https://github.com/hanstimm,https://api.github.com/users/hanstimm/followers,https://api.github.com/users/hanstimm/followin...,...,0.0,0.0,2013-10-22T09:02:28Z,2019-11-22T08:26:21Z,False,,,0.0,0,Teacher at HKU
420,Jollyfant,,10128244.0,MDQ6VXNlcjEwMTI4MjQ0,https://avatars.githubusercontent.com/u/101282...,,https://api.github.com/users/Jollyfant,https://github.com/Jollyfant,https://api.github.com/users/Jollyfant/followers,https://api.github.com/users/Jollyfant/followi...,...,11.0,3.0,2014-12-09T10:39:44Z,2021-05-02T15:53:16Z,False,,,,1,Unsure about this one - has a repo with UU tag...
421,growthcharts,,70215749.0,MDEyOk9yZ2FuaXphdGlvbjcwMjE1NzQ5,https://avatars.githubusercontent.com/u/702157...,,https://api.github.com/users/growthcharts,https://github.com/growthcharts,https://api.github.com/users/growthcharts/foll...,https://api.github.com/users/growthcharts/foll...,...,0.0,0.0,2020-08-25T14:52:41Z,2020-10-14T12:35:08Z,False,,,1.0,1,What to do with this one. Researcher works als...


## New users:

In [76]:
# The "new_user" column only exists when an annotated file was found (update run).
if "new_user" in df_users.columns:
    df_new_users = df_users[df_users["new_user"]]
else:
    df_new_users = df_users.iloc[0:0]  # empty frame with the same columns

if df_new_users.empty:
    print("No new users.")

# Last expression of the cell, so the table of new users is actually displayed
# (an expression inside a try block is never rendered by the notebook).
df_new_users

No new users.


## Retrieve data from GitHub API - package GhApi is used for this

In [77]:
import os
from dotenv import load_dotenv

from ghapi.all import GhApi, pages

# Load environment variables via python-dotenv (e.g. GITHUB_TOKEN from a .env file)
load_dotenv()
# Without a token the API is used unauthenticated: the rate limit is lower, which leads
# to a ban unless the waiting time between requests is increased.
token = os.environ.get("GITHUB_TOKEN")
api = GhApi(token=token)

In [106]:
def get_userdata(user_list, api, sleep = 6):
    """Retrieve the GitHub profile of every user in ``user_list``.

    Users for which the request fails are reported on stdout and skipped, so the
    result can contain fewer rows than ``user_list`` has entries.

    Args:
        user_list: sized iterable of GitHub user names.
        api: client object exposing ``users.get_by_username(user_id)``.
        sleep: seconds to pause after every request (rate limiting).

    Returns:
        pandas.DataFrame: one row per successfully retrieved user.
    """
    # Extra fields that are only returned when the authenticated user retrieves
    # their own profile. They are removed unconditionally instead of guessing from
    # the payload size, which silently breaks whenever the API adds or drops a field.
    private_fields = ('private_gists', 'total_private_repos', 'owned_private_repos',
                      'disk_usage', 'collaborators', 'two_factor_authentication', 'plan')
    total = len(user_list)
    results_github_user_api = []
    for index, user_id in enumerate(user_list):
        try:
            user = dict(api.users.get_by_username(user_id))
        except Exception as e:
            # best effort: report the failing user and continue with the next one
            print("User %s encountered an error." % user_id)
            print(e)
        else:
            for field in private_fields:
                user.pop(field, None)
            results_github_user_api.append(user)
        if index % 10 == 0:
            print("Processed %d out of %d users." % (index, total))
        time.sleep(sleep)
    return pd.DataFrame(results_github_user_api)

# get_userdata(df_users[df_users["new_user"] == True]["github_user_id"], api, sleep)
# Quick check of get_userdata with two user names and a 2 second pause per request.
# Note that this call queries the API every time the cell is executed.
results_github_user_api_test = get_userdata(["beld78", "clariah"], api, 2)

def update_users(df_users_annotated, df_new_users):
    """Merge freshly retrieved GitHub API data into the annotated user table.

    Every row of ``df_new_users`` is matched on its ``login`` against the
    ``github_user_id`` column of the annotated table. The match ignores case,
    because other steps of this pipeline lowercase the ids. When the user exists,
    the API fields overwrite the stored values (the stored ``github_user_id`` and
    all annotation columns are left untouched). Otherwise a new row is appended.

    Args:
        df_users_annotated (pandas.DataFrame): annotated users; needs a
            ``github_user_id`` column. Assumed to have an integer index (as
            produced by ``pd.read_excel``) so new rows can be numbered.
        df_new_users (pandas.DataFrame): API results; needs a ``login`` column.
            Rows without a string ``login`` are skipped.

    Returns:
        pandas.DataFrame: updated copy; ``df_users_annotated`` is not modified.
    """
    df_users = df_users_annotated.copy() # don't modify initial df

    # lowercased id -> index labels of the rows holding that user
    rows_by_id = {}
    for label, known_id in df_users["github_user_id"].items():
        if isinstance(known_id, str):
            rows_by_id.setdefault(known_id.lower(), []).append(label)

    for _, row_new in df_new_users.iterrows():
        github_user_id = row_new["login"]
        if not isinstance(github_user_id, str):
            continue  # nothing to match on
        lookup = github_user_id.lower()

        target_rows = rows_by_id.get(lookup)
        if not target_rows: # user doesn't exist yet - insert
            # Next free label. The index stops being a RangeIndex after the first
            # insert, so derive the label from the current maximum instead of `.stop`.
            new_index = df_users.index.max() + 1 if len(df_users.index) else 0
            df_users.loc[new_index, "github_user_id"] = github_user_id
            target_rows = [new_index]
            rows_by_id[lookup] = target_rows

        # Assign column by column: avoids passing a tuple of labels to .loc and
        # also works when the API row contains nothing but the login.
        for key, value in row_new.items():
            if key != "login":
                df_users.loc[target_rows, key] = value

    return df_users

# update_users(df_users_annotated, results_github_user_api)


# Automatic filtering of students
def is_student(user_bio):
    """Decide from a GitHub bio whether the user should be flagged as a student.

    The bio is inspected case-insensitively. A user is flagged only when the bio
    contains "student" and does not contain "phd", so PhD students are not flagged.
    A missing bio (NaN) never flags the user.

    Args:
        user_bio: bio text of the user; any value is accepted and converted with str().

    Returns:
        bool: True if the user is considered a student, otherwise False.
    """
    bio_text = str(user_bio).lower()
    if bio_text == "nan":
        # no bio available: we can't be sure and therefore keep the user
        return False
    # PhD students should be included, so a "phd" mention overrides "student"
    return "student" in bio_text and "phd" not in bio_text

Processed 0 out of 2 users.


In [107]:
# Retrieve the GitHub data and combine it with the user table.
# - update run (an annotated file was found): fetch the new users, or all users when
#   UPDATE_EVERYTHING is set, and merge the API data into the annotated table
# - first run: fetch all users and create the annotation columns

# Pause between API requests in seconds; wait longer without a token
if token is not None: # authentication
    sleep = 2
else: # no authentication
    sleep = 6

if 'new_user' in df_users.columns: # updating users
    if UPDATE_EVERYTHING:
        # union of the new users and all users that are already annotated
        df_users_all = pd.merge(df_users[df_users["new_user"]].drop(["uu_user_id"], axis = 1),
                                df_users_annotated,
                                on="github_user_id", how="outer")
        results_github_user_api = get_userdata(df_users_all["github_user_id"], api, sleep)

    else: # only add new users
        df_users_update = pd.merge(df_users[df_users["new_user"]], df_users_annotated,
                                   on="github_user_id", how="left")
        results_github_user_api = get_userdata(df_users_update["github_user_id"], api, sleep)

    # NOTE(review): only the API fields are passed on here, so the uu_user_id of newly
    # added users is not carried over from df_users - confirm whether that is intended.
    df_users_enriched = update_users(df_users_annotated, results_github_user_api)
    df_users_enriched["is_student"] = df_users_enriched['bio'].apply(is_student)

else: # first time collecting data
    results_github_user_api = get_userdata(df_users["github_user_id"], api, sleep)
    if results_github_user_api.empty:
        # without any result there is no "login"/"bio" column to work with
        raise RuntimeError("No user data could be retrieved from the GitHub API.")
    # df_users holds lowercased ids, whereas the API "login" can contain capitals
    # (see the ids in the annotated file). Join on the lowercased login, otherwise
    # those users silently end up without any API data.
    df_users_enriched = df_users.merge(
        results_github_user_api
        .assign(github_user_id=lambda api_rows: api_rows["login"].str.lower())
        .drop(columns="login"),
        on="github_user_id", how="left")
    # empty columns for the manual annotation
    df_users_enriched = df_users_enriched.reindex(
        columns = df_users_enriched.columns.tolist() +
                  ["is_student", "is_employee", "is_currently_employed",
                   "is_research_group", "final_decision", "note"])
    df_users_enriched["is_student"] = df_users_enriched['bio'].apply(is_student)
        

User ajinkyakadu125 encountered an error.
HTTP Error 404: Not Found
Processed 0 out of 10 users.
User rel= encountered an error.
HTTP Error 404: Not Found
User msdslab encountered an error.
HTTP Error 404: Not Found


## Number of identified students

In [100]:
# Tally how many users were flagged as student (True) and how many were not (False)
df_users_enriched['is_student'].value_counts()

False    10
Name: is_student, dtype: int64

## Export to Excel file

In [101]:
# Write the enriched table to the file that the update step reads at the start of a
# later run; the DataFrame index is not exported.
df_users_enriched.to_excel("unique_users_annotated.xlsx", index=False) 