In [34]:
import glob
import time

import pandas as pd

In [35]:
data_files = glob.glob("*/results/*.csv")

In [36]:
df_github_names_long = pd.concat(
    [ pd.read_csv(fp) for fp in data_files], 
    axis=0, 
    keys=data_files, 
    names=["source", "row"]
).reset_index("source").reset_index(drop=True)
df_github_names_long

Unnamed: 0,source,github_user_id,uu_user_id
0,github_search\results\ids_search_repos.csv,jonasmoons,
1,github_search\results\ids_search_repos.csv,gglucass,
2,github_search\results\ids_search_repos.csv,MarcelRobeer,
3,github_search\results\ids_search_repos.csv,ekatrukha,
4,github_search\results\ids_search_repos.csv,UtrechtUniversity,
...,...,...,...
541,university_profile_pages\results\employees_git...,J535D165,JdeBruin1
542,university_profile_pages\results\employees_git...,asreview,JdeBruin1
543,university_profile_pages\results\employees_git...,zievathustra,JASieverink
544,university_profile_pages\results\employees_git...,Southparkfan,FTufan


In [37]:
df_github_names_crosstab = pd.crosstab(df_github_names_long["github_user_id"], df_github_names_long["source"])
df_github_names_crosstab

source,github_search\results\ids_search_repos.csv,github_search\results\ids_search_users.csv,github_search\results\ids_topic_repos.csv,pure\results\ids_pure_users.csv,university_profile_pages\results\employees_github_usernames.csv
github_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19giorgosts,0,1,0,0,0
4009509,0,1,0,0,0
73616e646572,0,0,0,0,1
AJueling,1,1,0,0,0
ARYENN,0,1,0,0,0
...,...,...,...,...,...
yaikohi,0,1,0,0,0
ydluo,0,0,0,1,0
ystouthart,1,0,1,0,0
yuntaoj,0,1,0,0,0


In [38]:
df_github_names_long['github_user_id'] = df_github_names_long['github_user_id'].str.lower()
df_github_names_long

Unnamed: 0,source,github_user_id,uu_user_id
0,github_search\results\ids_search_repos.csv,jonasmoons,
1,github_search\results\ids_search_repos.csv,gglucass,
2,github_search\results\ids_search_repos.csv,marcelrobeer,
3,github_search\results\ids_search_repos.csv,ekatrukha,
4,github_search\results\ids_search_repos.csv,utrechtuniversity,
...,...,...,...
541,university_profile_pages\results\employees_git...,j535d165,JdeBruin1
542,university_profile_pages\results\employees_git...,asreview,JdeBruin1
543,university_profile_pages\results\employees_git...,zievathustra,JASieverink
544,university_profile_pages\results\employees_git...,southparkfan,FTufan


## Basic analytics

In [39]:
df_github_names_long.drop_duplicates(["github_user_id", "source"])["github_user_id"].value_counts()

kevin4998            3
utrechtuniversity    3
j535d165             3
uu-hydro             3
ekatrukha            3
                    ..
ijanszen             1
jordyalkema          1
rvschouwenburg       1
garanas              1
lexverbrugh          1
Name: github_user_id, Length: 422, dtype: int64

## Create unique table of users and corresponding SOLIS-ID

In [40]:
df_users = df_github_names_long[["github_user_id", "uu_user_id"]].sort_values("uu_user_id").drop_duplicates("github_user_id").reset_index(drop=True)
df_users

Unnamed: 0,github_user_id,uu_user_id
0,ajinkyakadu125,AAKadu
1,rel=,AAKadu
2,msdslab,AGJvandeSchoot
3,alexandrosstergiou,AGStergiou
4,annawegmann,AMWegmann
...,...,...
418,keesmulder,
419,rianneschouten,
420,ydluo,
421,marnixnaber,


In [7]:
from ghapi.all import GhApi, pages
import os
from dotenv import load_dotenv

load_dotenv()
# if unauthorized API is used, rate limit is lower leading to a ban and waiting time needs to be increased
token = os.getenv('GITHUB_TOKEN') 
api = GhApi(token = token)
if(token is None):
    sleep = 6
else:
    sleep = 2

In [15]:
results_github_user_api = []

for x in df_users["github_user_id"]:
    try:
        user = dict(api.users.get_by_username(x))
        results_github_user_api.append(user)
    except Exception as e: 
        print("User %s encountered an error." % x)
        print(e)
    if(len(results_github_user_api) % 10 == 0):
        print("Processed %d users." % len(results_github_user_api))
    time.sleep(sleep)

User ajinkyakadu125 encountered an error.
HTTP Error 404: Not Found
Processed 0 users.
User rel= encountered an error.
HTTP Error 404: Not Found
Processed 0 users.
User msdslab encountered an error.
HTTP Error 404: Not Found
Processed 0 users.
Processed 10 users.
Processed 20 users.
Processed 30 users.
Processed 40 users.
Processed 50 users.
Processed 60 users.
Processed 70 users.
Processed 80 users.
Processed 90 users.
Processed 100 users.
Processed 110 users.
Processed 120 users.
Processed 130 users.
Processed 140 users.
Processed 150 users.
Processed 160 users.
Processed 170 users.
Processed 180 users.
Processed 190 users.
Processed 200 users.
Processed 210 users.
Processed 220 users.
Processed 230 users.
Processed 240 users.
Processed 250 users.
Processed 260 users.
Processed 270 users.
Processed 280 users.
Processed 290 users.
Processed 300 users.
Processed 310 users.
Processed 320 users.
Processed 330 users.
Processed 340 users.
Processed 350 users.
Processed 360 users.
Processed

In [41]:
df_github_user_api = pd.DataFrame(results_github_user_api)
df_github_user_api

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,alexandrosstergiou,25504317,MDQ6VXNlcjI1NTA0MzE3,https://avatars.githubusercontent.com/u/255043...,,https://api.github.com/users/alexandrosstergiou,https://github.com/alexandrosstergiou,https://api.github.com/users/alexandrosstergio...,https://api.github.com/users/alexandrosstergio...,https://api.github.com/users/alexandrosstergio...,...,alexstergiou5@gmail.com,True,Computer Vision and Machine Learning Researcher,,19,0,35,6,2017-02-02T11:23:32Z,2021-05-13T08:07:25Z
1,AnnaWegmann,22360217,MDQ6VXNlcjIyMzYwMjE3,https://avatars.githubusercontent.com/u/223602...,,https://api.github.com/users/AnnaWegmann,https://github.com/AnnaWegmann,https://api.github.com/users/AnnaWegmann/follo...,https://api.github.com/users/AnnaWegmann/follo...,https://api.github.com/users/AnnaWegmann/gists...,...,,,,,2,0,2,2,2016-09-22T02:20:41Z,2021-04-30T18:07:54Z
2,amacanovic,57662333,MDQ6VXNlcjU3NjYyMzMz,https://avatars.githubusercontent.com/u/576623...,,https://api.github.com/users/amacanovic,https://github.com/amacanovic,https://api.github.com/users/amacanovic/followers,https://api.github.com/users/amacanovic/follow...,https://api.github.com/users/amacanovic/gists{...,...,,,,,3,0,2,1,2019-11-12T09:29:13Z,2021-03-17T10:28:21Z
3,,74832,MDQ6VXNlcjc0ODMy,https://avatars.githubusercontent.com/u/74832?v=4,,https://api.github.com/users/nan,https://github.com/nan,https://api.github.com/users/nan/followers,https://api.github.com/users/nan/following{/ot...,https://api.github.com/users/nan/gists{/gist_id},...,kannayoshihiro@gmail.com,,,,0,0,4,0,2009-04-17T12:43:58Z,2021-01-29T03:56:10Z
4,billjee,3830672,MDQ6VXNlcjM4MzA2NzI=,https://avatars.githubusercontent.com/u/383067...,,https://api.github.com/users/billjee,https://github.com/billjee,https://api.github.com/users/billjee/followers,https://api.github.com/users/billjee/following...,https://api.github.com/users/billjee/gists{/gi...,...,,,,,5,1,22,5,2013-03-11T10:22:49Z,2021-05-19T18:43:44Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,keesmulder,7806295,MDQ6VXNlcjc4MDYyOTU=,https://avatars.githubusercontent.com/u/780629...,,https://api.github.com/users/keesmulder,https://github.com/keesmulder,https://api.github.com/users/keesmulder/followers,https://api.github.com/users/keesmulder/follow...,https://api.github.com/users/keesmulder/gists{...,...,keestimmulder@gmail.com,,Data Scientist at EY VODW,,19,0,15,22,2014-06-05T14:26:12Z,2021-03-05T21:53:26Z
419,RianneSchouten,22293115,MDQ6VXNlcjIyMjkzMTE1,https://avatars.githubusercontent.com/u/222931...,,https://api.github.com/users/RianneSchouten,https://github.com/RianneSchouten,https://api.github.com/users/RianneSchouten/fo...,https://api.github.com/users/RianneSchouten/fo...,https://api.github.com/users/RianneSchouten/gi...,...,,,Missing Data Specialist / Developer Data & Ana...,,10,0,17,0,2016-09-19T10:28:49Z,2021-05-13T10:32:03Z
420,ydluo,13563303,MDQ6VXNlcjEzNTYzMzAz,https://avatars.githubusercontent.com/u/135633...,,https://api.github.com/users/ydluo,https://github.com/ydluo,https://api.github.com/users/ydluo/followers,https://api.github.com/users/ydluo/following{/...,https://api.github.com/users/ydluo/gists{/gist...,...,,,,,1,0,18,0,2015-07-30T00:37:13Z,2021-04-26T14:35:58Z
421,marnixnaber,31031471,MDQ6VXNlcjMxMDMxNDcx,https://avatars.githubusercontent.com/u/310314...,,https://api.github.com/users/marnixnaber,https://github.com/marnixnaber,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/gists...,...,,,,,1,0,2,0,2017-08-15T08:03:55Z,2021-03-19T08:40:20Z


In [42]:
def is_student(user_bio):
    """Checks whether a GitHub user is a student. The bio of a user is parsed. 
    If it contains phd the user will not be marked as a student. 
    If the bio contains only the word student the user will be marked as a student. If

    Args:
        user_id (string): user id which is named as "login" from the GitHub Api 

    Returns:
        Boolean: Whether the user is a student or not
    """
    user_bio = str(user_bio).lower()
    if (user_bio != "nan"):
        # PhD students should be included
        mention_phd = "phd" in user_bio
        mention_student = "student" in user_bio
        return (not mention_phd and mention_student)
    else:
        # we can't be sure and therefore keep the user
        return False

In [43]:
df_users_enriched = df_users.merge(df_github_user_api, left_on="github_user_id", right_on="login", how="left")
del df_users_enriched["login"]
df_users_enriched

Unnamed: 0,github_user_id,uu_user_id,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,ajinkyakadu125,AAKadu,,,,,,,,,...,,,,,,,,,,
1,rel=,AAKadu,,,,,,,,,...,,,,,,,,,,
2,msdslab,AGJvandeSchoot,,,,,,,,,...,,,,,,,,,,
3,alexandrosstergiou,AGStergiou,25504317.0,MDQ6VXNlcjI1NTA0MzE3,https://avatars.githubusercontent.com/u/255043...,,https://api.github.com/users/alexandrosstergiou,https://github.com/alexandrosstergiou,https://api.github.com/users/alexandrosstergio...,https://api.github.com/users/alexandrosstergio...,...,alexstergiou5@gmail.com,True,Computer Vision and Machine Learning Researcher,,19.0,0.0,35.0,6.0,2017-02-02T11:23:32Z,2021-05-13T08:07:25Z
4,annawegmann,AMWegmann,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,keesmulder,,7806295.0,MDQ6VXNlcjc4MDYyOTU=,https://avatars.githubusercontent.com/u/780629...,,https://api.github.com/users/keesmulder,https://github.com/keesmulder,https://api.github.com/users/keesmulder/followers,https://api.github.com/users/keesmulder/follow...,...,keestimmulder@gmail.com,,Data Scientist at EY VODW,,19.0,0.0,15.0,22.0,2014-06-05T14:26:12Z,2021-03-05T21:53:26Z
419,rianneschouten,,,,,,,,,,...,,,,,,,,,,
420,ydluo,,13563303.0,MDQ6VXNlcjEzNTYzMzAz,https://avatars.githubusercontent.com/u/135633...,,https://api.github.com/users/ydluo,https://github.com/ydluo,https://api.github.com/users/ydluo/followers,https://api.github.com/users/ydluo/following{/...,...,,,,,1.0,0.0,18.0,0.0,2015-07-30T00:37:13Z,2021-04-26T14:35:58Z
421,marnixnaber,,31031471.0,MDQ6VXNlcjMxMDMxNDcx,https://avatars.githubusercontent.com/u/310314...,,https://api.github.com/users/marnixnaber,https://github.com/marnixnaber,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/follo...,...,,,,,1.0,0.0,2.0,0.0,2017-08-15T08:03:55Z,2021-03-19T08:40:20Z


In [44]:
df_users_enriched.drop_duplicates(subset='github_user_id', inplace=True)
df_users_enriched.reset_index(drop=True, inplace=True)
df_users_enriched

Unnamed: 0,github_user_id,uu_user_id,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,ajinkyakadu125,AAKadu,,,,,,,,,...,,,,,,,,,,
1,rel=,AAKadu,,,,,,,,,...,,,,,,,,,,
2,msdslab,AGJvandeSchoot,,,,,,,,,...,,,,,,,,,,
3,alexandrosstergiou,AGStergiou,25504317.0,MDQ6VXNlcjI1NTA0MzE3,https://avatars.githubusercontent.com/u/255043...,,https://api.github.com/users/alexandrosstergiou,https://github.com/alexandrosstergiou,https://api.github.com/users/alexandrosstergio...,https://api.github.com/users/alexandrosstergio...,...,alexstergiou5@gmail.com,True,Computer Vision and Machine Learning Researcher,,19.0,0.0,35.0,6.0,2017-02-02T11:23:32Z,2021-05-13T08:07:25Z
4,annawegmann,AMWegmann,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418,keesmulder,,7806295.0,MDQ6VXNlcjc4MDYyOTU=,https://avatars.githubusercontent.com/u/780629...,,https://api.github.com/users/keesmulder,https://github.com/keesmulder,https://api.github.com/users/keesmulder/followers,https://api.github.com/users/keesmulder/follow...,...,keestimmulder@gmail.com,,Data Scientist at EY VODW,,19.0,0.0,15.0,22.0,2014-06-05T14:26:12Z,2021-03-05T21:53:26Z
419,rianneschouten,,,,,,,,,,...,,,,,,,,,,
420,ydluo,,13563303.0,MDQ6VXNlcjEzNTYzMzAz,https://avatars.githubusercontent.com/u/135633...,,https://api.github.com/users/ydluo,https://github.com/ydluo,https://api.github.com/users/ydluo/followers,https://api.github.com/users/ydluo/following{/...,...,,,,,1.0,0.0,18.0,0.0,2015-07-30T00:37:13Z,2021-04-26T14:35:58Z
421,marnixnaber,,31031471.0,MDQ6VXNlcjMxMDMxNDcx,https://avatars.githubusercontent.com/u/310314...,,https://api.github.com/users/marnixnaber,https://github.com/marnixnaber,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/follo...,...,,,,,1.0,0.0,2.0,0.0,2017-08-15T08:03:55Z,2021-03-19T08:40:20Z


In [20]:
df_users_enriched['is_student'] = df_users_enriched['bio'].apply(is_student)
df_users_enriched['is_student'].value_counts()

False    318
True     108
Name: is_student, dtype: int64

In [22]:
df_users_enriched.to_csv("unique_users.csv", index=False)