In [1]:
import glob
import time

import pandas as pd

In [2]:
data_files = glob.glob("*/results/*.csv")

In [3]:
df_github_names_long = pd.concat(
    [ pd.read_csv(fp) for fp in data_files], 
    axis=0, 
    keys=data_files, 
    names=["source", "row"]
).reset_index("source").reset_index(drop=True)
df_github_names_long

Unnamed: 0,source,github_user_id,uu_user_id
0,github_search/results/ids_search_repos.csv,jonasmoons,
1,github_search/results/ids_search_repos.csv,gglucass,
2,github_search/results/ids_search_repos.csv,MarcelRobeer,
3,github_search/results/ids_search_repos.csv,ekatrukha,
4,github_search/results/ids_search_repos.csv,UtrechtUniversity,
...,...,...,...
518,pure/results/ids_pure_users.csv,keesmulder,
519,pure/results/ids_pure_users.csv,RianneSchouten,
520,pure/results/ids_pure_users.csv,ydluo,
521,pure/results/ids_pure_users.csv,marnixnaber,


In [4]:
df_github_names_crosstab = pd.crosstab(df_github_names_long["github_user_id"], df_github_names_long["source"])
df_github_names_crosstab

source,github_search/results/ids_search_repos.csv,github_search/results/ids_search_users.csv,github_search/results/ids_topic_repos.csv,pure/results/ids_pure_users.csv,university_profile_pages/results/employees_github_from_cv_urls.csv,university_profile_pages/results/employees_github_from_links_urls.csv,university_profile_pages/results/employees_github_from_profile_urls.csv
github_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
19giorgosts,0,1,0,0,0,0,0
4009509,0,1,0,0,0,0,0
73616e646572,0,0,0,0,0,1,0
AJueling,1,1,0,0,0,0,0
ARYENN,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
xavierpinho23,0,1,0,0,0,0,0
yaikohi,0,1,0,0,0,0,0
ydluo,0,0,0,1,0,0,0
yuntaoj,0,1,0,0,0,0,0


## Basic analytics

In [6]:
df_github_names_long.drop_duplicates(["github_user_id", "source"])["github_user_id"].value_counts()

ekatrukha            3
kevin4998            3
UU-Hydro             3
UtrechtUniversity    3
Casper-Smet          2
                    ..
Garanas              1
richooms             1
drjwbaker            1
berg0138             1
RubenHooijer         1
Name: github_user_id, Length: 418, dtype: int64

## Create unique table of users and corresponding SOLIS-ID

In [10]:
df_users = df_github_names_long[["github_user_id", "uu_user_id"]].sort_values("uu_user_id").drop_duplicates().reset_index(drop=True)
df_users

Unnamed: 0,github_user_id,uu_user_id
0,ajinkyakadu125,AAKadu
1,msdslab,AGJvandeSchoot
2,alexandrosstergiou,AGStergiou
3,annawegmann,AMWegmann
4,amacanovic,AMacanovic
...,...,...
422,gerkovink,
423,RianneSchouten,
424,ydluo,
425,marnixnaber,


In [12]:
from ghapi.all import GhApi, pages

api = GhApi()

In [14]:
results_github_user_api = []

for x in df_users["github_user_id"]:
    
    time.sleep(1)
    results_github_user_api.append(dict(api.users.get_by_username(x)))

In [19]:
df_github_user_api = pd.DataFrame(results_github_user_api)
df_github_user_api

Unnamed: 0,login,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,gists_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,ydluo,13563303,MDQ6VXNlcjEzNTYzMzAz,https://avatars.githubusercontent.com/u/135633...,,https://api.github.com/users/ydluo,https://github.com/ydluo,https://api.github.com/users/ydluo/followers,https://api.github.com/users/ydluo/following{/...,https://api.github.com/users/ydluo/gists{/gist...,...,,,,,1,0,18,0,2015-07-30T00:37:13Z,2021-03-26T14:37:03Z
1,marnixnaber,31031471,MDQ6VXNlcjMxMDMxNDcx,https://avatars.githubusercontent.com/u/310314...,,https://api.github.com/users/marnixnaber,https://github.com/marnixnaber,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/gists...,...,,,,,1,0,2,0,2017-08-15T08:03:55Z,2021-03-19T08:40:20Z
2,gjlbeckers-uu,20772557,MDQ6VXNlcjIwNzcyNTU3,https://avatars.githubusercontent.com/u/207725...,,https://api.github.com/users/gjlbeckers-uu,https://github.com/gjlbeckers-uu,https://api.github.com/users/gjlbeckers-uu/fol...,https://api.github.com/users/gjlbeckers-uu/fol...,https://api.github.com/users/gjlbeckers-uu/gis...,...,,,,,2,0,1,1,2016-08-01T12:29:08Z,2018-12-10T11:20:53Z


In [21]:
df_users_enriched = df_users.merge(df_github_user_api, left_on="github_user_id", right_on="login", how="left")
del df_users_enriched["login"]

df_users_enriched.to_csv("unique_users.csv", index=False)

Unnamed: 0,github_user_id,uu_user_id,id,node_id,avatar_url,gravatar_id,url,html_url,followers_url,following_url,...,email,hireable,bio,twitter_username,public_repos,public_gists,followers,following,created_at,updated_at
0,ajinkyakadu125,AAKadu,,,,,,,,,...,,,,,,,,,,
1,msdslab,AGJvandeSchoot,,,,,,,,,...,,,,,,,,,,
2,alexandrosstergiou,AGStergiou,,,,,,,,,...,,,,,,,,,,
3,annawegmann,AMWegmann,,,,,,,,,...,,,,,,,,,,
4,amacanovic,AMacanovic,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,gerkovink,,,,,,,,,,...,,,,,,,,,,
423,RianneSchouten,,,,,,,,,,...,,,,,,,,,,
424,ydluo,,13563303.0,MDQ6VXNlcjEzNTYzMzAz,https://avatars.githubusercontent.com/u/135633...,,https://api.github.com/users/ydluo,https://github.com/ydluo,https://api.github.com/users/ydluo/followers,https://api.github.com/users/ydluo/following{/...,...,,,,,1.0,0.0,18.0,0.0,2015-07-30T00:37:13Z,2021-03-26T14:37:03Z
425,marnixnaber,,31031471.0,MDQ6VXNlcjMxMDMxNDcx,https://avatars.githubusercontent.com/u/310314...,,https://api.github.com/users/marnixnaber,https://github.com/marnixnaber,https://api.github.com/users/marnixnaber/follo...,https://api.github.com/users/marnixnaber/follo...,...,,,,,1.0,0.0,2.0,0.0,2017-08-15T08:03:55Z,2021-03-19T08:40:20Z
