In [1]:
import numpy as np
import pandas as pd

In [37]:
users = pd.read_csv("users.csv")
repos = pd.read_csv("./repositories.csv")

In [3]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   login         360 non-null    object
 1   name          341 non-null    object
 2   company       233 non-null    object
 3   location      360 non-null    object
 4   email         252 non-null    object
 5   hireable      360 non-null    bool  
 6   bio           272 non-null    object
 7   public_repos  360 non-null    int64 
 8   followers     360 non-null    int64 
 9   following     360 non-null    int64 
 10  created_at    360 non-null    object
dtypes: bool(1), int64(3), object(7)
memory usage: 28.6+ KB


#### Q1: Who are the top 5 users in Beijing with the highest number of followers? List their login in order, comma-separated.

In [4]:
top_5_followers_users = users.iloc[users["followers"].nlargest(
    5).index]["login"]

top_5_followers_users

0        michaelliao
1           daimajia
2            xiaolai
3          draveness
4    hongyangAndroid
Name: login, dtype: object

In [5]:
",".join(top_5_followers_users)

'michaelliao,daimajia,xiaolai,draveness,hongyangAndroid'

#### Q2: Who are the 5 earliest registered GitHub users in Beijing? List their login in ascending order of created_at, comma-separated.

In [6]:
earliest_5_users = users.iloc[pd.to_datetime(
    users["created_at"]).nsmallest(5).index]["login"]
earliest_5_users

295           robin
111           nwind
125           reeze
72            kejun
98     ZhangHanDong
Name: login, dtype: object

In [7]:
",".join(earliest_5_users)

'robin,nwind,reeze,kejun,ZhangHanDong'

#### Q3: What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [8]:
repos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29561 entries, 0 to 29560
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   login             29561 non-null  object
 1   full_name         29561 non-null  object
 2   created_at        29561 non-null  object
 3   stargazers_count  29561 non-null  int64 
 4   watchers_count    29561 non-null  int64 
 5   language          20759 non-null  object
 6   has_projects      29561 non-null  bool  
 7   has_wiki          29561 non-null  bool  
 8   license_name      16070 non-null  object
dtypes: bool(2), int64(2), object(5)
memory usage: 1.6+ MB


In [16]:
top_3_license = repos["license_name"].value_counts(
    dropna=False).nlargest(5).index
top_3_license

Index([nan, 'mit', 'apache-2.0', 'other', 'gpl-3.0'], dtype='object', name='license_name')

In [17]:
",".join([str(license) for license in top_3_license])

'nan,mit,apache-2.0,other,gpl-3.0'

#### Q4: Which company do the majority of these developers work at?

In [18]:
users["company"].value_counts(dropna=False)

company
NaN                    127
BYTEDANCE               12
ALIBABA                  9
PEKING UNIVERSITY        7
TSINGHUA UNIVERSITY      7
                      ... 
FMSOFT                   1
TENCENTARC               1
RESIX-OS                 1
BUPT                     1
MT                       1
Name: count, Length: 180, dtype: int64

#### Q5: Which programming language is most popular among these users?

In [19]:
repos["language"].value_counts(dropna=False)

language
NaN                    8802
JavaScript             4446
Python                 3282
Java                   2197
Go                     1361
                       ... 
M4                        1
NSIS                      1
Pug                       1
Earthly                   1
Game Maker Language       1
Name: count, Length: 144, dtype: int64

#### Q6: Which programming language is the second most popular among users who joined after 2020?

In [20]:
users_after_2020 = users[pd.to_datetime(users["created_at"]).dt.year > 2020]

In [21]:
repos[repos["login"].isin(users_after_2020["login"])
      ]["language"].value_counts(dropna=False)

language
Python              10
NaN                  9
HTML                 6
TypeScript           5
C++                  3
JavaScript           3
Jupyter Notebook     3
Astro                2
Vue                  2
C#                   2
Name: count, dtype: int64

#### Q7: Which language has the highest average number of stars per repository?

In [22]:
repos["stargazers_count"].value_counts()

stargazers_count
0       16458
1        3877
2        1619
3         785
4         584
        ...  
1019        1
4979        1
618         1
979         1
6242        1
Name: count, Length: 1098, dtype: int64

In [23]:
repos.groupby("language")["stargazers_count"].mean().nlargest(10)

language
Jinja         3420.000000
Solidity      1286.666667
VBScript       698.000000
Lex            643.000000
TeX            294.222222
Stylus         232.400000
TypeScript     205.325088
Java           190.801092
MATLAB         189.840000
VBA            182.000000
Name: stargazers_count, dtype: float64

#### Q8: Let's define `leader_strength` as `followers / (1 + following)`. Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [24]:
users["leader_strength"] = users[["followers", "following"]].apply(
    lambda x: x["followers"] / (1 + x["following"]), axis=1)

In [25]:
top_5_leaders = users.iloc[users["leader_strength"].nlargest(5).index]["login"]
top_5_leaders

0     michaelliao
9       ityouknow
14    liuhuanyong
32         thunlp
35        shenghy
Name: login, dtype: object

In [26]:
",".join(top_5_leaders)

'michaelliao,ityouknow,liuhuanyong,thunlp,shenghy'

#### Q9: What is the correlation between the number of followers and the number of public repositories among users in Beijing?

In [27]:
users[["followers", "public_repos"]].corr()

Unnamed: 0,followers,public_repos
followers,1.0,0.032818
public_repos,0.032818,1.0


#### Q10: Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository

In [28]:
slope, intercept = np.polyfit(users["public_repos"], users["followers"], 1)
round(slope, 3)

np.float64(0.655)

#### Q11: Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [29]:
repos[["has_projects", "has_wiki"]].corr()

Unnamed: 0,has_projects,has_wiki
has_projects,1.0,0.280184
has_wiki,0.280184,1.0


#### Q12: Do hireable users follow more people than those who are not hireable?

In [38]:
round(users.groupby("hireable")["followers"].mean().diff().iloc[-1], 3)

np.float64(114.894)

In [39]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable']
                                   == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
round(difference, 3)

np.float64(149.014)

#### Q13: Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [166]:
users["bio_length"] = users["bio"].apply(
    lambda x: len(str(x).split()) if str(x).strip() != "" else 0)

In [167]:
users[users["bio_length"] > 0][["bio_length", "followers"]].corr()

Unnamed: 0,bio_length,followers
bio_length,1.0,0.019545
followers,0.019545,1.0


In [168]:
bio_length, followers = users[users["bio_length"]
                              > 0]["bio_length"], users[users["bio_length"] > 0]["followers"]

In [169]:
slope, intercept = np.polyfit(bio_length, followers, 1)
round(slope, 3)

np.float64(10.731)

In [46]:
users_with_bio = users[(users['bio'].notna()) & (users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].apply(
    lambda x: len(x.split()))

np.polyfit(users_with_bio['bio_len'], users_with_bio['followers'], 1)

array([ -11.02357918, 1860.54361296])

#### Q14: Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [179]:
top_5_users_of_weekend_repos = repos[pd.to_datetime(repos["created_at"]).dt.dayofweek.isin(
    [5, 6])]["login"].value_counts().nlargest(5).index
top_5_users_of_weekend_repos

Index(['LinuxSuRen', 'zhufengnodejs', 'i5ting', 'mozillazg', 'hailiang-wang'], dtype='object', name='login')

In [180]:
",".join(top_5_users_of_weekend_repos)

'LinuxSuRen,zhufengnodejs,i5ting,mozillazg,hailiang-wang'

#### Q15: Do people who are hireable share their email addresses more often?

In [188]:
users.groupby("hireable")["email"].count(), users.groupby("hireable")["email"].count()

hireable
False    176
True      76
Name: email, dtype: int64

In [192]:
email_by_hireable = users.groupby('hireable').apply(
    lambda group: (group['email'].notna().mean()))
email_by_hireable.diff()

  email_by_hireable = users.groupby('hireable').apply(


hireable
False         NaN
True     0.072939
dtype: float64

In [47]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable']
                              == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
round(diff, 3)

np.float64(0.073)

#### Q16: Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [49]:
users["surname"] = users["name"].apply(
    lambda x: x.strip().split()[-1] if pd.notna(x) else x)
max_count = users["surname"].value_counts().max()
users["surname"].value_counts()[users["surname"].value_counts() == max_count]

surname
Zhang    11
Name: count, dtype: int64