Scrap Code to get users.csv and repositories.csv

In [1]:
import requests
import csv

GITHUB_TOKEN = ""
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

def get_users_in_basel():
    users = []
    query = "location:Zurich+followers:>50"
    page = 1
    per_page = 100
    total_users = 0

    while True:
        url = f"https://api.github.com/search/users?q={query}&per_page={per_page}&page={page}"
        response = requests.get(url, headers=HEADERS)
        print(f"Fetching page {page}...")

        if response.status_code != 200:
            print("Error fetching data:", response.json())
            break

        data = response.json()
        users.extend(data['items'])
        total_users += len(data['items'])

        if len(data['items']) < per_page:
            break

        page += 1

    detailed_users = []
    for user in users:
        user_info = get_user_details(user['login'])
        detailed_users.append(user_info)

    return detailed_users

def get_user_details(username):
    user_url = f"https://api.github.com/users/{username}"
    user_data = requests.get(user_url, headers=HEADERS).json()

    return {
        'login': user_data['login'],
        'name': user_data['name'],
        'company': clean_company_name(user_data['company']),
        'location': user_data['location'],
        'email': user_data['email'],
        'hireable': user_data['hireable'],
        'bio': user_data['bio'],
        'public_repos': user_data['public_repos'],
        'followers': user_data['followers'],
        'following': user_data['following'],
        'created_at': user_data['created_at'],
    }

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith('@'):
            company = company[1:]
    return company

def get_user_repos(username):
    repos_url = f"https://api.github.com/users/{username}/repos?per_page=500"
    response = requests.get(repos_url, headers=HEADERS)
    repos_data = response.json()

    repos = []
    for repo in repos_data:
        repos.append({
            'login': username,
            'full_name': repo['full_name'],
            'created_at': repo['created_at'],
            'stargazers_count': repo['stargazers_count'],
            'watchers_count': repo['watchers_count'],
            'language': repo['language'],
            'has_projects': repo['has_projects'],
            'has_wiki': repo['has_wiki'],
            'license_name': repo['license']['key'] if repo['license'] else None,
        })

    return repos

def save_users_to_csv(users):
    with open('users.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'name', 'company', 'location', 'email', 'hireable', 'bio', 'public_repos', 'followers', 'following', 'created_at'])
        writer.writeheader()
        writer.writerows(users)

def save_repos_to_csv(repos):
    with open('repositories.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['login', 'full_name', 'created_at', 'stargazers_count', 'watchers_count', 'language', 'has_projects', 'has_wiki', 'license_name'])
        writer.writeheader()
        writer.writerows(repos)

if __name__ == "__main__":
    users = get_users_in_basel()
    save_users_to_csv(users)

    all_repos = []
    for user in users:
        repos = get_user_repos(user['login'])
        all_repos.extend(repos)

    save_repos_to_csv(all_repos)
    print("Done")

Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Done


Question 1: Who are the top 5 users in Zurich with the highest number of followers? List their login in order, comma-separated.

In [3]:
import pandas as pd

users_df = pd.read_csv('users.csv')

top_users = users_df.sort_values(by='followers', ascending=False).head(5)

top_logins = ', '.join(top_users['login'].tolist())

print("Top 5 Users in Zurich with the highest followers:", top_logins)


Top 5 Users in Zurich with the highest followers: IDouble, TheOfficialFloW, Seldaek, riscv, JonnyBurger


Question 2: Who are the 5 earliest registered GitHub users in Zurich? List their login in ascending order of created_at, comma-separated.

In [5]:
import pandas as pd

users_df = pd.read_csv('users.csv')

users_df['created_at'] = pd.to_datetime(users_df['created_at'])

earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)

earliest_logins = ', '.join(earliest_users['login'].tolist())

print("5 earliest registered GitHub users in Zurich:", earliest_logins)


5 earliest registered GitHub users in Zurich: lejoe, uwolfer, matthiask, oscardelben, panterch


Question 3: What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [6]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')

repos_df = repos_df.dropna(subset=['license_name'])

top_licenses = repos_df['license_name'].value_counts().head(3)

top_license_names = ', '.join(top_licenses.index.tolist())

print("3 most popular licenses among these users:", top_license_names)


3 most popular licenses among these users: mit, other, apache-2.0


Question 4: Which company do the majority of these developers work at?

In [7]:
import pandas as pd

users_df = pd.read_csv('users.csv')

users_df = users_df[users_df['company'].notna() & (users_df['company'] != '')]

top_company = users_df['company'].value_counts().idxmax()

print("The majority of these developers work at:", top_company)


The majority of these developers work at: GOOGLE


Question 5: Which programming language is most popular among these users?

In [8]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')

repos_df = repos_df.dropna(subset=['language'])

top_language = repos_df['language'].value_counts().idxmax()

print("The most popular programming language among these users is:", top_language)


The most popular programming language among these users is: Python


Question 6: Which programming language is the second most popular among users who joined after 2020?

In [9]:
import pandas as pd

users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

users_df['created_at'] = pd.to_datetime(users_df['created_at'])

recent_users = users_df[users_df['created_at'].dt.year > 2020]

recent_usernames = recent_users['login'].tolist()

recent_repos = repos_df[repos_df['login'].isin(recent_usernames)]

recent_repos = recent_repos.dropna(subset=['language'])

language_counts = recent_repos['language'].value_counts()

second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)


The second most popular programming language among users who joined after 2020 is: JavaScript


Question 7: Which language has the highest average number of stars per repository?

In [10]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')

repos_df = repos_df.dropna(subset=['language', 'stargazers_count'])

average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

top_language = average_stars_per_language.idxmax()
highest_average_stars = average_stars_per_language.max()

print(f"The language with the highest average stars per repository is: {top_language} with an average of {highest_average_stars:.2f} stars.")


The language with the highest average stars per repository is: BitBake with an average of 364.00 stars.


Question 8: Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [11]:
import pandas as pd

users_df = pd.read_csv('users.csv')

users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

top_leader_strength_users = users_df.sort_values(by='leader_strength', ascending=False).head(5)

top_leader_logins = ', '.join(top_leader_strength_users['login'].tolist())

print("Top 5 users in terms of leader_strength:", top_leader_logins)


Top 5 users in terms of leader_strength: riscv, bpasero, Seldaek, egamma, ethz-asl


Question 9: What is the correlation between the number of followers and the number of public repositories among users in Zurich?

In [12]:
import pandas as pd

users_df = pd.read_csv('users.csv')

correlation = users_df['followers'].corr(users_df['public_repos'])

print("Correlation between the number of followers and the number of public repositories:", correlation)


Correlation between the number of followers and the number of public repositories: 0.06497852178552568


Question 10: Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [14]:
import pandas as pd
import statsmodels.api as sm

users_df = pd.read_csv('users.csv')

X = users_df['public_repos']
y = users_df['followers']

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
slope = model.params['public_repos']

print(f"Estimated additional followers per additional public repository: {slope:.3f}")


Estimated additional followers per additional public repository: 1.476


Question 11: Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [18]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')

repos_df['has_projects_binary'] = repos_df['has_projects'].astype(int)
repos_df['has_wiki_binary'] = repos_df['has_wiki'].astype(int)

correlation = repos_df['has_projects_binary'].corr(repos_df['has_wiki_binary'])

print(f"Correlation between having projects enabled and having wiki enabled: {correlation:.3f}")


Correlation between having projects enabled and having wiki enabled: 0.333


Question 12: Do hireable users follow more people than those who are not hireable?

In [20]:
import pandas as pd

users_df = pd.read_csv('users.csv')

hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

average_hireable_following = hireable_users['following'].mean() if not hireable_users.empty else 0
average_non_hireable_following = non_hireable_users['following'].mean() if not non_hireable_users.empty else 0

difference = average_hireable_following - average_non_hireable_following

print(f'Difference in average following (hireable - non-hireable): {difference:.3f}')


Difference in average following (hireable - non-hireable): -841.534


Question 13: Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)

In [24]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Filter users with a bio
users_with_bio = users_df[users_df['bio'].notna()]

# Calculate the bio word count using .loc to avoid SettingWithCopyWarning
users_with_bio.loc[:, 'bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))

# Prepare the features and target variable
X = users_with_bio[['bio_word_count']]
y = users_with_bio['followers']

# Fit the linear regression model
regression_model = LinearRegression().fit(X, y)
slope = regression_model.coef_[0]

# Print the slope, which indicates how many additional followers are gained per additional word in the bio
print(slope)


41.17240193733559


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio.loc[:, 'bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


Question 14: Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [23]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')

repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek

weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]

weekend_counts = weekend_repos.groupby('login').size().reset_index(name='repo_count')

top_weekend_users = weekend_counts.sort_values(by='repo_count', ascending=False).head(5)

top_users_logins = top_weekend_users['login'].tolist()

print("Top 5 users who created the most repositories on weekends:", ", ".join(top_users_logins))


Top 5 users who created the most repositories on weekends: kynan, yati-sagade, dw5, nicnocquee, sspreitzer


Question 15: Do people who are hireable share their email addresses more often?

In [26]:
import pandas as pd

users_df = pd.read_csv('users.csv')

hireable_users = users_df[users_df['hireable'] == True]
non_hireable_users = users_df[users_df['hireable'].isna() | (users_df['hireable'] == False)]

hireable_email_count = hireable_users['email'].notna().sum()
non_hireable_email_count = non_hireable_users['email'].notna().sum()

total_hireable_users = len(hireable_users)
total_non_hireable_users = len(non_hireable_users)

hireable_email_proportion = hireable_email_count / total_hireable_users if total_hireable_users > 0 else 0
non_hireable_email_proportion = non_hireable_email_count / total_non_hireable_users if total_non_hireable_users > 0 else 0

email_proportion_difference = hireable_email_proportion - non_hireable_email_proportion

print(f'Difference in email sharing proportion: {email_proportion_difference:.3f}')


Difference in email sharing proportion: 0.074


Question 16: Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [27]:
import pandas as pd
from collections import Counter

users_df = pd.read_csv('users.csv')

valid_users = users_df[users_df['name'].notna()]

valid_users['surname'] = valid_users['name'].apply(lambda x: x.strip().split()[-1])

surname_counts = Counter(valid_users['surname'])

max_count = max(surname_counts.values())

most_common_surnames = sorted([surname for surname, count in surname_counts.items() if count == max_count])

result = ', '.join(most_common_surnames)

print(f'Most common surname(s): {result}')


Most common surname(s): Li, Wang


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_users['surname'] = valid_users['name'].apply(lambda x: x.strip().split()[-1])
