In [1]:
import requests
import csv
import pandas as pd

In [21]:
# Load [login, public_repos] pairs straight from the raw CSV.
with open("users.csv", mode="r", newline="", encoding="utf-8") as file:
    users = [[row["login"], row["public_repos"]] for row in csv.DictReader(file)]

# The csv module yields strings, so convert the counts before ranking them.
a = sorted((int(repo_count) for _, repo_count in users), reverse=True)
a[:5]

[593, 448, 416, 391, 334]

In [22]:
# Load the users and repositories tables used by the question cells.
df, repos_df = (pd.read_csv(path) for path in ('users.csv', 'repositories.csv'))

1. Who are the top 5 users in Paris with the highest number of followers? List their login in order, comma-separated.


In [23]:
# Rank users by follower count (largest first) and keep the five leading logins.
by_followers = df.sort_values(by='followers', ascending=False)
top_5_logins = by_followers['login'].head(5).tolist()
print(','.join(top_5_logins))

IDouble,TheOfficialFloW,Seldaek,riscv,JonnyBurger


2. Who are the 5 earliest registered GitHub users in Paris? List their login in ascending order of created_at, comma-separated.


In [24]:
# Order users by created_at (ascending) and keep the first five logins.
by_signup = df.sort_values(by='created_at')
earliest_users = by_signup['login'].iloc[:5].tolist()
print(','.join(earliest_users))

lejoe,uwolfer,matthiask,oscardelben,panterch


3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order.

In [25]:
# Count repositories per license, skipping rows with no license recorded,
# and keep the three most frequent license names.
license_names = repos_df['license_name'].dropna()
popular_licenses = license_names.value_counts().head(3)
top_3_licenses = list(popular_licenses.index)
print(','.join(top_3_licenses))

mit,other,apache-2.0


4. Which company do the majority of these developers work at?

In [26]:
# Tally users per company value and report the label with the highest count.
company_counts = df['company'].value_counts()
most_common_company = company_counts.idxmax()
print(most_common_company)

GOOGLE


5. Which programming language is most popular among these users?


In [27]:
# Tally repositories per language and report the label with the highest count.
language_totals = repos_df['language'].value_counts()
most_common_language = language_totals.idxmax()
print(most_common_language)

Python


6. Which programming language is the second most popular among users who joined after 2020?

In [28]:
# Parse signup timestamps so they can be compared against the cutoff date.
df['created_at'] = pd.to_datetime(df['created_at'])
# NOTE(review): this cutoff also keeps users who joined during 2020 —
# confirm that matches the intended meaning of "after 2020".
joined_after_cutoff = df['created_at'] > '2020-01-01'
recent_users = df[joined_after_cutoff]

# Restrict repositories to those whose login belongs to a recently joined user.
recent_user_logins = recent_users['login'].tolist()
owned_by_recent = repos_df['login'].isin(recent_user_logins)
recent_repos = repos_df[owned_by_recent]

# value_counts() orders languages from most to least frequent,
# so position 1 holds the runner-up.
language_counts = recent_repos['language'].value_counts()
second_most_common_language = language_counts.index[1]
print(second_most_common_language)


JavaScript


7. Which language has the highest average number of stars per repository?


In [29]:
# Mean stargazers_count per language, then the language whose mean is largest.
stars_by_language = repos_df.groupby('language')['stargazers_count']
average_stars_per_language = stars_by_language.mean()
highest_avg_stars_language = average_stars_per_language.idxmax()
print(highest_avg_stars_language)


BitBake


8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.


In [30]:
# leader_strength = followers / (1 + following); the added 1 keeps the
# denominator non-zero for users whose following count is 0.
df['leader_strength'] = df['followers'] / (df['following'] + 1)

# Five largest leader_strength values, reported as a comma-separated login list.
top_leaders = df.nlargest(5, 'leader_strength')['login']
top_leaders_list = ','.join(top_leaders)
print(top_leaders_list)

riscv,bpasero,Seldaek,egamma,ethz-asl


9. What is the correlation between the number of followers and the number of public repositories among users in Paris?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [31]:
# Series.corr defaults to the Pearson coefficient.
follower_counts = df['followers']
repo_counts = df['public_repos']
correlation = follower_counts.corr(repo_counts)
correlation_formatted = round(correlation, 3)
print(correlation_formatted)

0.066


10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [32]:
from scipy.stats import linregress

# Least-squares fit with public_repos as x and followers as y; the slope is the
# estimated change in followers per additional public repository.
regression = linregress(df['public_repos'], df['followers'])
slope, intercept = regression.slope, regression.intercept
r_value, p_value, std_err = regression.rvalue, regression.pvalue, regression.stderr
slope_formatted = round(slope, 3)
print(slope_formatted)

1.468


11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [33]:
# Reload the repositories table so this cell starts from the raw flag columns.
repos_df = pd.read_csv('repositories.csv')

print("Unique values in has_projects:", repos_df['has_projects'].unique())
print("Unique values in has_wiki:", repos_df['has_wiki'].unique())

# Encode both flags as 0/1 so a numeric correlation can be computed.
# Any value that is not exactly True/False becomes NaN in map() and is then
# counted as 0 (disabled) by fillna(0). No NaN can remain after that, so a
# separate dropna() step would never remove anything and is omitted.
flag_encoding = {True: 1, False: 0}
for flag_column in ('has_projects', 'has_wiki'):
    repos_df[flag_column] = repos_df[flag_column].map(flag_encoding).fillna(0).astype(int)

correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])
correlation_formatted = round(correlation, 3)

print("Correlation between projects and wiki enabled:", correlation_formatted)

Unique values in has_projects: [ True False]
Unique values in has_wiki: [ True False]
Correlation between projects and wiki enabled: 0.31


12. Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [34]:
import pandas as pd
import numpy as np

def analyze_following_by_hireable(df):
    """
    Compare the average following count of hireable and non-hireable users.

    Missing hireable values are treated as not hireable. The input frame is
    not modified.

    Parameters:
    df: pandas DataFrame with hireable and following columns

    Returns:
    dict: summary statistics with the keys
        'difference'         - hireable_avg minus non_hireable_avg (rounded to 3 places)
        'hireable_avg'       - mean following of hireable users (rounded to 3 places)
        'non_hireable_avg'   - mean following of the remaining users (rounded to 3 places)
        'hireable_count'     - number of hireable users
        'non_hireable_count' - number of remaining users
        'hireable_std'       - standard deviation of following for hireable users
        'non_hireable_std'   - standard deviation of following for the remaining users
    """
    # Build an explicit bool mask instead of overwriting df['hireable'].
    # astype(bool) turns NaN into True, so it is combined with notna() to make
    # missing values count as not hireable. A real bool dtype is required for
    # `~` to act as logical negation; on an object column it would be applied
    # to each Python bool and yield integers.
    hireable = df['hireable']
    is_hireable = hireable.notna() & hireable.astype(bool)

    hireable_following = df.loc[is_hireable, 'following']
    non_hireable_following = df.loc[~is_hireable, 'following']

    hireable_avg = hireable_following.mean()
    non_hireable_avg = non_hireable_following.mean()
    difference = hireable_avg - non_hireable_avg

    stats = {
        'difference': round(difference, 3),
        'hireable_avg': round(hireable_avg, 3),
        'non_hireable_avg': round(non_hireable_avg, 3),
        'hireable_count': len(hireable_following),
        'non_hireable_count': len(non_hireable_following),
        'hireable_std': hireable_following.std(),
        'non_hireable_std': non_hireable_following.std()
    }

    return stats

# Load a fresh copy of the users table and summarise following counts
# by hireable status.
df = pd.read_csv('users.csv')
results = analyze_following_by_hireable(df)

print(f"Difference in average following (hireable - non-hireable): {results['difference']}")

Difference in average following (hireable - non-hireable): -831.825


13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)
Regression slope of followers on bio length in characters (to 3 decimal places, e.g. 12.345 or -12.345)

In [35]:
import pandas as pd

from sklearn.linear_model import LinearRegression

# Start from a fresh copy of the users table
df = pd.read_csv('users.csv')

# Keep only users whose bio is present and non-empty
has_bio = df['bio'].notna() & (df['bio'] != '')
users_with_bio = df[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# scikit-learn expects a 2-D feature matrix, hence the single-column frame
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

# Slope of followers on bio length (the only fitted coefficient)
lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]

5.422877174220235

14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated


In [36]:
import pandas as pd

# Reload repositories and parse their creation timestamps.
repos_df = pd.read_csv('repositories.csv')
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# dayofweek numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
created_on_weekend = repos_df['created_at'].dt.dayofweek.isin([5, 6])
weekend_repos = repos_df[created_on_weekend]

# Count weekend repositories per login and keep the five most frequent logins.
user_repo_counts = weekend_repos['login'].value_counts()
top_5_users = list(user_repo_counts.index[:5])
top_5_users_string = ','.join(top_5_users)

print(top_5_users_string)

JonnyBurger,syzer,kynan,nicnocquee,shuhei


15. Do people who are hireable share their email addresses more often?
[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [37]:
import pandas as pd
import numpy as np

def analyze_email_sharing(df):
    """
    Compare how often hireable and non-hireable users have an email value.

    Missing hireable values are treated as not hireable. A user counts as
    having an email when the email column is not null. The input frame is not
    modified.

    Parameters:
    df: pandas DataFrame with hireable and email columns

    Returns:
    float: fraction of hireable users with an email minus the fraction of the
        remaining users with an email, rounded to 3 decimal places
    """
    # Build an explicit bool mask instead of overwriting df['hireable'].
    # astype(bool) turns NaN into True, so it is combined with notna() to make
    # missing values count as not hireable. A real bool dtype is required for
    # `~` to act as logical negation; on an object column it would be applied
    # to each Python bool and yield integers.
    hireable = df['hireable']
    is_hireable = hireable.notna() & hireable.astype(bool)

    has_email = df['email'].notna()

    # The mean of a boolean Series is the fraction of True values.
    hireable_with_email = has_email[is_hireable].mean()
    non_hireable_with_email = has_email[~is_hireable].mean()

    difference = hireable_with_email - non_hireable_with_email

    return round(difference, 3)

# Load a fresh copy of the users table, then compare email-sharing rates
# between hireable users and everyone else.
df = pd.read_csv('users.csv')
diff = analyze_email_sharing(df)

print(f"Difference in email sharing rates (hireable - non-hireable): {diff}")

Difference in email sharing rates (hireable - non-hireable): 0.069


16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Number of users with the most common surname

In [38]:
import pandas as pd
from collections import Counter

# Surname = last whitespace-separated token of the trimmed name
# (NaN when the name is missing or has no tokens).
name_tokens = df['name'].str.strip().str.split()
df['surname'] = name_tokens.str[-1]
surname_counts = Counter(df['surname'].dropna())

# most_common() sorts by descending count, so the first entry holds the top frequency.
most_common_surname_count = surname_counts.most_common()
most_common_count = most_common_surname_count[0][1]

# Keep every surname tied for the top frequency, in alphabetical order.
most_common_surnames = sorted(
    surname
    for surname, count in most_common_surname_count
    if count == most_common_count
)
most_common_surnames_output = ', '.join(most_common_surnames)

print("Most common surnames:")
print(most_common_surnames_output)

Most common surnames:
Li, Wang
