# Upload users.csv and repositories.csv to the working directory before running these cells.


###Question 1

In [None]:
def get_top_5_users_with_highest_followers(filename="users.csv"):
    """Return the logins of the five most-followed users.

    Args:
        filename: Path to a CSV file with ``login`` and ``followers`` columns.

    Returns:
        Up to five logins joined by ", ", ordered by follower count from
        highest to lowest. Users with equal counts keep their file order.

    Raises:
        ValueError: If a ``followers`` value cannot be parsed as an integer.
    """
    # Imported locally because this function is defined before the first
    # file-level ``import csv``; without it a top-to-bottom run fails with
    # NameError.
    import csv

    with open(filename, mode="r", encoding="utf-8") as file:
        users = [
            (row["login"], int(row["followers"]))
            for row in csv.DictReader(file)
        ]

    # sorted() is stable even with reverse=True, so ties keep file order.
    top_5_users = sorted(users, key=lambda user: user[1], reverse=True)[:5]
    return ", ".join(login for login, _ in top_5_users)


# Question 1 driver: run the analysis on the default users.csv and print it.
top_5_logins = get_top_5_users_with_highest_followers()
print("Top 5 users in Shanghai with the highest number of followers:", top_5_logins)

Top 5 users in Shanghai with the highest number of followers: peng-zhihui, ruanyf, phodal, liyupi, stormzhang


###Question 2

In [None]:
from datetime import datetime
import csv

def get_earliest_5_users(filename="users.csv"):
    """Return the logins of the five users with the oldest ``created_at``.

    Args:
        filename: Path to a CSV file with ``login`` and ``created_at``
            columns; timestamps must look like ``2008-01-01T12:00:00Z``.

    Returns:
        Up to five logins joined by ", ", earliest registration first.

    Raises:
        ValueError: If a ``created_at`` value does not match the format.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    registrations = []
    with open(filename, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            joined = datetime.strptime(record["created_at"], timestamp_format)
            registrations.append((joined, record["login"]))

    # Sort on the timestamp only so identical timestamps keep file order.
    registrations.sort(key=lambda pair: pair[0])
    return ", ".join(login for _, login in registrations[:5])

# Question 2 driver: run the analysis on the default users.csv and print it.
earliest_5_logins = get_earliest_5_users()
print("The 5 earliest registered GitHub users in Shanghai:", earliest_5_logins)

The 5 earliest registered GitHub users in Shanghai: osteele, mrluanma, ShiningRay, rainux, why404


###Question 3

In [None]:
from collections import Counter

def get_top_3_licenses(filename="repositories.csv"):
    """Return the three most frequent ``license_name`` values.

    Rows whose ``license_name`` is empty are ignored.

    Args:
        filename: Path to a CSV file with a ``license_name`` column.

    Returns:
        Up to three license names joined by ", ", most frequent first.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        tally = Counter(
            record["license_name"]
            for record in csv.DictReader(handle)
            if record["license_name"]
        )

    return ", ".join(name for name, _ in tally.most_common(3))

# Question 3 driver: run the analysis on the default repositories.csv and print it.
top_3_licenses = get_top_3_licenses()
print("The 3 most popular licenses among these users:", top_3_licenses)

The 3 most popular licenses among these users: mit, apache-2.0, other


###Question 4

In [None]:
from collections import Counter

def most_common_company(filename="users.csv"):
    """Return the ``company`` value that appears on the most rows.

    Rows with an empty ``company`` are ignored. Values are compared exactly
    as stored in the file; no normalization is applied here.

    Args:
        filename: Path to a CSV file with a ``company`` column.

    Returns:
        The most frequent company string, or "" when no row has a company.
    """
    with open(filename, mode="r", encoding="utf-8") as file:
        company_counts = Counter(
            row["company"] for row in csv.DictReader(file) if row["company"]
        )

    # Guard against an empty tally: most_common(1) would be [] and
    # indexing it would raise IndexError.
    if not company_counts:
        return ""

    return company_counts.most_common(1)[0][0]

# Question 4 driver: run the analysis on the default users.csv and print it.
majority_company = most_common_company()
print("Company with the majority of developers:", majority_company)


Company with the majority of developers: BYTEDANCE


###Question 5

In [None]:
def most_popular_language(filename="repositories.csv"):
    """Return the ``language`` value that appears on the most rows.

    Rows with an empty ``language`` are ignored.

    Args:
        filename: Path to a CSV file with a ``language`` column.

    Returns:
        The most frequent language, or "" when no row has a language.
    """
    with open(filename, mode="r", encoding="utf-8") as file:
        language_counts = Counter(
            row["language"] for row in csv.DictReader(file) if row["language"]
        )

    # Guard against an empty tally: most_common(1) would be [] and
    # indexing it would raise IndexError.
    if not language_counts:
        return ""

    return language_counts.most_common(1)[0][0]

# Question 5 driver: run the analysis on the default repositories.csv and print it.
popular_language = most_popular_language()
print("Most popular programming language:", popular_language)


Most popular programming language: JavaScript


###Question 6

In [None]:
def second_popular_language_after_2020(users_file="users.csv", repos_file="repositories.csv"):
    """Return the second most frequent repository language of recent users.

    A user counts as recent when the year of ``created_at`` is 2020 or
    later (2020 itself is included). Repository rows with an empty
    ``language`` are ignored.

    Args:
        users_file: Path to a CSV file with ``login`` and ``created_at``.
        repos_file: Path to a CSV file with ``login`` and ``language``.

    Returns:
        The second most frequent language, or "" when fewer than two
        distinct languages are found.
    """
    recent_logins = set()
    with open(users_file, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            joined = datetime.strptime(record["created_at"], "%Y-%m-%dT%H:%M:%SZ")
            if joined.year >= 2020:
                recent_logins.add(record["login"])

    with open(repos_file, mode="r", encoding="utf-8") as handle:
        tally = Counter(
            record["language"]
            for record in csv.DictReader(handle)
            if record["login"] in recent_logins and record["language"]
        )

    ranked = tally.most_common(2)
    if len(ranked) > 1:
        return ranked[1][0]
    return ""

# Question 6 driver: run the analysis on the default CSV files and print it.
second_popular_language = second_popular_language_after_2020()
print("Second most popular language after 2020:", second_popular_language)


Second most popular language after 2020: Go


###Question 7

In [None]:
from collections import defaultdict

def language_with_highest_avg_stars(filename="repositories.csv"):
    """Return the language whose repositories have the highest mean stars.

    Rows with an empty ``language`` are ignored.

    Args:
        filename: Path to a CSV file with ``language`` and
            ``stargazers_count`` columns.

    Returns:
        The language with the highest average ``stargazers_count``, or ""
        when no row has a language.

    Raises:
        ValueError: If a ``stargazers_count`` value is not an integer.
    """
    language_stars = defaultdict(list)

    with open(filename, mode="r", encoding="utf-8") as file:
        for row in csv.DictReader(file):
            language = row["language"]
            if language:
                language_stars[language].append(int(row["stargazers_count"]))

    # max() raises ValueError on an empty mapping, so bail out first.
    if not language_stars:
        return ""

    avg_stars = {
        lang: sum(stars) / len(stars) for lang, stars in language_stars.items()
    }
    return max(avg_stars, key=avg_stars.get)

# Question 7 driver: run the analysis on the default repositories.csv and print it.
highest_avg_stars_language = language_with_highest_avg_stars()
print("Language with highest average stars:", highest_avg_stars_language)


Language with highest average stars: Vue


###Question 8

In [None]:
def top_5_leader_strength(filename="users.csv"):
    """Return the five logins with the highest leader strength.

    Leader strength is ``followers / (1 + following)``.

    Args:
        filename: Path to a CSV file with ``login``, ``followers`` and
            ``following`` columns.

    Returns:
        Up to five logins joined by "," (no space), strongest first.
        Users with equal strength keep their file order.

    Raises:
        ValueError: If ``followers`` or ``following`` is not an integer.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        scored = [
            (
                record["login"],
                int(record["followers"]) / (1 + int(record["following"])),
            )
            for record in csv.DictReader(handle)
        ]

    # Stable descending sort on the score only.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return ",".join(login for login, _ in scored[:5])

# Question 8 driver: run the analysis on the default users.csv and print it.
top_5_leaders = top_5_leader_strength()
print("Top 5 users by leader strength:", top_5_leaders)


Top 5 users by leader strength: ruanyf,peng-zhihui,espressif,vnpy,bilibili


###Question 9

In [None]:
import csv
import numpy as np

def followers_repos_correlation(filename="users.csv"):
    """Return the Pearson correlation of ``followers`` and ``public_repos``.

    Args:
        filename: Path to a CSV file with ``followers`` and
            ``public_repos`` columns.

    Returns:
        The off-diagonal entry of ``np.corrcoef`` for the two columns.

    Raises:
        ValueError: If either column holds a non-integer value.
    """
    pairs = []
    with open(filename, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            pairs.append((int(record["followers"]), int(record["public_repos"])))

    follower_counts = [followers for followers, _ in pairs]
    repo_counts = [repos for _, repos in pairs]

    matrix = np.corrcoef(follower_counts, repo_counts)
    return matrix[0, 1]

# Question 9 driver: run the analysis on the default users.csv and print it.
correlation_value = followers_repos_correlation()
print("Correlation between followers and public repos:", correlation_value)


Correlation between followers and public repos: -0.0050976020145691615


###Question 10

In [None]:
import csv
import numpy as np
from sklearn.linear_model import LinearRegression

def repos_followers_regression(filename="users.csv"):
    """Return the slope of followers regressed on public repositories.

    Fits ``LinearRegression`` with ``public_repos`` as the single feature
    and ``followers`` as the target.

    Args:
        filename: Path to a CSV file with ``followers`` and
            ``public_repos`` columns.

    Returns:
        The fitted coefficient for ``public_repos``.

    Raises:
        ValueError: If either column holds a non-integer value.
    """
    samples = []
    with open(filename, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            samples.append((int(record["followers"]), int(record["public_repos"])))

    # The estimator expects a 2-D feature matrix, hence the reshape.
    features = np.array([repos for _, repos in samples]).reshape(-1, 1)
    targets = np.array([followers for followers, _ in samples])

    fitted = LinearRegression().fit(features, targets)
    return fitted.coef_[0]

# Question 10 driver: run the analysis on the default users.csv and print it.
additional_followers = repos_followers_regression()
print("Additional followers per additional repo:", additional_followers)


Additional followers per additional repo: -0.05431772559230106


###Question 11

In [2]:
import csv
import numpy as np

def projects_wiki_correlation(filename="repositories.csv"):
    """Return the correlation between the projects and wiki flags.

    Each flag is coded as 1 when its text equals "true" in any letter case
    (surrounding whitespace ignored) and 0 otherwise.

    Args:
        filename: Path to a CSV file with ``has_projects`` and ``has_wiki``
            columns.

    Returns:
        The Pearson correlation rounded to three decimals, or ``None``
        when the file has no data rows.
    """
    has_projects = []
    has_wiki = []

    with open(filename, mode="r", encoding="utf-8") as file:
        for row in csv.DictReader(file):
            # Case-insensitive so "True" and "true" are both accepted.
            has_projects.append(
                1 if str(row["has_projects"]).strip().lower() == "true" else 0
            )
            has_wiki.append(
                1 if str(row["has_wiki"]).strip().lower() == "true" else 0
            )

    # Both lists grow together, so only the empty case needs a guard.
    if not has_projects:
        return None

    correlation = np.corrcoef(has_projects, has_wiki)[0, 1]
    return round(correlation, 3)

# Question 11 driver: run the analysis on the default repositories.csv and print it.
projects_wiki_corr = projects_wiki_correlation()
print("Correlation between projects and wiki enabled:", projects_wiki_corr)

Correlation between projects and wiki enabled: 0.309


###Question 12

In [None]:
import csv
import numpy as np

def hireable_following_difference(filename="users.csv"):
    """Return mean ``following`` of hireable users minus that of the rest.

    A user is hireable when the ``hireable`` text equals "true" in any
    letter case (surrounding whitespace ignored); every other value,
    including an empty one, counts as not hireable.

    Args:
        filename: Path to a CSV file with ``following`` and ``hireable``
            columns.

    Returns:
        The difference of the two group means rounded to three decimals.
        A group with no members contributes a mean of 0.

    Raises:
        ValueError: If a ``following`` value is not an integer.
    """
    hireable_following = []
    not_hireable_following = []

    with open(filename, mode="r", encoding="utf-8") as file:
        for row in csv.DictReader(file):
            following = int(row["following"])
            # Case-insensitive: an exact match on "true" misses "True",
            # which would put every user in the not-hireable group.
            if str(row["hireable"]).strip().lower() == "true":
                hireable_following.append(following)
            else:
                not_hireable_following.append(following)

    avg_hireable_following = np.mean(hireable_following) if hireable_following else 0
    avg_not_hireable_following = (
        np.mean(not_hireable_following) if not_hireable_following else 0
    )

    return round(avg_hireable_following - avg_not_hireable_following, 3)

# Question 12 driver: run the analysis on the default users.csv and print it.
following_difference = hireable_following_difference()
print("Average following difference (hireable - not hireable):", following_difference)


Average following difference (hireable - not hireable): -203.623


###Question 13

In [None]:
import csv
import numpy as np
from sklearn.linear_model import LinearRegression

def bio_length_followers_regression(filename="users.csv"):
    """Return the slope of followers regressed on bio word count.

    Only rows with a non-empty ``bio`` are used. Bio length is the number
    of whitespace-separated words.

    Args:
        filename: Path to a CSV file with ``bio`` and ``followers`` columns.

    Returns:
        The fitted ``LinearRegression`` coefficient rounded to three
        decimals, or ``None`` when fewer than two rows have a bio.

    Raises:
        ValueError: If a ``followers`` value on a used row is not an integer.
    """
    word_counts = []
    follower_counts = []

    with open(filename, mode="r", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            text = record["bio"]
            if not text:
                continue
            word_counts.append(len(text.split()))
            follower_counts.append(int(record["followers"]))

    if len(word_counts) < 2:
        return None

    # The estimator expects a 2-D feature matrix, hence the reshape.
    features = np.array(word_counts).reshape(-1, 1)
    targets = np.array(follower_counts)

    fitted = LinearRegression().fit(features, targets)
    return round(fitted.coef_[0], 3)

# Question 13 driver: run the analysis on the default users.csv and print it.
bio_followers_slope = bio_length_followers_regression()
print("Impact of bio length on followers (slope):", bio_followers_slope)


Impact of bio length on followers (slope): -42.232


###Question 14

In [None]:
from collections import Counter
from datetime import datetime

def top_5_weekend_creators(filename="repositories.csv"):
    """Return the five logins with the most weekend-created repositories.

    A repository counts when the weekday of its ``created_at`` timestamp
    is Saturday or Sunday. The timestamp is used as written, with no
    time-zone conversion.

    Args:
        filename: Path to a CSV file with ``login`` and ``created_at``
            columns.

    Returns:
        Up to five logins joined by "," (no space), most repositories first.

    Raises:
        ValueError: If a ``created_at`` value does not match the format.
    """
    with open(filename, mode="r", encoding="utf-8") as handle:
        weekend_logins = [
            record["login"]
            for record in csv.DictReader(handle)
            # weekday(): Monday is 0, so 5 and 6 are Saturday and Sunday.
            if datetime.strptime(record["created_at"], "%Y-%m-%dT%H:%M:%SZ").weekday() >= 5
        ]

    ranking = Counter(weekend_logins).most_common(5)
    return ",".join(login for login, _ in ranking)

# Question 14 driver: run the analysis on the default repositories.csv and print it.
top_5_weekend_users = top_5_weekend_creators()
print("Top 5 users who created the most repos on weekends:", top_5_weekend_users)


Top 5 users who created the most repos on weekends: davideuler,songquanpeng,taoso,xen0n,leeight


###Question 15

In [None]:
import pandas as pd

def analyze_email_sharing(users_csv_path='users.csv'):
    """Compare how often hireable and other users list an email address.

    A user has an email when the ``email`` cell is neither missing nor an
    empty string. Users whose ``hireable`` value is not ``True`` (including
    missing values) form the non-hireable group. A short summary of both
    groups is printed.

    Args:
        users_csv_path: Path to a CSV file with ``email`` and ``hireable``
            columns.

    Returns:
        Hireable email fraction minus non-hireable email fraction, rounded
        to three decimals. An empty group contributes a fraction of 0.
    """
    df = pd.read_csv(users_csv_path)

    emails = df['email']
    df['has_email'] = emails.notna() & (emails != '')

    # Explicit comparison with True so missing values land in the
    # non-hireable group.
    is_hireable = df['hireable'] == True  # noqa: E712
    not_hireable = df['hireable'] != True  # noqa: E712

    hireable_rows = df[is_hireable]
    other_rows = df[not_hireable]

    hireable_email_fraction = hireable_rows['has_email'].mean() if is_hireable.any() else 0
    non_hireable_email_fraction = other_rows['has_email'].mean() if not_hireable.any() else 0

    difference = round(hireable_email_fraction - non_hireable_email_fraction, 3)

    print(f"Total users: {len(df)}")
    print(f"Hireable users with email: {hireable_rows['has_email'].sum()}/{is_hireable.sum()}")
    print(f"Non-hireable users with email: {other_rows['has_email'].sum()}/{not_hireable.sum()}")
    print(f"Hireable fraction: {hireable_email_fraction:.3f}")
    print(f"Non-hireable fraction: {non_hireable_email_fraction:.3f}")

    return difference

# Question 15 driver: run the analysis on the default users.csv and print
# the returned difference with three decimals.
result = analyze_email_sharing()
print(f"\nFinal result: {result:.3f}")

Total users: 742
Hireable users with email: 160/215
Non-hireable users with email: 353/527
Hireable fraction: 0.744
Non-hireable fraction: 0.670

Final result: 0.074


###Question 16

In [None]:
from collections import Counter

def most_common_surname(filename="users.csv"):
    """Return the most frequent surname(s) among users with a name.

    The surname is the last whitespace-separated word of ``name``. Rows
    whose name is empty or only whitespace are ignored.

    Args:
        filename: Path to a CSV file with a ``name`` column.

    Returns:
        The surname(s) with the highest count, sorted alphabetically and
        joined by ",". Returns "" when no row has a usable name.
    """
    surname_counts = Counter()

    with open(filename, mode="r", encoding="utf-8") as file:
        for row in csv.DictReader(file):
            # split() with no arguments drops surrounding whitespace and
            # yields [] for a blank name, which would break [-1].
            words = (row["name"] or "").split()
            if words:
                surname_counts[words[-1]] += 1

    # max() raises ValueError on an empty sequence, so bail out first.
    if not surname_counts:
        return ""

    max_count = max(surname_counts.values())
    most_common_surnames = [
        surname for surname, count in surname_counts.items() if count == max_count
    ]
    return ",".join(sorted(most_common_surnames))

# Question 16 driver: run the analysis on the default users.csv and print it.
common_surname = most_common_surname()
print("Most common surname(s):", common_surname)


Most common surname(s): Zhang
