In [None]:
import requests
import pandas as pd
import os
import time

# GitHub API setup
# The token is read from the GITHUB_TOKEN environment variable. Export it in the
# shell (or a local, untracked .env) before starting the kernel -- never paste a
# real token into the notebook, where it would be saved and shared with the file.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Only send an Authorization header when a token is actually available; sending
# "token None" would turn every request into a bad-credentials error.
HEADERS = {}
if GITHUB_TOKEN:
    HEADERS['Authorization'] = f'token {GITHUB_TOKEN}'
else:
    print("GITHUB_TOKEN is not set; requests will be sent unauthenticated.")

BASE_URL = 'https://api.github.com'

# Function to get users from Shanghai with over 200 followers
def get_users_in_shanghai(min_followers=200, city='Shanghai'):
    """Search GitHub for users located in ``city`` with more than ``min_followers`` followers.

    Pages through the user-search endpoint 100 results at a time until the
    response no longer advertises a ``next`` page, pausing one second between
    pages.

    Args:
        min_followers: Exclusive lower bound on the follower count.
        city: Location to match. Despite the function name, any city works.

    Returns:
        list: The search-result items collected so far. If a page comes back
        with a non-200 status, the error is printed and the partial list is
        returned.
    """
    # A multi-word location must be quoted, otherwise only its first word is
    # treated as the location qualifier.
    location = f'"{city}"' if ' ' in city else city
    params = {
        'q': f'location:{location} followers:>{min_followers}',
        'per_page': 100,
        'page': 1,
    }

    users = []
    while True:
        # Passing the query through `params` lets requests URL-encode it;
        # the timeout makes a stalled connection fail instead of hanging forever.
        response = requests.get(
            f"{BASE_URL}/search/users",
            headers=HEADERS,
            params=params,
            timeout=30,
        )
        if response.status_code != 200:
            print(f"Error fetching users: {response.status_code} {response.text}")
            break
        users.extend(response.json().get('items', []))
        if 'next' not in response.links:
            break
        params['page'] += 1
        time.sleep(1)  # Respect GitHub's rate limiting
    return users

# Function to clean company name
def clean_company(company):
    """Normalise a company string for comparison.

    Surrounding whitespace is trimmed, every ``@`` character is removed and
    the result is upper-cased. Falsy input (``None`` or an empty string) is
    handed back unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.replace("@", "")
    return without_at.upper()

# Function to fetch detailed user data
def get_user_details(login):
    """Fetch the full public profile of a single GitHub account.

    Args:
        login: GitHub username to look up.

    Returns:
        dict: The decoded JSON profile when the request returns HTTP 200;
        otherwise an empty dict, after printing the status code and body.
    """
    url = f"{BASE_URL}/users/{login}"
    # Without a timeout, one stalled connection would block the whole crawl.
    response = requests.get(url, headers=HEADERS, timeout=30)
    if response.status_code != 200:
        print(f"Error fetching user details for {login}: {response.status_code} {response.text}")
        return {}
    return response.json()

# Fetch repository information for each user
def get_user_repos(login, max_repos=500):
    """Fetch the repositories listed for a user, up to ``max_repos`` of them.

    Args:
        login: GitHub username whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned
            (defaults to 500, the limit this notebook has always used).

    Returns:
        list: Repository dicts in the order the API returned them. If a page
        comes back with a non-200 status, the error is printed and whatever
        was collected so far is returned.
    """
    per_page = 100
    repos = []
    page = 1
    while len(repos) < max_repos:
        # The timeout makes a stalled connection fail instead of hanging forever.
        response = requests.get(
            f"{BASE_URL}/users/{login}/repos",
            headers=HEADERS,
            params={'per_page': per_page, 'page': page},
            timeout=30,
        )
        if response.status_code != 200:
            print(f"Error fetching repos for {login}: {response.status_code} {response.text}")
            break
        data = response.json()
        repos += data
        # A short (or empty) page is the last one, so stop here rather than
        # spending another request and another sleep on an empty page.
        if len(data) < per_page:
            break
        page += 1
        time.sleep(1)  # Respect GitHub's rate limiting
    return repos[:max_repos]

# Writing users to CSV using pandas
def write_users_csv(users):
    """Fetch the full profile of every user and save them to ``users.csv``.

    Args:
        users: Iterable of dicts that each carry a ``'login'`` key (for
            example the items returned by ``get_users_in_shanghai``).

    Side effects:
        Calls ``get_user_details`` once per user, sleeping one second after
        each call, and overwrites ``users.csv`` in the working directory.
        Users whose details could not be fetched are left out of the file.
    """
    columns = [
        'login', 'name', 'company', 'location', 'email',
        'hireable', 'bio', 'public_repos', 'followers',
        'following', 'created_at'
    ]

    user_data = []
    for user in users:
        details = get_user_details(user['login'])
        time.sleep(1)  # Respect GitHub's rate limiting
        if not details:
            # get_user_details already printed the failure. Skipping avoids
            # writing a row in which every field, including login, is empty.
            continue
        record = {column: details.get(column) for column in columns}
        record['company'] = clean_company(record['company'])
        user_data.append(record)

    # Passing `columns` keeps the header present even when no user was fetched.
    df = pd.DataFrame(user_data, columns=columns)
    df.to_csv('users.csv', index=False)

# Writing repositories to CSV using pandas
def write_repos_csv(users):
    """Collect every user's repositories and save them to ``repositories.csv``.

    For each entry in ``users`` (dicts carrying a ``'login'`` key) the
    repositories are fetched with ``get_user_repos`` and flattened into one
    row per repository, followed by a one-second pause per user. A repository
    without license information is recorded with the literal text ``'None'``.
    """
    columns = [
        'login', 'full_name', 'created_at', 'stargazers_count',
        'watchers_count', 'language', 'has_projects', 'has_wiki',
        'license_name'
    ]

    repo_data = []
    for user in users:
        login = user['login']
        for repo in get_user_repos(login):
            license_info = repo.get('license')
            if license_info:
                license_name = license_info.get('name')
            else:
                license_name = 'None'
            row = [
                login,
                repo.get('full_name'),
                repo.get('created_at'),
                repo.get('stargazers_count'),
                repo.get('watchers_count'),
                repo.get('language'),
                repo.get('has_projects'),
                repo.get('has_wiki'),
                license_name,
            ]
            repo_data.append(row)
        time.sleep(1)  # Respect GitHub's rate limiting

    pd.DataFrame(repo_data, columns=columns).to_csv('repositories.csv', index=False)

# Main Execution
# Search for the matching users once, then reuse that same list to build both
# CSV files. Each step calls the GitHub API and sleeps between requests, and
# the two CSV files in the working directory are overwritten.
if __name__ == '__main__':
    users = get_users_in_shanghai(min_followers=200)
    write_users_csv(users)
    write_repos_csv(users)


In [None]:
import csv

def write_users_csv(users):
    """Write already-fetched user profiles to ``users.csv`` with the csv module.

    NOTE(review): this re-uses the name of the pandas-based ``write_users_csv``
    defined earlier in the notebook, so running this cell replaces it.

    Each item of ``users`` must be a mapping that already holds every profile
    field; apart from ``company`` (read with ``.get``) a missing key raises
    ``KeyError``. The company value is passed through ``clean_company``.
    """
    columns = [
        'login', 'name', 'company', 'location', 'email',
        'hireable', 'bio', 'public_repos', 'followers',
        'following', 'created_at'
    ]
    with open('users.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        for user in users:
            row = [
                clean_company(user.get('company')) if column == 'company' else user[column]
                for column in columns
            ]
            writer.writerow(row)


In [None]:
import pandas as pd

# Load the two CSV files produced by the crawl cells above into DataFrames.
users = pd.read_csv(filepath_or_buffer='users.csv')
repos = pd.read_csv(filepath_or_buffer='repositories.csv')

In [None]:
# Normalise company names: trim whitespace, drop leading '@' characters,
# then upper-case. Missing values stay missing.
users['company'] = (
    users['company']
    .str.strip()
    .str.lstrip('@')
    .str.upper()
)

In [None]:
# Preview the first rows of the users table.
# NOTE(review): the rendered output includes real names and e-mail addresses;
# consider clearing it before sharing the notebook.
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,peng-zhihui,稚晖,HUAWEI RESEARCH,Shanghai,prime_zhihui@foxmail.com,,野生钢铁侠本侠。,56,80545,9,2015-06-22T04:59:39Z
1,ruanyf,Ruan YiFeng,,"Shanghai, China",yifeng.ruan@gmail.com,,,72,79285,0,2011-07-10T01:07:17Z
2,phodal,Fengda Huang,THOUGHTWORKS,"Shanghai / Hangzhou, China",h@phodal.com,True,I'm digging holes.,367,20050,9,2010-11-08T11:46:51Z
3,liyupi,程序员鱼皮,编程学习公众号【程序员鱼皮】,China Shanghai,592789970@qq.com,True,speak less do more！前腾讯全栈开发，现科技公司创始人,82,17339,29,2017-02-26T08:44:22Z
4,stormzhang,stormzhang,"BOOHEE, INC.","Shanghai, China",stormzhang.dev@gmail.com,,微信公众号：stormzhang,5,15879,91,2012-09-03T02:58:11Z


In [None]:
# Replace every missing value in the users table with an empty string.
# This applies to all columns, so blank `hireable` entries become '' as well.
users = users.fillna('')

In [None]:
# Show column dtypes and non-null counts to confirm no missing values remain.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 738 entries, 0 to 737
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   login         738 non-null    object
 1   name          738 non-null    object
 2   company       738 non-null    object
 3   location      738 non-null    object
 4   email         738 non-null    object
 5   hireable      738 non-null    object
 6   bio           738 non-null    object
 7   public_repos  738 non-null    int64 
 8   followers     738 non-null    int64 
 9   following     738 non-null    int64 
 10  created_at    738 non-null    object
dtypes: int64(3), object(8)
memory usage: 63.5+ KB


In [None]:
# Render `hireable` as the lowercase labels 'true' / 'false'; anything else
# (blank or missing) becomes ''.
# Going through the text form accepts both real booleans and labels that were
# already converted, so re-running this cell no longer wipes the column to NaN,
# and blanks filled in earlier are not silently turned back into NaN.
users['hireable'] = (
    users['hireable']
    .astype(str)
    .str.strip()
    .str.lower()
    .map({'true': 'true', 'false': 'false'})
    .fillna('')
)


In [None]:
# Preview the users table again after the `hireable` conversion.
# NOTE(review): the rendered output includes real names and e-mail addresses;
# consider clearing it before sharing the notebook.
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,peng-zhihui,稚晖,HUAWEI RESEARCH,Shanghai,prime_zhihui@foxmail.com,,野生钢铁侠本侠。,56,80545,9,2015-06-22T04:59:39Z
1,ruanyf,Ruan YiFeng,,"Shanghai, China",yifeng.ruan@gmail.com,,,72,79285,0,2011-07-10T01:07:17Z
2,phodal,Fengda Huang,THOUGHTWORKS,"Shanghai / Hangzhou, China",h@phodal.com,True,I'm digging holes.,367,20050,9,2010-11-08T11:46:51Z
3,liyupi,程序员鱼皮,编程学习公众号【程序员鱼皮】,China Shanghai,592789970@qq.com,True,speak less do more！前腾讯全栈开发，现科技公司创始人,82,17339,29,2017-02-26T08:44:22Z
4,stormzhang,stormzhang,"BOOHEE, INC.","Shanghai, China",stormzhang.dev@gmail.com,,微信公众号：stormzhang,5,15879,91,2012-09-03T02:58:11Z


In [None]:
# Preview the first rows of the repositories table.
repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,peng-zhihui,peng-zhihui/A-Eye,2019-10-02T15:54:46Z,1180,1180,C,True,True,
1,peng-zhihui,peng-zhihui/AimRT,2024-09-23T10:32:35Z,69,69,,True,True,Other
2,peng-zhihui,peng-zhihui/arduino-boost,2017-05-17T06:28:34Z,54,54,C++,True,True,Boost Software License 1.0
3,peng-zhihui,peng-zhihui/BilibiliLottery,2020-03-28T14:40:45Z,222,222,HTML,True,True,
4,peng-zhihui,peng-zhihui/BluetoothTouch,2020-11-26T14:51:25Z,489,489,Java,True,True,GNU General Public License v3.0


In [None]:
# Render the two repository flags as the lowercase labels 'true' / 'false';
# anything else (blank or missing) becomes ''.
# Going through the text form accepts both real booleans and labels that were
# already converted, so re-running this cell no longer wipes the columns to NaN.
for flag_column in ('has_projects', 'has_wiki'):
    repos[flag_column] = (
        repos[flag_column]
        .astype(str)
        .str.strip()
        .str.lower()
        .map({'true': 'true', 'false': 'false'})
        .fillna('')
    )

In [None]:
# Replace every missing value in the repositories table with an empty string.
repos = repos.fillna('')

## Q1

In [None]:
# Rank users by follower count (highest first) and keep the first five rows.
top_5_users = users.sort_values(by='followers', ascending=False).iloc[:5]
print(top_5_users.loc[:, ['login', 'followers']])


         login  followers
0  peng-zhihui      80545
1       ruanyf      79285
2       phodal      20050
3       liyupi      17339
4   stormzhang      15879


## Q2

In [None]:
# Parse the registration timestamps so they sort chronologically.
users['created_at'] = pd.to_datetime(users['created_at'])

# The five oldest accounts are the first five rows of an ascending sort.
earliest_users = users.sort_values(by='created_at').iloc[:5]

# Collect their logins and print them comma-separated.
earliest_users_logins = list(earliest_users['login'])
print(','.join(earliest_users_logins))


osteele,mrluanma,ShiningRay,rainux,why404


## 3

In [None]:
# Keep only repositories that actually name a license (neither missing nor blank).
repos_with_license = repos.loc[
    repos['license_name'].notna() & repos['license_name'].ne('')
]

# value_counts() sorts by frequency, so the first three entries are the top three.
top_licenses = repos_with_license['license_name'].value_counts().iloc[:3]
top_licenses_list = list(top_licenses.index)

# Print the license names comma-separated.
print(','.join(top_licenses_list))


MIT License,Apache License 2.0,Other


## 4

In [None]:

# Ignore users whose company field is blank.
valid_companies = users.loc[users['company'].ne('')]

# Tally each company and pick the one with the highest count.
most_common_company = valid_companies['company'].value_counts().idxmax()

print(most_common_company)

BYTEDANCE


## 5

In [None]:
# Drop repositories with no detected language (missing or blank).
valid_languages = repos.loc[
    repos['language'].notna() & repos['language'].ne('')
]

# Tally each language and pick the one with the highest count.
most_popular_language = valid_languages['language'].value_counts().idxmax()

print(most_popular_language)


JavaScript


## 6

In [None]:
users['created_at'] = pd.to_datetime(users['created_at'])

# NOTE(review): this keeps accounts created after 2020-01-01, so accounts
# registered during 2020 are included. Confirm that this is what
# "joined after 2020" is meant to cover.
users_after_2020 = users[users['created_at'] > '2020-01-01']

# Filter repositories to keep only those from the users who joined after 2020
after_2020_repos = repos[repos['login'].isin(users_after_2020['login'])]

# Count occurrences of each programming language. Repositories without a
# detected language (missing or blank) are left out, as in the other language
# questions, so that "no language" cannot take one of the top places.
known_language = after_2020_repos['language'].notna() & (after_2020_repos['language'] != '')
language_counts = after_2020_repos.loc[known_language, 'language'].value_counts()

# Get the second most popular language (position 1 of the frequency-sorted
# counts); None when fewer than two languages are present.
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None

print(second_most_popular_language)


Go


## 7

In [None]:
# Step 1: keep repositories that have both a language and a star count.
valid_repos = repos.loc[
    repos['language'].notna()
    & repos['language'].ne('')
    & repos['stargazers_count'].notna()
]

# Step 2: mean stars per repository for every language.
average_stars_per_language = valid_repos.groupby('language')['stargazers_count'].mean()

# Step 3: the language whose mean is largest.
highest_average_language = average_stars_per_language.idxmax()

print(f"Language: {highest_average_language}")


Language: Rich Text Format


## 8

In [None]:
# Step 1: leader strength = followers / (1 + following); the +1 avoids
# dividing by zero for accounts that follow nobody.
users['leader_strength'] = users['followers'].div(users['following'].add(1))

# Step 2: order by leader strength, strongest first, and keep five rows.
top_leaders = users.sort_values('leader_strength', ascending=False).iloc[:5]

# Step 3: print their logins comma-separated.
top_leader_logins = list(top_leaders['login'])
print(','.join(top_leader_logins))

ruanyf,peng-zhihui,espressif,vnpy,bilibili


## 9

In [None]:
# Pearson correlation between follower count and number of public repositories.
correlation = users['followers'].corr(users['public_repos'], method='pearson')

# Report it with three decimal places.
print(f"{correlation:.3f}")

-0.005


## 10

In [None]:
import statsmodels.api as sm

# Response: follower count.
y = users['followers']

# Predictor: number of public repositories, with a constant column added so
# the fitted model includes an intercept.
X = sm.add_constant(users['public_repos'])

# Ordinary least squares fit of followers on public_repos.
model = sm.OLS(endog=y, exog=X).fit()

# The coefficient on public_repos is the regression slope.
slope = model.params['public_repos']

# Report it with three decimal places.
print(f"{slope:.3f}")

-0.056


## 11

In [None]:
# Build numeric 0/1 indicators for the two flags without modifying `repos`.
# Going through the text form accepts real booleans as well as the
# 'true'/'false' labels written by the earlier formatting cell, so the
# correlation no longer depends on which cells were run before this one.
has_projects_flag = repos['has_projects'].astype(str).str.strip().str.lower().eq('true').astype(int)
has_wiki_flag = repos['has_wiki'].astype(str).str.strip().str.lower().eq('true').astype(int)

# Calculate the correlation between has_projects and has_wiki
correlation = has_projects_flag.corr(has_wiki_flag)

# Print the correlation to 3 decimal places
print(f"{correlation:.3f}")

0.302


## 12

In [None]:
# `hireable` may hold real booleans or the 'true'/'false' labels produced by the
# formatting cell, and blank e-mails may be '' rather than NaN after fillna('').
# Normalise both, so the comparison does not silently select zero rows (which
# is what made this cell print "nan").
is_hireable = users['hireable'].astype(str).str.strip().str.lower().eq('true')
has_email = users['email'].fillna('').astype(str).str.strip().ne('')

# Fraction of hireable users who share an e-mail address
hireable_with_email = has_email[is_hireable].mean()

# Fraction of all remaining users (hireable false or not stated) who share one
not_hireable_with_email = has_email[~is_hireable].mean()

# Calculate the difference
difference = hireable_with_email - not_hireable_with_email

# Print the result to 3 decimal places
print(f"{difference:.3f}")


nan


## 13

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Re-read the users file so that missing bios are still NaN here.
users_df = pd.read_csv('users.csv')

# Bio length in characters; users without a bio get NaN through index alignment.
users_df['bio_length'] = users_df['bio'].dropna().map(len)

# Keep only the users that have a bio.
filtered_df = users_df.dropna(subset=['bio_length'])

# Predictor matrix (one column) and response vector.
X = filtered_df[['bio_length']]
y = filtered_df['followers']

# Fit followers ~ bio_length.
model = LinearRegression().fit(X, y)

# The single coefficient is the regression slope.
regression_slope = model.coef_[0]

print(f"Regression slope of followers on bio length: {regression_slope:.3f}")

Regression slope of followers on bio length: -5.417


## 14

In [None]:
# Parse the repository creation timestamps.
repos['created_at'] = pd.to_datetime(repos['created_at'])

# weekday is 0 for Monday, so 5 and 6 are Saturday and Sunday.
repos['is_weekend'] = repos['created_at'].dt.weekday.ge(5)

weekend_repos = repos.loc[repos['is_weekend']]

# The five logins that own the most weekend-created repositories.
top = list(weekend_repos['login'].value_counts().iloc[:5].index)

print(','.join(top))


losfair,j5s,Yuan-ManX,gonnavis,yuzd


## 15

In [None]:
# Step 1: Split users into hireable and everyone else.
# `hireable` may hold real booleans or the 'true'/'false' labels produced by the
# formatting cell, and blank e-mails may be '' rather than NaN after fillna('').
# Normalise both, so the groups are not silently empty (which is what made
# this cell report a difference of 0.000).
is_hireable = users['hireable'].astype(str).str.strip().str.lower().eq('true')
has_email = users['email'].fillna('').astype(str).str.strip().ne('')

hireable_users = users[is_hireable]
non_hireable_users = users[~is_hireable]  # hireable false or not stated

# Count total users
total_hireable = len(hireable_users)
total_non_hireable = len(non_hireable_users)

# Count users with a non-blank email address
hireable_with_email = has_email[is_hireable].sum()
non_hireable_with_email = has_email[~is_hireable].sum()

# Step 2: Calculate fractions (0 when a group is empty, to avoid dividing by zero)
if total_hireable > 0:
    hireable_email_fraction = hireable_with_email / total_hireable
else:
    hireable_email_fraction = 0

if total_non_hireable > 0:
    non_hireable_email_fraction = non_hireable_with_email / total_non_hireable
else:
    non_hireable_email_fraction = 0

# Step 3: Calculate the difference
difference = hireable_email_fraction - non_hireable_email_fraction

# Display the result rounded to 3 decimal places
print(f"Difference in email sharing fraction: {difference:.3f}")


Difference in email sharing fraction: 0.000


## 16

In [None]:
# Trim whitespace around the names that are present; missing ones stay missing.
users['name'] = users['name'].dropna().str.strip()

# Treat the last whitespace-separated word of the name as the surname.
users['surname'] = users['name'].str.split().str.get(-1)

# How often each surname occurs, and the highest of those counts.
surname_counts = users['surname'].value_counts()
max_count = surname_counts.max()

# All surnames tied at the highest count, in alphabetical order.
most_common_surnames = sorted(surname_counts[surname_counts == max_count].index)

print("The most common surname(s):", ", ".join(most_common_surnames))


The most common surname(s): Zhang
