In [62]:
import os
import time

import pandas as pd
import requests

In [63]:
# GitHub API credentials
# The token is read from the GITHUB_TOKEN environment variable so that the
# secret is never stored in (or committed with) the notebook.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "").strip()
# Only send an Authorization header when a token is actually configured;
# an empty "Bearer " value would be a malformed credential.
HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

In [71]:
#Working
def fetch_users():
    """Collect profile records for GitHub users matching the Shanghai search query.

    Pages through the user search endpoint (100 hits per page) using the query
    ``location:Shanghai followers:>200`` and, for every hit, requests the
    user's own API URL to read the profile fields.

    Returns:
        list[dict]: one record per successfully fetched user with the keys
        login, name, company, location, email, hireable, bio, public_repos,
        followers, following and created_at. The company value is passed
        through ``clean_company_name``.

    Paging stops at the first empty page, the first non-200 search response
    or a network error; whatever was collected so far is returned.
    """
    users = []
    search_url = "https://api.github.com/search/users"
    query = "location:Shanghai followers:>200"
    page = 1
    while True:
        try:
            # Let requests build and escape the query string.
            response = requests.get(
                search_url,
                headers=HEADERS,
                params={"q": query, "per_page": 100, "page": page},
                timeout=30,
            )
        except requests.RequestException as exc:
            print(f"Error fetching users: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching users: {response.status_code}")
            break

        items = response.json().get("items", [])

        # If there are no more users, stop paging
        if not items:
            break

        # Fetch detailed user info for each item
        for item in items:
            try:
                user_response = requests.get(item["url"], headers=HEADERS, timeout=30)
            except requests.RequestException as exc:
                print(f"Error fetching user {item.get('login', '')}: {exc}")
                time.sleep(0.5)
                continue

            # Skip failed lookups instead of storing the error payload as a
            # row of empty fields.
            if user_response.status_code != 200:
                print(f"Error fetching user {item.get('login', '')}: {user_response.status_code}")
                time.sleep(0.5)
                continue

            user_data = user_response.json()

            # Append user info to users list
            users.append({
                "login": user_data.get("login", ""),
                "name": user_data.get("name", ""),
                "company": clean_company_name(user_data.get("company", "")),
                "location": user_data.get("location", ""),
                "email": user_data.get("email", ""),
                "hireable": str(user_data.get("hireable", "")),
                "bio": user_data.get("bio", ""),
                "public_repos": user_data.get("public_repos", 0),
                "followers": user_data.get("followers", 0),
                "following": user_data.get("following", 0),
                "created_at": user_data.get("created_at", "")
            })

            # Respect GitHub API rate limits
            time.sleep(0.5)

        # Move to the next page
        page += 1

    return users


In [72]:
import re

# Function to clean up company names
def clean_company_name(company):
    """Normalise a company value to a compact upper-case string.

    Missing values (anything ``pd.notna`` reports as missing, e.g. ``None``
    or NaN) become ``''``. Any other value is converted to ``str``, has
    every whitespace run removed (so internal spaces disappear as well),
    loses one leading ``'@'`` if present, and is upper-cased.
    """
    # Guard clause: nothing to clean for missing values.
    if not pd.notna(company):
        return ''

    # Delete all whitespace, not just the leading/trailing part.
    compact = re.sub(r'\s+', '', str(company)).strip()

    # Only the first '@' is dropped; any further ones are kept.
    if compact.startswith('@'):
        compact = compact[1:]

    return compact.upper()


In [73]:
# Scrape the users and persist them for the analysis cells.
# fetch_users() already passes every company through clean_company_name(),
# so no extra `.apply(clean_company_name)` pass is made here: an unassigned
# apply has no effect, and an assigned one would strip a second leading '@'.
users = fetch_users()
users_df = pd.DataFrame(users)
users_df.to_csv("users.csv", index=False)


In [88]:
# Function to fetch repositories with specific fields
def fetch_repositories(username, max_repos=500):
    """Fetch a user's repositories, most recently pushed first.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: upper bound on the number of repository records returned
            (defaults to 500).

    Returns:
        list[dict]: up to ``max_repos`` records with the keys login,
        full_name, created_at, stargazers_count, watchers_count, language,
        has_projects, has_wiki and license_name (the ``key`` of the repo's
        license object, or ``''`` when the repo has no license).

    A non-200 response or a network error is reported with ``print`` and
    ends paging; the records gathered so far are returned.
    """
    url = f"https://api.github.com/users/{username}/repos"
    per_page = 100
    all_repos = []
    page = 1

    while len(all_repos) < max_repos:
        try:
            response = requests.get(
                url,
                headers=HEADERS,
                params={"sort": "pushed", "per_page": per_page, "page": page},
                timeout=30,
            )
        except requests.RequestException as exc:
            print(f"Error fetching repositories for {username}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching repositories for {username}: {response.status_code}")
            break

        repos = response.json()

        if not repos:  # Break if there are no more repos
            break

        for repo in repos:
            # "license" may be missing or null; treat both as "no license".
            license_info = repo.get("license") or {}
            # Append required fields for each repository
            all_repos.append({
                "login": username,  # GitHub user ID
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_info.get("key", "")
            })

        # A short page is the last page: skip the extra request and sleep.
        if len(repos) < per_page:
            break

        page += 1
        time.sleep(1)  # Respect rate limits

    return all_repos[:max_repos]  # Return at most max_repos most recent repos

In [77]:
# Reload the saved users table and expose it as a list of per-user dicts
# (one dict per CSV row), then show the first record as a sanity check.
users_df = pd.read_csv("users.csv")
users = users_df.to_dict("records")
print(users[0])

{'login': 'peng-zhihui', 'name': '稚晖', 'company': 'HUAWEIRESEARCH', 'location': 'Shanghai', 'email': 'prime_zhihui@foxmail.com', 'hireable': nan, 'bio': '野生钢铁侠本侠。', 'public_repos': 59, 'followers': 80748, 'following': 9, 'created_at': '2015-06-22T04:59:39Z'}


In [89]:
# Main function to fetch user repositories and save to CSV
def main(user_records=None, output_path="repositories.csv"):
    """Fetch repositories for every user and save them to a CSV file.

    Args:
        user_records: iterable of dicts that each have a "login" key. When
            omitted, the notebook-level ``users`` list is used.
        output_path: destination CSV path (defaults to "repositories.csv").
    """
    if user_records is None:
        # Keep the original behaviour of reading the notebook-level list.
        user_records = users

    repos = []

    for user in user_records:
        # fetch_repositories returns a (possibly empty) list of records.
        repos.extend(fetch_repositories(user["login"]))

        # Respect rate limits
        time.sleep(1)

    # Save data to the output CSV
    pd.DataFrame(repos).to_csv(output_path, index=False)

if __name__ == "__main__":
    main()


In [58]:
# Load the saved users table for inspection
df = pd.read_csv("users.csv")

In [59]:
# Column dtypes and non-null counts of the users table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   login         742 non-null    object
 1   name          706 non-null    object
 2   company       453 non-null    object
 3   location      742 non-null    object
 4   email         513 non-null    object
 5   hireable      215 non-null    object
 6   bio           537 non-null    object
 7   public_repos  742 non-null    int64 
 8   followers     742 non-null    int64 
 9   following     742 non-null    int64 
 10  created_at    742 non-null    object
dtypes: int64(3), object(8)
memory usage: 63.9+ KB


In [55]:
# Preview the cleaned company column; the result is only displayed, df is not modified
df['company'].apply(clean_company_name)

Unnamed: 0,company
0,HUAWEIRESEARCH
1,
2,THOUGHTWORKS
3,编程学习公众号【程序员鱼皮】
4,"BOOHEE,INC."
...,...
737,TENCENT
738,
739,
740,ALIBABA


In [27]:
# Display the users DataFrame
df

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,peng-zhihui,稚晖,HUAWEI RESEARCH,Shanghai,prime_zhihui@foxmail.com,,野生钢铁侠本侠。,59,80747,9,2015-06-22T04:59:39Z
1,ruanyf,Ruan YiFeng,,"Shanghai, China",yifeng.ruan@gmail.com,,,72,79331,0,2011-07-10T01:07:17Z
2,phodal,Fengda Huang,THOUGHTWORKS,"Shanghai / Hangzhou, China",h@phodal.com,True,I'm digging holes.,367,20066,9,2010-11-08T11:46:51Z
3,liyupi,程序员鱼皮,编程学习公众号【程序员鱼皮】,China Shanghai,592789970@qq.com,True,speak less do more！前腾讯全栈开发，现科技公司创始人,82,17408,29,2017-02-26T08:44:22Z
4,stormzhang,stormzhang,"BOOHEE, INC.","Shanghai, China",stormzhang.dev@gmail.com,,微信公众号：stormzhang,5,15879,91,2012-09-03T02:58:11Z
...,...,...,...,...,...,...,...,...,...,...,...
737,black-binary,Black Binary,TENCENT,Shanghai,,,Reverse-Engineering/CTF Player\r\n\r\nC/C++/Ru...,29,202,57,2015-02-19T09:15:35Z
738,windoze,徐辰,,"Shanghai, China",windoze@0d0a.com,,,221,201,0,2009-11-03T02:12:35Z
739,nighca,Hanxing Yang,,Shanghai,nighca@live.cn,True,Engineer,100,201,56,2012-03-02T11:22:16Z
740,EricGao888,Eric Gao,ALIBABA,"Shanghai, China",chufenggao@gmail.com,,"PMC-member@Apache DolphinScheduler, SDE@Alibab...",81,201,126,2017-12-28T05:33:48Z


14. Who created the most repositories on weekends (UTC)? List the top 5 users' logins in order, comma-separated.
Users login

In [90]:
# Read the scraped repositories and parse creation timestamps as UTC datetimes
repos_df = pd.read_csv('repositories.csv')
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Weekday index of each creation time (Monday is 0, Sunday is 6)
repos_df['day_of_week'] = repos_df['created_at'].dt.weekday

# Saturday (5) and Sunday (6) are the only weekday indices >= 5
weekend_repos = repos_df[repos_df['day_of_week'] >= 5]

# Weekend-created repositories per user, largest count first
weekend_repo_counts = weekend_repos['login'].value_counts()

# The five users with the most weekend-created repositories
top_5_weekend_creators = weekend_repo_counts.iloc[:5]

# Their logins, in ranking order, as one comma-separated string
top_5_logins = ', '.join(top_5_weekend_creators.index)
print(top_5_logins)

losfair, gonnavis, j5s, shadowcz007, Yuan-ManX


What is the correlation between the number of followers and the number of public repositories among users in Shanghai?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [91]:
# Correlation between follower count and public repository count across the
# users table (Series.corr computes the Pearson coefficient by default).
correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])

# Round to 3 decimal places; the bare last expression displays the value.
correlation_followers_repos_rounded = round(correlation_followers_repos, 3)
correlation_followers_repos_rounded


-0.005