In [None]:
# Quick unauthenticated probe of the GitHub user-search endpoint, used to
# inspect the response shape for the query "location:Zurich followers:>50".
!curl "https://api.github.com/search/users?q=location:Zurich+followers:>50"

{
  "total_count": 474,
  "incomplete_results": false,
  "items": [
    {
      "login": "IDouble",
      "id": 18186995,
      "node_id": "MDQ6VXNlcjE4MTg2OTk1",
      "avatar_url": "https://avatars.githubusercontent.com/u/18186995?v=4",
      "gravatar_id": "",
      "url": "https://api.github.com/users/IDouble",
      "html_url": "https://github.com/IDouble",
      "followers_url": "https://api.github.com/users/IDouble/followers",
      "following_url": "https://api.github.com/users/IDouble/following{/other_user}",
      "gists_url": "https://api.github.com/users/IDouble/gists{/gist_id}",
      "starred_url": "https://api.github.com/users/IDouble/starred{/owner}{/repo}",
      "subscriptions_url": "https://api.github.com/users/IDouble/subscriptions",
      "organizations_url": "https://api.github.com/users/IDouble/orgs",
      "repos_url": "https://api.github.com/users/IDouble/repos",
      "events_url": "https://api.github.com/users/IDouble/events{/privacy}",
      "received_events

In [None]:
import requests
import time
import pandas as pd

# GitHub personal access token. It is read from the GITHUB_TOKEN environment
# variable so the real secret never has to be pasted into (or scrubbed out of)
# the notebook; the 'xxx' placeholder is kept as the fallback value.
import os

TOKEN = os.environ.get('GITHUB_TOKEN', 'xxx')
# Authorization header sent with the GitHub API requests in this cell.
HEADERS = {'Authorization': f'token {TOKEN}'}

def clean_company_name(company):
    """Normalise a company string: trim whitespace, drop leading '@', upper-case.

    Returns None for any falsy input (None or an empty string).
    """
    if not company:
        return None

    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

def get_users_in_zurich(pause_seconds=4, timeout=30):
    """Collect every GitHub user matching ``location:Zurich followers:>50``.

    Pages through the ``/search/users`` endpoint 100 results at a time until
    the reported ``total_count`` has been reached, a page comes back empty,
    or a request fails.

    Args:
        pause_seconds: Delay between consecutive page requests, used to
            throttle the loop. Defaults to the original 4 seconds.
        timeout: Per-request timeout in seconds handed to ``requests.get`` so
            a stalled connection cannot hang the notebook indefinitely.

    Returns:
        list[dict]: The raw ``items`` entries from the search responses, in
        the order received. The list is partial if a request failed part-way.
    """
    url = 'https://api.github.com/search/users'
    all_users = []
    page = 1
    per_page = 100  # Max number of results per page

    while True:
        params = {'q': 'location:Zurich followers:>50', 'page': page, 'per_page': per_page}
        try:
            response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Network failure or timeout: keep what was fetched instead of crashing.
            print(f"Error fetching users: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching users: {response.status_code}")
            break

        payload = response.json()  # Decode the body once and reuse it
        users = payload.get('items', [])
        total_count = payload.get('total_count', 0)

        if not users:
            print("No more users found.")
            break  # Exit if no more users are found

        all_users.extend(users)
        print(f"Fetched page {page}: {len(users)} users, total fetched: {len(all_users)}")

        # Check if we've fetched all users
        if len(all_users) >= total_count:
            break

        page += 1
        time.sleep(pause_seconds)  # Throttle requests to avoid rate limiting

    return all_users

def get_user_details(username, timeout=30):
    """Fetch the profile of a single GitHub user from ``/users/{username}``.

    Args:
        username: GitHub login to look up.
        timeout: Per-request timeout in seconds handed to ``requests.get`` so
            a stalled connection cannot block the whole run.

    Returns:
        dict: The decoded JSON profile, or an empty dict when the request
        fails (network error, timeout, or a non-200 status).
    """
    url = f'https://api.github.com/users/{username}'
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors: log and return {}.
        print(f"Error fetching details for {username}: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Error fetching details for {username}: {response.status_code}")
        return {}

    return response.json()

def main():
    """Download profile details for every matching user and write users.csv.

    For each user returned by ``get_users_in_zurich`` the full profile is
    fetched with ``get_user_details`` and reduced to a fixed set of columns
    (the company name is normalised with ``clean_company_name``). Users whose
    profile could not be fetched are skipped, so the CSV never contains a row
    with an empty ``login``.

    Side effects:
        Writes ``users.csv`` to the current working directory and prints
        progress information.
    """
    users = get_users_in_zurich()

    # Column order of users.csv; pinned so the header is written even when
    # no user could be fetched.
    columns = [
        'login', 'name', 'company', 'location', 'email', 'hireable',
        'bio', 'public_repos', 'followers', 'following', 'created_at',
    ]

    # Create a list to store user data
    user_data = []

    for user in users:
        username = user['login']
        print(f'Fetching details for: {username}')

        user_details = get_user_details(username)

        if user_details:
            # Extract relevant details
            user_info = {column: user_details.get(column) for column in columns}
            user_info['company'] = clean_company_name(user_details.get('company'))
            user_data.append(user_info)
        else:
            # get_user_details returns {} on failure; an all-empty row would
            # only pollute the CSV, so leave the user out.
            print(f'Skipping {username}: no details available')

        # Wait for 4 seconds before the next request
        time.sleep(4)

    # Convert the list of user data to a DataFrame
    df = pd.DataFrame(user_data, columns=columns)
    print(len(df))
    # Save the DataFrame to a CSV file
    df.to_csv('users.csv', index=False)
    print("Data saved to users.csv")

if __name__ == "__main__":
    main()

Fetched page 1: 100 users, total fetched: 100
Fetched page 2: 100 users, total fetched: 200
Fetched page 3: 100 users, total fetched: 300
Fetched page 4: 100 users, total fetched: 400
Fetched page 5: 74 users, total fetched: 474
Fetching details for: IDouble
Fetching details for: TheOfficialFloW
Fetching details for: Seldaek
Fetching details for: riscv
Fetching details for: JonnyBurger
Fetching details for: bpasero
Fetching details for: egamma
Fetching details for: ethz-asl
Fetching details for: sahildua2305
Fetching details for: joaomoreno
Fetching details for: klaudiosinani
Fetching details for: sbrannen
Fetching details for: Juriy
Fetching details for: sarlinpe
Fetching details for: sustrik
Fetching details for: LorenzMeier
Fetching details for: jwagner
Fetching details for: jaspervdj
Fetching details for: lsmith77
Fetching details for: videlalvaro
Fetching details for: cvg
Fetching details for: filipw
Fetching details for: SimonHoiberg
Fetching details for: daviddao
Fetching detail

In [None]:
import requests
import pandas as pd
import time

# GitHub personal access token. It is read from the GITHUB_TOKEN environment
# variable so the real secret never has to be pasted into (or scrubbed out of)
# the notebook; the 'xxx' placeholder is kept as the fallback value.
import os

TOKEN = os.environ.get('GITHUB_TOKEN', 'xxx')
# Authorization header sent with the GitHub API requests in this cell.
HEADERS = {'Authorization': f'token {TOKEN}'}

def check_rate_limit(timeout=30):
    """Check the current rate limit and wait if necessary.

    Queries ``/rate_limit`` and, when no requests remain, sleeps until the
    reported reset time plus a 10 second buffer.

    Args:
        timeout: Per-request timeout in seconds handed to ``requests.get``.

    Returns:
        bool: True when the caller may issue its request now (quota remains,
        or the quota could not be determined). False when the quota was
        exhausted and this function has just slept through the reset window;
        the caller should check again before proceeding.
    """
    url = 'https://api.github.com/rate_limit'
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error checking rate limit: {exc}")
        return True  # Unknown quota: let the caller's own request surface the problem

    if response.status_code != 200:
        print(f"Error checking rate limit: {response.status_code}")
        # Unknown quota: let the caller proceed so its own request reports the
        # real error. Returning False would make a caller that retries on
        # False spin without ever sleeping.
        return True

    rate_limits = response.json()
    # GitHub documents the top-level 'rate' object as deprecated in favour of
    # 'resources.core'; prefer the latter and fall back to the former.
    core = rate_limits.get('resources', {}).get('core') or rate_limits['rate']
    remaining = core['remaining']
    reset_time = core['reset']

    if remaining <= 0:
        # Clamp at zero: with clock skew the reset time can already be in the
        # past, and time.sleep() raises ValueError on a negative duration.
        wait_time = max(0, reset_time - int(time.time()))
        print(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
        time.sleep(wait_time + 10)  # Wait until reset + some buffer time
        return False  # After waiting, we can continue

    return True  # Rate limit is fine

def fetch_repositories(username, max_repos=500, timeout=30):
    """Fetch up to ``max_repos`` most recently pushed repositories for a user.

    Pages through ``/users/{username}/repos`` (sorted by push date,
    descending, 100 per page). Paging stops when enough repositories have
    been collected, when a page is empty or shorter than a full page, or
    when a request fails.

    Args:
        username: GitHub login whose repositories are listed.
        max_repos: Upper bound on the number of repositories returned.
            Defaults to the original limit of 500.
        timeout: Per-request timeout in seconds handed to ``requests.get``.

    Returns:
        list[dict]: At most ``max_repos`` repository objects as returned by
        the API, most recently pushed first.
    """
    url = f'https://api.github.com/users/{username}/repos'
    params = {
        'sort': 'pushed',
        'direction': 'desc',
        'per_page': 100,
        'page': 1
    }

    all_repos = []

    while True:
        if not check_rate_limit():
            continue  # Check the rate limit again after waiting

        try:
            response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Treat transport failures like HTTP errors: log and keep what we have.
            print(f"Error fetching repos for {username}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.status_code}")
            break

        repos = response.json()
        if not repos:
            break

        all_repos.extend(repos)

        # Stop once the cap is reached, or when the page was not full: a short
        # page is the last one, so requesting another would only return [].
        if len(all_repos) >= max_repos or len(repos) < params['per_page']:
            break

        params['page'] += 1
        time.sleep(1)  # Throttle requests to avoid hitting rate limits

    # Return at most max_repos repositories
    return all_repos[:max_repos]

def main():
    """Collect repository metadata for every login in users.csv.

    Reads ``users.csv``, calls ``fetch_repositories`` once per login, reduces
    each repository to a fixed set of fields and writes the combined table to
    ``repositories.csv`` in the current working directory.
    """
    users_df = pd.read_csv('users.csv')  # expects a 'login' column
    rows = []

    for _, record in users_df.iterrows():
        login = record['login']
        print(f'Fetching repositories for: {login}')

        rows.extend(
            {
                'login': login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo.get('has_projects', False),
                'has_wiki': repo.get('has_wiki', False),
                # The license entry may be missing or null
                'license_name': repo['license']['name'] if repo.get('license') else None,
            }
            for repo in fetch_repositories(login)
        )

        # Pause between users to throttle the request rate
        time.sleep(4)

    # Persist everything in one go
    repositories_df = pd.DataFrame(rows)
    repositories_df.to_csv('repositories.csv', index=False)
    print("Data saved to repositories.csv")
    print(len(repositories_df))

if __name__ == "__main__":
    main()


Fetching repositories for: IDouble
Fetching repositories for: TheOfficialFloW
Fetching repositories for: Seldaek
Fetching repositories for: riscv
Fetching repositories for: JonnyBurger
Fetching repositories for: bpasero
Fetching repositories for: egamma
Fetching repositories for: ethz-asl
Fetching repositories for: sahildua2305
Fetching repositories for: joaomoreno
Fetching repositories for: klaudiosinani
Fetching repositories for: sbrannen
Fetching repositories for: Juriy
Fetching repositories for: sarlinpe
Fetching repositories for: sustrik
Fetching repositories for: LorenzMeier
Fetching repositories for: jwagner
Fetching repositories for: jaspervdj
Fetching repositories for: lsmith77
Fetching repositories for: videlalvaro
Fetching repositories for: cvg
Fetching repositories for: filipw
Fetching repositories for: SimonHoiberg
Fetching repositories for: daviddao
Fetching repositories for: FujiwaraChoki
Fetching repositories for: sandy081
Fetching repositories for: mhils
Fetching repos

In [None]:
import pandas as pd

def main():
    """Print the second most common repository language among recent joiners.

    "Recent joiners" are users whose account ``created_at`` in ``users.csv``
    is on or after 2020-01-01 (UTC). Their repositories are looked up in
    ``repositories.csv`` by ``login`` and the ``language`` values are counted.
    The repository creation date is deliberately not used: a long-standing
    user can create new repositories, which says nothing about when the
    user joined.
    """
    # Read the repositories and the user profiles
    repos_df = pd.read_csv('repositories.csv')
    users_df = pd.read_csv('users.csv')

    # Account creation timestamps; utc=True keeps the comparison tz-consistent
    joined_at = pd.to_datetime(users_df['created_at'], utc=True)
    cutoff = pd.Timestamp('2020-01-01', tz='UTC')
    recent_logins = users_df.loc[joined_at >= cutoff, 'login']

    # Keep only repositories owned by users who joined on/after the cut-off
    recent_repos_df = repos_df[repos_df['login'].isin(recent_logins)]

    # Count the occurrences of each language (missing languages are ignored)
    language_counts = recent_repos_df['language'].value_counts()

    # Check if there are at least 2 different languages
    if len(language_counts) < 2:
        print("Not enough languages found.")
        return

    # value_counts sorts descending, so index 1 is the second most popular
    second_most_popular = language_counts.index[1]

    print(f"The second most popular programming language among users who joined after 2020 is: {second_most_popular}")

if __name__ == "__main__":
    main()


The second most popular programming language among users who joined after 2020 is: JavaScript


In [None]:
import pandas as pd

def main():
    """Print the language whose repositories have the highest mean star count.

    Reads ``repositories.csv``, averages ``stargazers_count`` per ``language``
    and reports the top language together with its average.
    """
    stars_by_language = (
        pd.read_csv('repositories.csv')
        .groupby('language')['stargazers_count']
        .mean()
    )

    # Label and value of the largest per-language average
    top_language = stars_by_language.idxmax()
    top_average = stars_by_language.max()

    print(f"The language with the highest average number of stars per repository is: {top_language}")
    print(f"Average stars per repository: {top_average:.2f}")

if __name__ == "__main__":
    main()


The language with the highest average number of stars per repository is: BitBake
Average stars per repository: 364.00


In [None]:
import pandas as pd

def main():
    """Print the correlation between the has_projects and has_wiki flags.

    Reads ``repositories.csv``, coerces both flag columns to bool and reports
    their correlation rounded to three decimals.
    """
    # Load only what is needed and coerce both flags to bool in one step
    flags = pd.read_csv('repositories.csv')[['has_projects', 'has_wiki']].astype(bool)

    projects_enabled = flags['has_projects']
    wiki_enabled = flags['has_wiki']
    correlation = projects_enabled.corr(wiki_enabled)

    print(f"The correlation between projects and wiki enabled is: {correlation:.3f}")

if __name__ == "__main__":
    main()



The correlation between projects and wiki enabled is: 0.312


In [None]:
import pandas as pd

def main():
    """Print the correlation between has_projects and has_wiki as 0/1 values.

    Reads ``repositories.csv``, coerces each flag column to bool and then to
    int, and reports the correlation of the two series to three decimals.
    """
    repos_df = pd.read_csv('repositories.csv')

    # bool first, then 0/1 integers for the correlation itself
    projects_flag, wiki_flag = (
        repos_df[column].astype(bool).astype(int)
        for column in ('has_projects', 'has_wiki')
    )
    correlation = projects_flag.corr(wiki_flag)

    print(f"The correlation between projects and wiki enabled is: {correlation:.3f}")

if __name__ == "__main__":
    main()


The correlation between projects and wiki enabled is: 0.312


In [None]:
import pandas as pd

def main():
    """Print the login that created the most repositories on a weekend.

    Reads ``repositories.csv``, parses ``created_at`` and keeps the rows whose
    timestamp falls on a Saturday or Sunday, then reports the login with the
    highest number of such rows.
    """
    repos_df = pd.read_csv('repositories.csv')

    # dayofweek runs 0=Monday .. 6=Sunday, so values >= 5 are the weekend
    created = pd.to_datetime(repos_df['created_at'])
    on_weekend = created.dt.dayofweek >= 5

    # Weekend repository count per login, largest first
    weekend_counts = repos_df.loc[on_weekend, 'login'].value_counts()

    if weekend_counts.empty:
        print("No repositories were created on weekends.")
        return

    top_user = weekend_counts.idxmax()
    top_count = weekend_counts.loc[top_user]
    print(f"The user who created the most repositories on weekends is: {top_user} with {top_count} repositories.")

if __name__ == "__main__":
    main()


The user who created the most repositories on weekends is: JonnyBurger with 82 repositories.
