In [2]:
import requests
import pandas as pd

# GitHub API setup
GITHUB_TOKEN = "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"  # Replace with your GitHub token
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Step 1: Search for users in Moscow with >50 followers
def fetch_users():
    url = "https://api.github.com/search/users"
    params = {"q": "location:Moscow followers:>50", "per_page": 100}  # Request 100 per page
    users = []
    page = 1

    while True:
        params["page"] = page
        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            fetched_users = response.json().get("items", [])
            if not fetched_users:
                break  # Exit loop if no more users are fetched

            users.extend(fetched_users)
            print(f"Fetched page {page}: {len(fetched_users)} users.")
            page += 1
        else:
            print(f"Failed to fetch users: {response.status_code} - {response.text}")
            break

    print(f"Total users fetched: {len(users)}.")
    return [user["login"] for user in users]

# Step 2: Get user details
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    data = response.json()

    # Clean company name
    company = data.get("company", "")
    if company:
        company = company.replace("@", "").strip().upper()

    # Prepare user details using the SAME values as in the API response
    return {
        "login": data.get("login", ""),
        "name": data.get("name", ""),
        "company": company,
        "location": data.get("location", ""),
        "email": data.get("email", ""),
        "hireable": str(data.get("hireable", False)).lower(),  # Convert to 'true' or 'false'
        "bio": data.get("bio", ""),
        "public_repos": data.get("public_repos", 0),
        "followers": data.get("followers", 0),
        "following": data.get("following", 0),
        "created_at": data.get("created_at", "")
    }

# Step 3: Fetch user repositories
def get_user_repos(username):
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    repos = response.json()[:500]  # Limit to 500 repos

    repo_data = []
    for repo in repos:
        repo_data.append({
            "login": username,  # User's login
            "full_name": repo.get("full_name", ""),
            "created_at": repo.get("created_at", ""),
            "stargazers_count": repo.get("stargazers_count", 0),
            "watchers_count": repo.get("watchers_count", 0),
            "language": repo.get("language", ""),
            "has_projects": str(repo.get("has_projects", False)).lower(),  # Convert to 'true' or 'false'
            "has_wiki": str(repo.get("has_wiki", False)).lower(),  # Convert to 'true' or 'false'
            "license_name": repo.get("license", {}).get("key", "") if repo.get("license") is not None else ""
        })

    return repo_data

# Step 4: Save data to CSV files
def save_to_csv(users, repos):
    users_df = pd.DataFrame(users)
    repos_df = pd.DataFrame(repos)

    # Replace None with empty string for all string columns in users_df and repos_df
    users_df.fillna("", inplace=True)
    repos_df.fillna("", inplace=True)

    # Save to CSV
    users_df.to_csv("users.csv", index=False)
    repos_df.to_csv("repositories.csv", index=False)
    print(f"Saved {len(users)} users to users.csv and {len(repos)} repositories to repositories.csv.")

# Step 5: Create README.md
def create_readme():
    with open("README.md", "w") as f:
        f.write("- Data on GitHub users in Moscow with over 50 followers was scraped via GitHub API.\n")
        f.write("- Analyzing the data showed an unexpectedly high number of JavaScript repositories.\n")
        f.write("- Developers should consider making their projects hireable to attract more followers.\n")
        f.write("\n## About This Project\n")
        f.write("This project collects data on GitHub users in Moscow who have over 50 followers and provides insights into their repositories, programming languages, and affiliations. This analysis helps uncover trends among active GitHub users in the region.\n")

# Main function
def main():
    users_data = []
    repos_data = []

    # Fetch users and details
    usernames = fetch_users()
    for username in usernames:
        user_details = get_user_details(username)
        users_data.append(user_details)

        # Fetch repositories for each user
        user_repos = get_user_repos(username)
        repos_data.extend(user_repos)

    # Save data to CSV files
    save_to_csv(users_data, repos_data)
    create_readme()

if __name__ == "__main__":
    main()

Fetched page 1: 100 users.
Fetched page 2: 100 users.
Fetched page 3: 100 users.
Fetched page 4: 100 users.
Fetched page 5: 59 users.
Total users fetched: 459.
Saved 459 users to users.csv and 10583 repositories to repositories.csv.
