<a href="https://colab.research.google.com/github/Praviniitm/Project_Moscow/blob/main/Scrape_Github_Users_and_Repos.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests

In [None]:
import time
import csv

In [None]:
import os

# Prefer a token supplied through the GITHUB_TOKEN environment variable so a
# real credential never has to be saved inside the notebook. The literal
# placeholder remains the fallback, so editing it in place still works when
# the variable is unset or empty.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN") or "Mytoken"
# Authorization header passed to every GitHub API request made in this file.
headers = {"Authorization": f"token {GITHUB_TOKEN}"}
def check_rate_limit():
    """Sleep until the rate limit resets when few requests remain.

    Queries ``https://api.github.com/rate_limit`` using the module-level
    ``headers`` and inspects the ``rate`` object of the response. When fewer
    than 10 requests remain, blocks until the reported reset time plus one
    second; otherwise returns immediately.

    Raises:
        requests.HTTPError: If the rate-limit endpoint answers with an error
            status (for example when the token is rejected).
    """
    url = "https://api.github.com/rate_limit"
    response = requests.get(url, headers=headers)
    # Surface HTTP errors directly instead of as a KeyError on the lookups
    # below, which is what an error payload without "rate" would cause.
    response.raise_for_status()
    rate = response.json()["rate"]
    remaining = rate["remaining"]
    reset_time = rate["reset"]

    # If limit is almost reached, wait until it resets
    if remaining < 10:
        # The reset moment can already lie in the past (clock skew, slow
        # response); clamp at zero because time.sleep() raises ValueError
        # for a negative duration.
        wait_time = max(0, reset_time - int(time.time()))
        print(f"Rate limit almost reached. Waiting for {wait_time} seconds.")
        time.sleep(wait_time + 1)

In [None]:
def clean_company_name(company):
    """Normalize a company string for consistent comparison.

    Surrounding whitespace is trimmed first, then any leading ``@``
    characters are dropped, and the remainder is upper-cased.

    Args:
        company: Raw company value, or a falsy value (``None``/``""``).

    Returns:
        The normalized string, or ``None`` when ``company`` is falsy.
    """
    if not company:
        return None
    trimmed = company.strip()
    without_at = trimmed.lstrip("@")
    return without_at.upper()

In [None]:
def get_users_in_moscow():
    """Collect profile details for Moscow users with more than 50 followers.

    Pages through the GitHub user search endpoint, then requests each
    matching user's profile URL to read the fields the search result does
    not carry. ``check_rate_limit()`` is called before every request.

    Returns:
        A list of dicts, one per user, with the keys ``login``, ``name``,
        ``company`` (normalized by ``clean_company_name``), ``location``,
        ``email``, ``hireable``, ``bio``, ``public_repos``, ``followers``,
        ``following`` and ``created_at``.
    """
    url = "https://api.github.com/search/users"
    min_followers = 50
    per_page = 100  # fewer search requests for the same result set
    users = []
    page = 1

    while True:
        check_rate_limit()
        params = {
            # Filter by follower count on the server. The search endpoint only
            # exposes a bounded number of results, so filtering locally after
            # the fact silently drops qualifying users outside that window
            # and costs one profile request per non-qualifying user.
            "q": f"location:Moscow followers:>{min_followers}",
            "per_page": per_page,
            "page": page,
        }
        response = requests.get(url, headers=headers, params=params)
        data = response.json()

        # An error payload has no "items"; an empty page means we are done.
        items = data.get("items") if isinstance(data, dict) else None
        if not items:
            break

        for user in items:
            check_rate_limit()
            user_data = requests.get(user["url"], headers=headers).json()
            # Re-check against the profile itself so the threshold still
            # holds even if the search result and the profile disagree.
            if user_data.get("followers", 0) > min_followers:
                users.append({
                    "login": user_data.get("login"),
                    "name": user_data.get("name"),
                    "company": clean_company_name(user_data.get("company")),
                    "location": user_data.get("location"),
                    "email": user_data.get("email"),
                    "hireable": user_data.get("hireable"),
                    "bio": user_data.get("bio"),
                    "public_repos": user_data.get("public_repos"),
                    "followers": user_data.get("followers"),
                    "following": user_data.get("following"),
                    "created_at": user_data.get("created_at"),
                })

        # A short page is the last page.
        if len(items) < per_page:
            break
        page += 1

    return users

In [None]:
def get_repositories(user_login):
    """Fetch up to 500 of a user's most recently pushed repositories.

    Pages through ``/users/{user_login}/repos`` sorted by push time,
    newest first, calling ``check_rate_limit()`` before each request.

    Args:
        user_login: GitHub login whose repositories are listed.

    Returns:
        A list of at most 500 dicts with the keys ``login``, ``full_name``,
        ``created_at``, ``pushed_at``, ``stargazers_count``,
        ``watchers_count``, ``language``, ``has_projects``, ``has_wiki`` and
        ``license_name`` (the license ``key``, or ``None`` when the
        repository has no license). If a page cannot be read as a list,
        whatever was collected before it is returned.
    """
    repos_url = f"https://api.github.com/users/{user_login}/repos"
    max_repos = 500
    per_page = 100
    repositories = []
    page = 1

    while len(repositories) < max_repos:
        check_rate_limit()
        params = {
            "sort": "pushed",
            "direction": "desc",
            "per_page": per_page,
            "page": page,
        }
        response = requests.get(repos_url, headers=headers, params=params)
        repos_data = response.json()

        # A failed request yields a JSON object rather than a list; iterating
        # it would hand string keys to repo.get() and raise AttributeError.
        if not isinstance(repos_data, list):
            print(f"Unexpected response while listing repositories for {user_login}; stopping.")
            break

        for repo in repos_data:
            # "license" may be absent or null; treat both as "no license".
            license_info = repo.get("license") or {}
            repositories.append({
                "login": user_login,
                "full_name": repo.get("full_name"),
                "created_at": repo.get("created_at"),
                "pushed_at": repo.get("pushed_at"),
                "stargazers_count": repo.get("stargazers_count"),
                "watchers_count": repo.get("watchers_count"),
                "language": repo.get("language"),
                "has_projects": repo.get("has_projects"),
                "has_wiki": repo.get("has_wiki"),
                "license_name": license_info.get("key"),
            })

            if len(repositories) >= max_repos:
                break

        if len(repos_data) < per_page:  # No more repositories
            break
        page += 1

    return repositories

# Collect every Moscow-based user whose follower count exceeds 50.
users = get_users_in_moscow()

# Write the user records to users.csv, one row per user.
user_columns = [
    "login", "name", "company", "location", "email", "hireable", "bio",
    "public_repos", "followers", "following", "created_at",
]
with open("users.csv", "w", newline="", encoding="utf-8") as users_file:
    user_writer = csv.DictWriter(users_file, fieldnames=user_columns)
    user_writer.writeheader()
    user_writer.writerows(users)

# Walk the users in order and write each one's repositories to
# repositories.csv as soon as they have been fetched.
repo_columns = [
    "login", "full_name", "created_at", "pushed_at", "stargazers_count",
    "watchers_count", "language", "has_projects", "has_wiki", "license_name",
]
with open("repositories.csv", "w", newline="", encoding="utf-8") as repos_file:
    repo_writer = csv.DictWriter(repos_file, fieldnames=repo_columns)
    repo_writer.writeheader()

    for account in users:
        repo_writer.writerows(get_repositories(account["login"]))