In [None]:
import requests
import time
import pandas as pd

# Optional: GitHub token to avoid rate limits-471
GITHUB_TOKEN = "xxx......xxx"       # add token
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Clean and format company names
def clean_company_name(company):
    if not company:
        return None
    return company.strip().lstrip('@').upper()

# Fetch detailed user information
def fetch_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code != 200:
        print(f"Error fetching user details for {username}: {response.json()}")
        return None
    return response.json()

# Fetch users in Austin with 100+ followers, dynamically handle pagination
def fetch_users_in_Berlin():
    url = "https://api.github.com/search/users"
    query = "location:Berlin followers:>200"
    users = []
    page = 1

    while True:
        params = {"q": query, "per_page": 100, "page": page}
        response = requests.get(url, headers=HEADERS, params=params)

        if response.status_code != 200:
            print(f"Error: {response.json()}")
            break

        data = response.json().get("items", [])
        if not data:
            break  # No more data, exit the loop

        users.extend(data)
        print(f"Fetched {len(data)} users from page {page}.")
        page += 1  # Move to the next page
        time.sleep(2)  # Be polite to the API

    return users

# Fetch all repositories for a user, dynamically handle pagination
def fetch_repositories(username):
    url = f"https://api.github.com/users/{username}/repos"
    repos = []
    page = 1

    while True:
        params = {"per_page": 100, "page": page}
        response = requests.get(url, headers=HEADERS, params=params)

        if response.status_code != 200:
            print(f"Error fetching repos for {username}: {response.json()}")
            break

        data = response.json()
        if not data:
            break  # No more data, exit the loop

        repos.extend(data)
        print(f"Fetched {len(data)} repos from page {page} for {username}.")
        page += 1  # Move to the next page
        time.sleep(1)  # Be polite to the API

    return repos

# Main function to collect data and export to Excel
def main():
    users_data = []
    repos_data = []

    users = fetch_users_in_austin()
    print(f"Total users fetched: {len(users)}")

    for user in users:
        username = user["login"]
        user_details = fetch_user_details(username)

        if user_details:
            users_data.append({
                "login": user_details["login"],
                "name": user_details.get("name"),
                "company": clean_company_name(user_details.get("company")),
                "location": user_details.get("location"),
                "email": user_details.get("email"),
                "hireable": user_details.get("hireable"),
                "bio": user_details.get("bio"),
                "public_repos": user_details.get("public_repos"),
                "followers": user_details.get("followers"),
                "following": user_details.get("following"),
                "created_at": user_details.get("created_at"),
            })

        print(f"Fetching repositories for {username}...")
        repos = fetch_repositories(username)

        for repo in repos:
            repos_data.append({
                "login": username,
                "full_name": repo["full_name"],
                "created_at": repo["created_at"],
                "stargazers_count": repo["stargazers_count"],
                "watchers_count": repo["watchers_count"],
                "language": repo["language"],
                "has_projects": repo["has_projects"],
                "has_wiki": repo["has_wiki"],
                "license_name": repo["license"]["name"] if repo.get("license") else None
            })

    # Convert data to DataFrames and save to Excel
    users_df = pd.DataFrame(users_data)
    repos_df = pd.DataFrame(repos_data)

    users_df.to_excel("usersall.xlsx", index=False)
    repos_df.to_excel("repositoriesall.xlsx", index=False)

    print("Data saved to users.xlsx and repositories.xlsx")

if __name__ == "__main__":
    main()


Fetched 100 users from page 1.
Fetched 100 users from page 2.
Fetched 100 users from page 3.
Fetched 100 users from page 4.
Fetched 100 users from page 5.
Fetched 98 users from page 6.
Total users fetched: 598
Fetching repositories for tiangolo...
Fetched 71 repos from page 1 for tiangolo.
Fetching repositories for schacon...
Fetched 100 repos from page 1 for schacon.
Fetched 100 repos from page 2 for schacon.
Fetched 15 repos from page 3 for schacon.
Fetching repositories for rwieruch...
Fetched 100 repos from page 1 for rwieruch.
Fetched 51 repos from page 2 for rwieruch.
Fetching repositories for shuding...
Fetched 100 repos from page 1 for shuding.
Fetched 49 repos from page 2 for shuding.
Fetching repositories for android10...
Fetched 79 repos from page 1 for android10.
Fetching repositories for marijnh...
Fetched 54 repos from page 1 for marijnh.
Fetching repositories for mxmnk...
Fetched 7 repos from page 1 for mxmnk.
Fetching repositories for nikic...
Fetched 100 repos from pag

# **Question No - 13**

In [1]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from the CSV file
users_df = pd.read_csv('users.csv', encoding='ISO-8859-1')

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
#users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split(" ").str.len()

# The error was here: users_with_bio was used instead of users_with_bios
users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


# Prepare the data for regression
X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']        # Dependent variable

# Add a constant to the independent variable for the regression
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to three decimal places
print(f'Regression slope of followers on bio word count: {slope:.4f}')

Regression slope of followers on bio word count: 28.7012


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))
