In [None]:
import requests
import csv
import time
import os

# GitHub API URL
BASE_URL = "https://api.github.com"

# Personal access token used for every request. It is read from the
# GITHUB_TOKEN environment variable so the secret never has to be saved in the
# notebook; the original placeholder string is kept as the fallback value.
TOKEN = os.environ.get("GITHUB_TOKEN", "Github Token")

# Headers for API Authentication
headers = {
    "Authorization": f"token {TOKEN}"
}

# Function to get detailed user information
def get_user_info(username):
    """Fetch the profile of ``username`` from ``{BASE_URL}/users/{username}``.

    Returns the decoded JSON body when the response status is 200 and an
    empty dict for any other status.
    """
    reply = requests.get(f"{BASE_URL}/users/{username}", headers=headers)
    return reply.json() if reply.status_code == 200 else {}

# Function to get users from Barcelona with >100 followers
def get_users():
    """Collect every search hit for ``location:Barcelona followers:>100``.

    Pages through ``/search/users`` 100 results at a time, starting at page 1.
    Paging stops when a response is not HTTP 200 (the status is printed) or
    when a page carries no ``items``. Returns the accumulated list of items.
    """
    found = []
    page = 0
    while True:
        page += 1
        url = f"{BASE_URL}/search/users?q=location:Barcelona+followers:>100&per_page=100&page={page}"
        reply = requests.get(url, headers=headers)

        if reply.status_code != 200:
            print(f"Failed to fetch users: {reply.status_code}")
            return found

        items = reply.json().get("items")
        if not items:
            return found

        found.extend(items)
        time.sleep(1)  # pause between pages to avoid rate limiting

# Function to get repositories of a user
def get_user_repositories(username, per_page=100):
    """Return all public repositories of ``username`` as a list of dicts.

    The original implementation issued a single request without paging
    parameters, so it only ever saw the first page of results (30 repositories
    by GitHub's default). This version requests ``per_page`` repositories per
    page and keeps going until a short or empty page is returned.

    Args:
        username: GitHub login whose repositories are listed.
        per_page: Page size sent to the API (GitHub caps this at 100).

    Returns:
        The repositories gathered so far. If a request does not return HTTP
        200, paging stops and whatever was collected (possibly ``[]``) is
        returned, matching the original best-effort behaviour.
    """
    url = f"{BASE_URL}/users/{username}/repos"
    repos = []
    page = 1
    while True:
        response = requests.get(
            url, headers=headers, params={"per_page": per_page, "page": page}
        )
        if response.status_code != 200:
            break

        batch = response.json()
        repos.extend(batch)
        # A page shorter than per_page is the last one.
        if len(batch) < per_page:
            break
        page += 1

    return repos

# Write detailed user information to users.csv
def write_users_csv(users):
    """Write one row of profile details per user to ``users.csv``.

    For every entry in ``users`` (dicts with at least a ``"login"`` key) the
    full profile is fetched with ``get_user_info`` and written as a CSV row.
    The company value has ``@`` removed, is stripped and upper-cased. A
    one-second pause follows each user to avoid rate limiting.

    The file is opened explicitly as UTF-8 so that names and bios containing
    non-ASCII characters are written the same way on every platform instead of
    depending on the locale's default encoding.
    """
    columns = [
        "login", "name", "company", "location", "email", "hireable", "bio",
        "public_repos", "followers", "following", "created_at"
    ]
    with open("users.csv", "w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(columns)
        for user in users:
            user_info = get_user_info(user["login"])
            # `or ""` guards against an explicit null company in the API reply.
            company = (user_info.get("company", "") or "").replace("@", "").strip().upper()
            writer.writerow([
                user_info.get("login", ""), user_info.get("name", ""),
                company,
                user_info.get("location", ""), user_info.get("email", ""),
                user_info.get("hireable"), user_info.get("bio", ""),
                user_info.get("public_repos", 0), user_info.get("followers", 0),
                user_info.get("following", 0), user_info.get("created_at", "")
            ])
            time.sleep(1)  # to avoid rate limiting

# Write repositories to repositories.csv
def write_repos_csv(users):
    """Write one row per repository of every user to ``repositories.csv``.

    Fixes relative to the original:

    * The header declares a ``license_name`` column but the rows never wrote
      it, so every data row was one field short. The licence's ``name`` is now
      written (empty when the repository has no licence).
    * The login is taken straight from the ``users`` entry instead of issuing
      an extra ``get_user_info`` request per user just to read it back.
    * The file is opened explicitly as UTF-8 rather than with the
      locale-dependent default encoding.

    Args:
        users: Iterable of dicts that each contain a ``"login"`` key.
    """
    with open("repositories.csv", "w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([
            "login", "full_name", "created_at", "stargazers_count", "watchers_count",
            "language", "has_projects", "has_wiki", "license_name"
        ])

        for user in users:
            login = user["login"]
            for repo in get_user_repositories(login):
                if not repo:  # Ensure repo data exists
                    continue
                # "license" is null for unlicensed repositories, hence `or {}`.
                license_info = repo.get("license") or {}
                writer.writerow([
                    login, repo.get("full_name", ""),
                    repo.get("created_at", ""), repo.get("stargazers_count", 0),
                    repo.get("watchers_count", 0), repo.get("language", ""),
                    repo.get("has_projects", False), repo.get("has_wiki", False),
                    license_info.get("name", ""),
                ])
            time.sleep(1)  # to avoid rate limiting

# Main function
def main():
    """Search for the users once, then write users.csv and repositories.csv."""
    matched_users = get_users()
    for export in (write_users_csv, write_repos_csv):
        export(matched_users)

if __name__ == "__main__":
    main()


In [None]:
from google.colab import files

# Send both generated CSV files to the browser, users first
for csv_path in ('users.csv', 'repositories.csv'):
    files.download(csv_path)


Performing various operations on the collected CSV data to find the required answers


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Ask for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Read the first uploaded file into a DataFrame
df = pd.read_csv(list(uploaded)[0])

# Step 3: Bail out early unless every column needed for 'leader_strength' is present
if not {'login', 'followers', 'following'}.issubset(df.columns):
    print("The uploaded file does not contain the required columns: 'login', 'followers', 'following'")
else:
    # leader_strength = followers / (1 + following); the +1 avoids dividing by zero
    df['leader_strength'] = df['followers'].div(df['following'].add(1))

    # Step 4: Persist the enriched table
    output_filename = "output_with_leader_strength.csv"
    df.to_csv(output_filename, index=False)

    # Step 5: Hand the result back to the browser
    files.download(output_filename)
    print(f"File '{output_filename}' has been created and downloaded.")


Saving q6.csv to q6.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

File 'output_with_leader_strength.csv' has been created and downloaded.


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'has_projects' and 'has_wiki'
if all(col in df.columns for col in ['has_projects', 'has_wiki']):
    # Step 3: Calculate the correlation between has_projects and has_wiki
    correlation = df['has_projects'].corr(df['has_wiki'])

    # Step 4: Print the correlation to 3 decimal places
    print(f"The correlation between having projects enabled and having wiki enabled is {correlation:.3f}.")
else:
    # The message now names the columns that are actually checked above
    # (it previously referred to 'projects_enabled' and 'wiki_enabled').
    print("The uploaded file does not contain the required columns: 'has_projects' and 'has_wiki'")


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'hireable' and 'following'
if all(col in df.columns for col in ['hireable', 'following']):
    # Step 3: Calculate the average 'following' for hireable and non-hireable users.
    # The scraper in this notebook writes a missing `hireable` as an empty cell,
    # which read_csv loads as NaN. `df['hireable'] == False` never matches NaN,
    # so the non-hireable group came out empty and its mean was NaN. Every row
    # that is not explicitly True is therefore treated as non-hireable.
    is_hireable = df['hireable'].eq(True)
    hireable_avg_following = df.loc[is_hireable, 'following'].mean()
    non_hireable_avg_following = df.loc[~is_hireable, 'following'].mean()

    # Step 4: Calculate the difference and print it to 3 decimal places
    difference = hireable_avg_following - non_hireable_avg_following
    print(f"The difference in average following (hireable - non-hireable) is {difference:.3f}")
else:
    print("The uploaded file does not contain the required columns: 'hireable' and 'following'")


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'bio' and 'followers'
if all(col in df.columns for col in ['bio', 'followers']):
    # Step 3: Filter out rows with empty or missing bios.
    # .copy() makes the filtered frame independent, so adding a column below
    # does not write into a slice of the original (SettingWithCopyWarning).
    has_bio = df['bio'].notna() & (df['bio'].str.strip() != '')
    df = df[has_bio].copy()

    # Step 4: Calculate the word count of each bio (split by whitespace)
    df['bio_length'] = df['bio'].str.split().str.len()

    # Step 5: Calculate the correlation between bio length and followers
    correlation = df['bio_length'].corr(df['followers'])

    # Step 6: Print the correlation to 3 decimal places
    print(f"The correlation between bio length and followers is {correlation:.3f}")
else:
    print("The uploaded file does not contain the required columns: 'bio' and 'followers'")


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Ask for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Read the first uploaded file into a DataFrame
df = pd.read_csv(list(uploaded)[0])

# Both 'login' and 'created_at' are needed for the weekend ranking
if not {'login', 'created_at'}.issubset(df.columns):
    print("The uploaded file does not contain the required columns: 'login' and 'created_at'")
else:
    # Step 3: Parse 'created_at' (unparseable values become NaT) and discard those rows
    df = (
        df.assign(created_at=pd.to_datetime(df['created_at'], errors='coerce'))
          .dropna(subset=['created_at'])
    )

    # Step 4: Day of the week as an integer, Monday=0 ... Sunday=6
    df = df.assign(day_of_week=df['created_at'].dt.dayofweek)

    # Step 5: Saturday (5) and Sunday (6) are the only values >= 5
    weekend_repos = df.loc[df['day_of_week'] >= 5]

    # Step 6: Repositories created on a weekend per login, five largest counts
    weekend_counts = weekend_repos['login'].value_counts().iloc[:5]

    # Step 7: Report the logins as a comma-separated list
    top_5_users = ", ".join(weekend_counts.index)
    print(f"The top 5 users who created the most repositories on weekends are: {top_5_users}")


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Ask for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Read the first uploaded file into a DataFrame
df = pd.read_csv(list(uploaded)[0])

# Nothing can be ranked without a 'license_name' column
if 'license_name' not in df.columns:
    print("The uploaded file does not contain the required column: 'license_name'")
else:
    # Step 3: Keep only repositories that actually have a license
    df = df.dropna(subset=['license_name'])

    # Step 4: Tally how often each license appears (most frequent first)
    license_counts = df['license_name'].value_counts()

    # Step 5: The three most frequent licenses
    top_3_licenses = license_counts.iloc[:3]

    # Step 6: Join their names with commas
    top_3_license_names = ", ".join(top_3_licenses.index)

    # Step 7: Report the result
    print(f"The 3 most popular licenses are: {top_3_license_names}")


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the first uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'language' and 'stargazers_count'
if all(col in df.columns for col in ['language', 'stargazers_count']):
    # Step 3: Group by language and calculate the average stars
    avg_stars_per_language = df.groupby('language')['stargazers_count'].mean()

    # Step 4: Find the language with the highest average stars
    top_language = avg_stars_per_language.idxmax()
    top_avg_stars = avg_stars_per_language.max()

    print(f"The language with the highest average stars per repository is '{top_language}' with an average of {top_avg_stars:.2f} stars.")
else:
    # The message now names the column that is actually checked above
    # (it previously referred to 'stars').
    print("The uploaded file does not contain the required columns: 'language' and 'stargazers_count'")


Saving q6.csv to q6 (1).csv
The language with the highest average stars per repository is 'Vim Script' with an average of 4386.86 stars.


In [None]:
import pandas as pd
from google.colab import files

# Step 1: Ask for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Read the first uploaded file into a DataFrame
df = pd.read_csv(list(uploaded)[0])

# Both 'followers' and 'public_repos' must be present
if not {'followers', 'public_repos'}.issubset(df.columns):
    print("The uploaded file does not contain the required columns: 'followers' and 'public_repos'")
else:
    # Step 3: Correlation of followers with public_repos
    public_repos = df['public_repos']
    correlation = df['followers'].corr(public_repos)

    # Step 4: Report it rounded to 3 decimal places
    print(f"The correlation between followers and public repositories is {correlation:.3f}.")


Saving q6.csv to q6 (2).csv
The correlation between followers and public repositories is 0.071.


In [None]:
import pandas as pd
import statsmodels.api as sm
from google.colab import files

# Step 1: Ask for a CSV through the Colab upload widget
uploaded = files.upload()

# Step 2: Read the first uploaded file into a DataFrame
df = pd.read_csv(list(uploaded)[0])

# Both 'followers' and 'public_repos' must be present
if not {'followers', 'public_repos'}.issubset(df.columns):
    print("The uploaded file does not contain the required columns: 'followers' and 'public_repos'")
else:
    # Step 3: Turn +/-inf into NaN, then drop rows lacking either regression variable
    infinities = [float('inf'), float('-inf')]
    df = df.replace(infinities, float('nan'))
    df = df.dropna(subset=['followers', 'public_repos'])

    # followers is regressed on public_repos; add_constant supplies the intercept term
    y = df['followers']
    X = sm.add_constant(df['public_repos'])

    # Step 4: Fit ordinary least squares and keep the printable summary
    model = sm.OLS(y, X).fit()
    results_summary = model.summary()

    # The fitted coefficient on 'public_repos' is the slope
    slope = model.params['public_repos']

    print(f"Regression analysis results:\n\n{results_summary}")
    print(f"\nEstimated additional followers per additional public repository: {slope:.3f}")



Saving q6.csv to q6 (4).csv
Regression analysis results:

                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.708
Date:                Thu, 31 Oct 2024   Prob (F-statistic):              0.192
Time:                        04:34:04   Log-Likelihood:                -2972.9
No. Observations:                 337   AIC:                             5950.
Df Residuals:                     335   BIC:                             5957.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------

In [None]:
import pandas as pd
from google.colab import files

# NOTE: this cell repeats the has_projects/has_wiki correlation that an
# earlier cell in this notebook already computes.

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'has_projects' and 'has_wiki'
if all(col in df.columns for col in ['has_projects', 'has_wiki']):
    # Step 3: Calculate the correlation between has_projects and has_wiki
    correlation = df['has_projects'].corr(df['has_wiki'])

    # Step 4: Print the correlation to 3 decimal places
    print(f"The correlation between having projects enabled and having wiki enabled is {correlation:.3f}.")
else:
    # The message now names the columns that are actually checked above
    # (it previously referred to 'projects_enabled' and 'wiki_enabled').
    print("The uploaded file does not contain the required columns: 'has_projects' and 'has_wiki'")


In [None]:
import pandas as pd
from google.colab import files

# NOTE: this cell repeats the has_projects/has_wiki correlation that earlier
# cells in this notebook already compute.

# Step 1: Upload the CSV file
uploaded = files.upload()

# Step 2: Load the uploaded CSV file into a DataFrame
df = pd.read_csv(list(uploaded.keys())[0])

# Ensure the file has the required columns: 'has_projects' and 'has_wiki'
if all(col in df.columns for col in ['has_projects', 'has_wiki']):
    # Step 3: Calculate the correlation between has_projects and has_wiki
    correlation = df['has_projects'].corr(df['has_wiki'])

    # Step 4: Print the correlation to 3 decimal places
    print(f"The correlation between having projects enabled and having wiki enabled is {correlation:.3f}.")
else:
    # The message now names the columns that are actually checked above
    # (it previously referred to 'projects_enabled' and 'wiki_enabled').
    print("The uploaded file does not contain the required columns: 'has_projects' and 'has_wiki'")
