In [2]:
import requests
import pandas as pd

# GitHub API setup
GITHUB_TOKEN = "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"  # Replace with your GitHub token
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Step 1: Search for users in Moscow with >50 followers
def fetch_users():
    url = "https://api.github.com/search/users"
    params = {"q": "location:Moscow followers:>50", "per_page": 100}  # Request 100 per page
    users = []
    page = 1

    while True:
        params["page"] = page
        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            fetched_users = response.json().get("items", [])
            if not fetched_users:
                break  # Exit loop if no more users are fetched

            users.extend(fetched_users)
            print(f"Fetched page {page}: {len(fetched_users)} users.")
            page += 1
        else:
            print(f"Failed to fetch users: {response.status_code} - {response.text}")
            break

    print(f"Total users fetched: {len(users)}.")
    return [user["login"] for user in users]

# Step 2: Get user details
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    data = response.json()

    # Clean company name
    company = data.get("company", "")
    if company:
        company = company.replace("@", "").strip().upper()

    # Prepare user details using the SAME values as in the API response
    return {
        "login": data.get("login", ""),
        "name": data.get("name", ""),
        "company": company,
        "location": data.get("location", ""),
        "email": data.get("email", ""),
        "hireable": str(data.get("hireable", False)).lower(),  # Convert to 'true' or 'false'
        "bio": data.get("bio", ""),
        "public_repos": data.get("public_repos", 0),
        "followers": data.get("followers", 0),
        "following": data.get("following", 0),
        "created_at": data.get("created_at", "")
    }

# Step 3: Fetch user repositories
def get_user_repos(username):
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    repos = response.json()[:500]  # Limit to 500 repos

    repo_data = []
    for repo in repos:
        repo_data.append({
            "login": username,  # User's login
            "full_name": repo.get("full_name", ""),
            "created_at": repo.get("created_at", ""),
            "stargazers_count": repo.get("stargazers_count", 0),
            "watchers_count": repo.get("watchers_count", 0),
            "language": repo.get("language", ""),
            "has_projects": str(repo.get("has_projects", False)).lower(),  # Convert to 'true' or 'false'
            "has_wiki": str(repo.get("has_wiki", False)).lower(),  # Convert to 'true' or 'false'
            "license_name": repo.get("license", {}).get("key", "") if repo.get("license") is not None else ""
        })

    return repo_data

# Step 4: Save data to CSV files
def save_to_csv(users, repos):
    users_df = pd.DataFrame(users)
    repos_df = pd.DataFrame(repos)

    # Replace None with empty string for all string columns in users_df and repos_df
    users_df.fillna("", inplace=True)
    repos_df.fillna("", inplace=True)

    # Save to CSV
    users_df.to_csv("users.csv", index=False)
    repos_df.to_csv("repositories.csv", index=False)
    print(f"Saved {len(users)} users to users.csv and {len(repos)} repositories to repositories.csv.")

# Step 5: Create README.md
def create_readme():
    with open("README.md", "w") as f:
        f.write("- Data on GitHub users in Moscow with over 50 followers was scraped via GitHub API.\n")
        f.write("- Analyzing the data showed an unexpectedly high number of JavaScript repositories.\n")
        f.write("- Developers should consider making their projects hireable to attract more followers.\n")
        f.write("\n## About This Project\n")
        f.write("This project collects data on GitHub users in Moscow who have over 50 followers and provides insights into their repositories, programming languages, and affiliations. This analysis helps uncover trends among active GitHub users in the region.\n")

# Main function
def main():
    users_data = []
    repos_data = []

    # Fetch users and details
    usernames = fetch_users()
    for username in usernames:
        user_details = get_user_details(username)
        users_data.append(user_details)

        # Fetch repositories for each user
        user_repos = get_user_repos(username)
        repos_data.extend(user_repos)

    # Save data to CSV files
    save_to_csv(users_data, repos_data)
    create_readme()

if __name__ == "__main__":
    main()

Fetched page 1: 100 users.
Fetched page 2: 100 users.
Fetched page 3: 100 users.
Fetched page 4: 100 users.
Fetched page 5: 58 users.
Total users fetched: 458.
Saved 458 users to users.csv and 10563 repositories to repositories.csv.


In [None]:
import pandas as pd

# Load the repositories data from CSV
def load_repositories_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Get the most popular programming language
def most_popular_language(df):
    # Count occurrences of each programming language
    language_counts = df['language'].value_counts()

    # Get the most popular programming language
    most_popular = language_counts.idxmax()
    return most_popular, language_counts[most_popular]

# Main function
def main():
    # Specify the path to your CSV file
    file_path = "repositories.csv"  # Replace with your file path if needed

    # Load the repositories data
    repositories_df = load_repositories_data(file_path)

    # Find the most popular language
    language, count = most_popular_language(repositories_df)

    print(f"The most popular programming language is '{language}' with {count} repositories.")

if __name__ == "__main__":
    main()


The most popular programming language is 'Python' with 1278 repositories.


In [None]:
import pandas as pd

# Load the users data from CSV
def load_users_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Get the company with the most users
def most_common_company(df):
    # Count occurrences of each company
    company_counts = df['company'].value_counts()

    # Get the most common company
    most_common = company_counts.idxmax()
    count = company_counts.max()
    return most_common, count

# Main function
def main():
    # Specify the path to your CSV file
    file_path = "users.csv"  # Replace with your file path if needed

    # Load the users data
    users_df = load_users_data(file_path)

    # Find the most common company
    company, count = most_common_company(users_df)

    print(f"The majority of developers work at '{company}' with {count} developers.")

if __name__ == "__main__":
    main()


The majority of developers work at 'YANDEX' with 22 developers.


In [None]:
import pandas as pd

# Load the repositories data from CSV
def load_repositories_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Calculate the language with the highest average stars per repository
def highest_average_stars_language(df):
    # Group by language and calculate the average stars
    avg_stars_per_language = df.groupby('language')['stargazers_count'].mean()

    # Get the language with the highest average stars
    highest_avg_language = avg_stars_per_language.idxmax()
    highest_avg_stars = avg_stars_per_language.max()

    return highest_avg_language, highest_avg_stars

# Main function
def main():
    # Specify the path to your CSV file
    file_path = "repositories.csv"  # Replace with your file path if needed

    # Load the repositories data
    repos_df = load_repositories_data(file_path)

    # Find the language with the highest average stars
    language, avg_stars = highest_average_stars_language(repos_df)

    print(f"The programming language with the highest average stars per repository is '{language}' with an average of {avg_stars:.2f} stars.")

if __name__ == "__main__":
    main()


The programming language with the highest average stars per repository is 'Pascal' with an average of 551.57 stars.


In [None]:
import pandas as pd

# Load the repositories data from CSV
def load_repositories_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Get the top 3 most popular licenses
def top_licenses(df):
    # Filter out missing licenses
    filtered_df = df[df['license_name'].notna() & (df['license_name'] != '')]

    # Count occurrences of each license
    license_counts = filtered_df['license_name'].value_counts()

    # Get the top 3 licenses
    top_3_licenses = license_counts.nlargest(3).index.tolist()

    return top_3_licenses

# Main function
def main():
    # Specify the path to your CSV file
    file_path = "repositories.csv"  # Replace with your file path if needed

    # Load the repositories data
    repos_df = load_repositories_data(file_path)

    # Find the top 3 most popular licenses
    licenses = top_licenses(repos_df)

    # Print the result as comma-separated values
    print(f"The three most popular licenses are: {', '.join(licenses)}.")

if __name__ == "__main__":
    main()


The three most popular licenses are: mit, apache-2.0, other.


In [None]:
import pandas as pd

# Load the repositories data from CSV
def load_repositories_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Get the top 3 most popular licenses, excluding "Other"
def top_licenses_excluding_other(df):
    # Filter out missing licenses and exclude "Other"
    filtered_df = df[df['license_name'].notna() & (df['license_name'] != '') & (df['license_name'].str.lower() != 'other')]

    # Count occurrences of each license
    license_counts = filtered_df['license_name'].value_counts()

    # Get the top 3 licenses
    top_3_licenses = license_counts.nlargest(3).index.tolist()

    return top_3_licenses

# Main function
def main():
    # Specify the path to your CSV file
    file_path = "repositories.csv"  # Replace with your file path if needed

    # Load the repositories data
    repos_df = load_repositories_data(file_path)

    # Find the top 3 most popular licenses excluding "Other"
    licenses = top_licenses_excluding_other(repos_df)

    # Print the result as comma-separated values
    print(f"The three most popular licenses (excluding 'Other') are: {', '.join(licenses)}.")

if __name__ == "__main__":
    main()


The three most popular licenses (excluding 'Other') are: mit, apache-2.0, gpl-3.0.


In [None]:
import pandas as pd

# Load the users data from CSV
def load_users_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path)
    return df

# Calculate leader strength for each user
def calculate_leader_strength(df):
    # Calculate leader strength
    df['leader_strength'] = df['followers'] / (1 + df['following'])
    return df

# Get the top 5 users by leader strength
def top_users_by_leader_strength(df):
    # Sort the DataFrame by leader_strength in descending order



SyntaxError: incomplete input (<ipython-input-12-2d995397d9b7>, line 18)

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Get the top 5 users by leader_strength
top_leaders = users_df.nlargest(5, 'leader_strength')['login']

# Print the result as a comma-separated string
print(', '.join(top_leaders))


AlexGyver, alexey-goloburdin, yandex, esokolov, yandexdataschool


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Convert created_at to datetime for accurate sorting
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Get the 5 earliest registered users
earliest_users = users_df.nsmallest(5, 'created_at')['login']

# Print the result as a comma-separated string
print(', '.join(earliest_users))


maxlapshin, veged, alexeyr, alec-c4, alno


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Calculate average following for hireable users
hireable_avg = users_df[users_df['hireable'] == 'true']['following'].mean()

# Calculate average following for non-hireable users
non_hireable_avg = users_df[users_df['hireable'] != 'true']['following'].mean()

# Calculate the difference
difference = hireable_avg - non_hireable_avg

# Print the result to 3 decimal places
print(f"{difference:.3f}")


-29.087


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_


AttributeError: 'Series' object has no attribute 'value_'

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
# Use value_counts() instead of value_ to get the counts of unique values
weekend_user_counts = weekend_repos['login'].value_counts()

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_counts()

# Get the top 5 users
top_5_users = weekend_user_counts.head(5)

# Print the top 5 users' logins in order, comma-separated
print(', '.join(top_5_users.index.tolist()))


MoonW1nd, mazzy-ax, 40ants, rwsh, developersu


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Drop rows with missing names
users_df = users_df.dropna(subset=['name'])

# Extract surnames
users_df['surname'] = users_df['name'].apply(lambda x: x.strip().split()[-1])

# Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count
most_common_surnames = surname_counts[surname_counts == max_count]

# Print the most common surnames, alphabetically sorted
print(', '.join(sorted(most_common_surnames.index.tolist())))

# Print the number of users with the most common surname
print(max_count)


Romanov
4


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert has_projects and has_wiki to numeric values (True=1, False=0)
repos_df['has_projects_numeric'] = repos_df['has_projects'].map({'true': 1, 'false': 0})
repos_df['has_wiki_numeric'] = repos_df['has_wiki'].map({'true': 1, 'false': 0})

# Calculate the correlation
correlation = repos_df['has_projects_numeric'].corr(repos_df['has_wiki_numeric'])

# Print the correlation rounded to three decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: nan


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Convert created_at to datetime for sorting
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by created_at in ascending order and get the top 5 users
earliest_users = users_df.sort_values(by='created_at').head(5)

# Extract the 'login' column and join them in a comma-separated format
earliest_logins = ','.join(earliest_users['login'])

# Print the result
print(f"The 5 earliest registered GitHub users in Moscow: {earliest_logins}")


FileNotFoundError: [Errno 2] No such file or directory: 'users.csv'

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Convert the 'created_at' column to datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by 'created_at' in ascending order and select the first 5 users
earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)

# Get the 'login' values of the earliest users as a comma-separated string
earliest_logins = ','.join(earliest_users['login'].tolist())
print(earliest_logins)



maxlapshin,veged,alexeyr,alec-c4,alno


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Filter out rows with missing or empty license names
filtered_repos = repos_df[repos_df['license_name'].notna() & (repos_df['license_name'] != "")]

# Count the occurrences of each license and get the top 3
top_licenses = filtered_repos['license_name'].value_counts().head(3).index.tolist()

# Join the license names as a comma-separated string
popular_licenses = ','.join(top_licenses)
print(popular_licenses)


mit,apache-2.0,other


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Define leader_strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Extract the login values and join them as a comma-separated string
top_leaders_logins = ','.join(top_leaders['login'])
print(top_leaders_logins)


AlexGyver,alexey-goloburdin,yandex,esokolov,yandexdataschool


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')  # Adjust the file name if needed

# Calculate the correlation between 'followers' and 'public_repos'
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the result rounded to 3 decimal places
print(f"Correlation between followers and public_repos: {correlation:.3f}")


Correlation between followers and public_repos: 0.052


In [None]:
import pandas as pd
from scipy.stats import linregress

# Load the users data from CSV
users_df = pd.read_csv('users.csv')  # Adjust the file name if needed

# Perform linear regression with 'public_repos' as the predictor and 'followers' as the response
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on repos: {slope:.3f}")


Regression slope of followers on repos: 0.212


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')  # Adjust the file name if needed

# Convert 'has_projects' and 'has_wiki' to boolean values for correlation calculation
repos_df['has_projects'] = repos_df['has_projects'].astype(bool)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(bool)

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: 0.390


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Ensure that the 'has_projects' and 'has_wiki' columns are in boolean format for correlation calculation
repos_df['has_projects'] = repos_df['has_projects'].apply(lambda x: True if x == 'true' else False)
repos_df['has_wiki'] = repos_df['has_wiki'].apply(lambda x: True if x == 'true' else False)

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between projects and wiki enabled: {correlation:.3f}")


Correlation between projects and wiki enabled: nan


  c /= stddev[:, None]


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Convert 'hireable' column to boolean
users_df['hireable'] = users_df['hireable'].apply(lambda x: True if x == 'true' else False)

# Calculate average following for hireable users
avg_following_hireable = users_df[users_df['hireable']]['following'].mean()

# Calculate average following for non-hireable users
avg_following_non_hireable = users_df[~users_df['hireable']]['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

# Print the result rounded to 3 decimal places
print(f"Difference in average following: {difference:.3f}")


Difference in average following: -28.952


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_counts()

# Get the top 5 users with the most repositories created on weekends
top_users = weekend_user_counts.head(5)

# Get their logins in a comma-separated string
top_users_logins = ', '.join(top_users.index)

# Print the result
print(f"Top 5 users who created the most repositories on weekends: {top_users_logins}")


Top 5 users who created the most repositories on weekends: MoonW1nd, mazzy-ax, 40ants, rwsh, developersu


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Drop rows with missing names
users_df = users_df.dropna(subset=['name'])

# Extract surnames (last word in the name)
users_df['surname'] = users_df['name'].apply(lambda x: x.strip().split()[-1])

# Count occurrences of each surname
surname_counts = users_df['surname'].value_counts()

# Get the most common surnames
most_common_surnames = surname_counts[surname_counts == surname_counts.max()]

# Format the result
common_surnames_list = ', '.join(sorted(most_common_surnames.index))
number_of_users = most_common_surnames.max()

# Print the results
print(f"Most common surname(s): {common_surnames_list}")
print(f"Number of users with the most common surname: {number_of_users}")


Most common surname(s): Romanov
Number of users with the most common surname: 4


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_counts()

# Get the top 5 users
top_5_weekend_users = weekend_user_counts.head(5)

# Format the result as a comma-separated string
top_5_user_logins = ', '.join(top_5_weekend_users.index)

# Print the result
print(f"Top 5 users who created the most repositories on weekends: {top_5_user_logins}")


Top 5 users who created the most repositories on weekends: MoonW1nd, mazzy-ax, 40ants, rwsh, developersu


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime (ensure it's in UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_counts()

# Get the top 5 users
top_5_weekend_users = weekend_user_counts.head(5)

# Format the result as a comma-separated string
top_5_user_logins = ', '.join(top_5_weekend_users.index)

# Print the result
print(f"Top 5 users who created the most repositories on weekends (UTC): {top_5_user_logins}")


Top 5 users who created the most repositories on weekends (UTC): MoonW1nd, mazzy-ax, 40ants, rwsh, developersu


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime (ensure it's in UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user
weekend_user_counts = weekend_repos['login'].value_counts()

# Get the top 10 users
top_10_weekend_users = weekend_user_counts.head(10)

# Format the result as a comma-separated string
top_10_user_logins = ', '.join(top_10_weekend_users.index)

print(top_10_user_logins)


MoonW1nd, mazzy-ax, 40ants, rwsh, developersu, kuggaa, stek29, vanyasem, leech001, carlcastanas


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert created_at to datetime (ensure it's in UTC)
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Filter for repositories created on weekends (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.dayofweek >= 5]

# Count the number of repositories created by each user and sort in descending order
weekend_user_counts = weekend_repos['login'].value_counts().sort_values(ascending=False)

# Get the top 10 users
top_10_weekend_users = weekend_user_counts.head(10)

# Format the result as a comma-separated string
top_10_user_logins = ', '.join(top_10_weekend_users.index)

print(top_10_user_logins)


MoonW1nd, 40ants, rwsh, mazzy-ax, developersu, kuggaa, stek29, vanyasem, leech001, carlcastanas


In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Calculate the total number of hireable users and those without emails
total_hireable = users_df[users_df['hireable'] == 'true']
total_not_hireable = users_df[users_df['hireable'] == 'false']

# Count users with email in both groups
hireable_with_email = total_hireable['email'].notnull().sum()
not_hireable_with_email = total_not_hireable['email'].notnull().sum()

# Calculate the fractions
fraction_hireable_with_email = hireable_with_email / len(total_hireable) if len(total_hireable) > 0 else 0
fraction_not_hireable_with_email = not_hireable_with_email / len(total_not_hireable) if len(total_not_hireable) > 0 else 0

# Calculate the difference
difference = round(fraction_hireable_with_email - fraction_not_hireable_with_email, 3)

print(difference)


0.672


In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Calculate the length of the bio in words
users_df['bio_word_count'] = users_df['bio'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)

# Filter out users without a bio
filtered_df = users_df[users_df['bio_word_count'] > 0]

# Select the relevant columns for regression
X = filtered_df[['bio_word_count']]  # Independent variable (bio word count)
y = filtered_df['followers']          # Dependent variable (number of followers)

# Add a constant to the independent variable for the regression model
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the regression slope (coefficient for bio_word_count)
slope = model.params['bio_word_count']

# Print the slope rounded to 3 decimal places
print(f"{slope:.3f}")



0.516


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Convert 'has_projects' and 'has_wiki' to boolean if they are not already
repos_df['has_projects'] = repos_df['has_projects'].astype(bool)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(bool)

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].corr(filtered_repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"{correlation:.3f}")


0.390


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Inspect the initial rows and data types
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Convert 'has_projects' and 'has_wiki' to boolean
# Assuming they might be string representations of boolean values
repos_df['has_projects'] = repos_df['has_projects'].str.lower().map({'true': True, 'false': False})
repos_df['has_wiki'] = repos_df['has_wiki'].str.lower().map({'true': True, 'false': False})

# Check if conversion was successful
print("\nData Types After Conversion:")
print(repos_df.dtypes)

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Check how many rows are left after filtering
print(f"\nRows after filtering: {len(filtered_repos_df)}")

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].corr(filtered_repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

AttributeError: Can only use .str accessor with string values!

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Inspect the initial rows and data types
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Check how many missing values are present in 'has_projects' and 'has_wiki'
print(f"\nMissing values in 'has_projects': {repos_df['has_projects'].isnull().sum()}")
print(f"Missing values in 'has_wiki': {repos_df['has_wiki'].isnull().sum()}")

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Check how many rows are left after filtering
print(f"\nRows after filtering: {len(filtered_repos_df)}")

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].corr(filtered_repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Check initial data types and first few rows
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].astype(int).corr(filtered_repos_df['has_wiki'].astype(int))

# Print the correlation rounded to 3 decimal places
print(f"Correlation between having projects enabled and having wiki enabled: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Check initial data types and first few rows
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Ensure 'has_projects' and 'has_wiki' are boolean
repos_df['has_projects'] = repos_df['has_projects'].astype(bool)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(bool)

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].astype(int).corr(filtered_repos_df['has_wiki'].astype(int))

# Print the correlation rounded to 3 decimal places
print(f"Correlation between having projects enabled and having wiki enabled: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Ensure 'has_projects' and 'has_wiki' are boolean
repos_df['has_projects'] = repos_df['has_projects'].astype(bool)
repos_df['has_wiki'] = repos_df['has_wiki'].astype(bool)

# Drop rows with missing values in 'has_projects' or 'has_wiki'
filtered_repos_df = repos_df.dropna(subset=['has_projects', 'has_wiki'])

# Calculate the correlation between 'has_projects' and 'has_wiki'
correlation = filtered_repos_df['has_projects'].astype(int).corr(filtered_repos_df['has_wiki'].astype(int))

# Print the correlation rounded to 3 decimal places
print(f"Correlation between having projects enabled and having wiki enabled: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# Calculate the average following for hireable users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average following for non-hireable users
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

# Print the result rounded to 3 decimal places
print(f"Average following for hireable users minus non-hireable users: {difference:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# Remove rows where 'following' is NaN
users_df = users_df[users_df['following'].notna()]

# Calculate the average following for hireable users
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()

# Calculate the average following for non-hireable users
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()

# Check if any average is NaN and replace with 0 if necessary
avg_following_hireable = avg_following_hireable if not pd.isna(avg_following_hireable) else 0
avg_following_non_hireable = avg_following_non_hireable if not pd.isna(av


SyntaxError: incomplete input (<ipython-input-31-0f19f48cfebb>, line 23)

In [None]:
import pandas as pd
from scipy import stats

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the length of the bio in words
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()

# Filter out rows where the bio word count is zero
users_with_bios = users_with_bios[users_with_bios['bio_word_count'] > 0]

# Perform linear regression: followers on bio word count
slope, intercept, r_value, p_value, std_err = stats.linregress(users_with_bios['bio_word_count'], users_with_bios['followers'])

# Print the regression slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()


In [None]:
import pandas as pd
from scipy import stats

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# Ensure 'bio' and 'followers' are present in the DataFrame
if 'bio' not in users_df.columns or 'followers' not in users_df.columns:
    raise ValueError("Missing required columns in the input data.")

# Filter out users without bios
users_with_bios = users_df[users_df['bio'].notna() & users_df['bio'].str.strip().ne('')]

# Calculate the length of the bio in words
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()

# Remove entries with zero followers or bio_word_count
users_with_bios = users_with_bios[(users_with_bios['followers'] > 0) & (users_with_bios['bio_word_count'] > 0)]

# Perform linear regression: followers on bio word count
slope, intercept, r_value, p_value, std_err = stats.linregress(users_with_bios['bio_word_count'], users_with_bios['followers'])

# Print the regression slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().str.len()


In [None]:
import pandas as pd

# Load the repositories data from CSV
repos_df = pd.read_csv('repositories.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(repos_df.dtypes)
print("\nInitial Rows:")
print(repos_df.head())

# Ensure 'has_projects' and 'has_wiki' are boolean
# If they are not boolean, you might need to convert them
# Assuming they are already boolean based on previous checks
# Otherwise uncomment the below lines
# repos_df['has_projects'] = repos_df['has_projects'].astype(bool)
# repos_df['has_wiki'] = repos_df['has_wiki'].astype(bool)

# Calculate the correlation
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between projects and wikis enabled: {correlation:.3f}")


Initial Data Types:
login               object
full_name           object
created_at          object
stargazers_count     int64
watchers_count       int64
language            object
has_projects          bool
has_wiki              bool
license_name        object
dtype: object

Initial Rows:
       login            full_name            created_at  stargazers_count  \
0  AlexGyver      AlexGyver/3dpov  2020-04-18T12:58:52Z                 7   
1  AlexGyver  AlexGyver/AC_Dimmer  2017-10-04T22:27:47Z                47   
2  AlexGyver    AlexGyver/AiFrame  2024-07-17T11:52:58Z                14   
3  AlexGyver    AlexGyver/Aim-Fan  2018-05-28T13:03:26Z                16   
4  AlexGyver  AlexGyver/AlexGyver  2020-10-15T23:12:38Z                50   

   watchers_count language  has_projects  has_wiki license_name  
0               7      C++          True      True          mit  
1              47        C          True      True          NaN  
2              14        C          True      T

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# 1. Identify users with bios and calculate the length of their bios in words
# Ignore missing bios
users_df['bio_length'] = users_df['bio'].str.split().str.len()

# 2. Filter out users without bios
filtered_users = users_df[users_df['bio_length'].notnull()]

# 3. Calculate the correlation between bio length and followers
correlation = filtered_users['bio_length'].corr(filtered_users['followers'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between bio length and followers: {correlation:.3f}")

# For regression slope, you can use statsmodels or sklearn (optional)
import statsmodels.api as sm

# Prepare data for regression analysis
X = filtered_users['bio_length']
y = filtered_users['followers']

# Add a constant for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope coefficient (bio_length)
slope = model.params['bio_length']

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

In [None]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# 1. Identify users with bios and calculate the length of their bios in Unicode words
# Ignore missing bios
users_df['bio_length'] = users_df['bio'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# 2. Filter out users without bios
filtered_users = users_df[users_df['bio_length'] > 0]

# 3. Calculate the correlation between bio length and followers
correlation = filtered_users['bio_length'].corr(filtered_users['followers'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between bio length and followers: {correlation:.3f}")

# For regression slope, you can use statsmodels
import statsmodels.api as sm

# Prepare data for regression analysis
X = filtered_users['bio_length']
y = filtered_users['followers']

# Add a constant for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope coefficient (bio_length)
slope = model.params['bio_length']

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data from CSV
users_df = pd.read_csv('users.csv')

# Check initial data types and first few rows (for debugging purposes)
print("Initial Data Types:")
print(users_df.dtypes)
print("\nInitial Rows:")
print(users_df.head())

# 1. Calculate the length of bios in Unicode words, ignoring NaN
users_df['bio_length'] = users_df['bio'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)

# 2. Filter out users without bios and also check for NaN in followers
filtered_users = users_df[(users_df['bio_length'] > 0) & (users_df['followers'].notna())]

# 3. Calculate the correlation between bio length and followers
correlation = filtered_users['bio_length'].corr(filtered_users['followers'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between bio length and followers: {correlation:.3f}")

# 4. Prepare data for regression analysis, ensuring no NaN values
X = filtered_users['bio_length']
y = filtered_users['followers']

# Add a constant for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope coefficient (bio_length)
slope = model.params['bio_length']

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Initial Data Types:
login           object
name            object
company         object
location        object
email           object
hireable        object
bio             object
public_repos     int64
followers        int64
following        int64
created_at      object
dtype: object

Initial Rows:
               login                  name  \
0          AlexGyver                  Alex   
1       carlcastanas  Carl Andrew Castañas   
2  sergeyshaykhullin    Sergey Shaykhullin   
3  alexey-goloburdin     Alexey Goloburdin   
4     richardroberti       Richard Roberti   

                                      company        location  \
0                                         NaN          Moscow   
1                                  ICREATECHS          Moscow   
2                                         NaN          Moscow   
3  TO.DIGITAL, SALESBEAT.PRO, DIGITALIZE.TEAM  Moscow, Russia   
4                                      E CORP  Moscow, Russia   

               email hireable 

In [None]:
import requests
import pandas as pd

# GitHub API setup
GITHUB_TOKEN = "your_github_token"  # Replace with your GitHub token
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Step 1: Search for users in Moscow with >50 followers
def fetch_users():
    url = "https://api.github.com/search/users"
    params = {"q": "location:Moscow followers:>50", "per_page": 100}
    users = []
    page = 1

    while True:
        params["page"] = page
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            fetched_users = response.json().get("items", [])
            if not fetched_users:
                break

            users.extend(fetched_users)
            print(f"Fetched page {page}: {len(fetched_users)} users.")
            page += 1
        else:
            print(f"Failed to fetch users: {response.status_code} - {response.text}")
            break

    print(f"Total users fetched: {len(users)}.")
    return [user["login"] for user in users]

# Step 2: Get user details
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    data = response.json()

    # Clean company name
    company = data.get("company", "")
    if company:
        company = company.replace("@", "").strip().upper()

    # Prepare user details using the SAME values as in the API response
    return {
        "login": data.get("login", ""),
        "name": data.get("name", ""),
        "company": company,
        "location": data.get("location", ""),
        "email": data.get("email", "") if data.get("email") is not None else "",
        "hireable": str(data.get("hireable", False)).lower(),  # Convert to 'true' or 'false'
        "bio": data.get("bio", ""),
        "public_repos": data.get("public_repos", 0),
        "followers": data.get("followers", 0),
        "following": data.get("following", 0),
        "created_at": data.get("created_at", "")
    }

# Step 3: Fetch user repositories
def get_user_repos(username):
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    repos = response.json()[:500]  # Limit to 500 repos

    repo_data = []
    for repo in repos:
        repo_data.append({
            "login": username,
            "full_name": repo.get("full_name", ""),
            "created_at": repo.get("created_at", ""),
            "stargazers_count": repo.get("stargazers_count", 0),
            "watchers_count": repo.get("watchers_count", 0),
            "language": repo.get("language", ""),
            "has_projects": str(repo.get("has_projects", False)).lower(),  # Convert to 'true' or 'false'
            "has_wiki": str(repo.get("has_wiki", False)).lower(),  # Convert to 'true' or 'false'
            "license_name": repo.get("license", {}).get("key", "") if repo.get("license") is not None else ""
        })

    return repo_data

# Step 4: Save data to CSV files
def save_to_csv(users, repos):
    users_df = pd.DataFrame(users)
    repos_df = pd.DataFrame(repos)

    # Replace None with empty string for all string columns in users_df and repos_df
    users_df.fillna("", inplace=True)
    repos_df.fillna("", inplace=True)

    # Save to CSV
    users_df.to_csv("users.csv", index=False)
    repos_df.to_csv("repositories.csv", index=False)
    print(f"Saved {len(users)} users to users.csv and {len(repos)} repositories to repositories.csv.")

# Main function
def main():
    users_data = []
    repos_data = []

    # Fetch users and details
    usernames = fetch_users()
    for username in usernames:
        user_details = get_user_details(username)
        users_data.append(user_details)

        # Fetch repositories for each user
        user_repos = get_user_repos(username)
        repos_data.extend(user_repos)

    # Save data to CSV files
    save_to_csv(users_data, repos_data)

if __name__ == "__main__":
    main()


Failed to fetch users: 401 - {"message":"Bad credentials","documentation_url":"https://docs.github.com/rest","status":"401"}
Total users fetched: 0.
Saved 0 users to users.csv and 0 repositories to repositories.csv.


In [None]:
import requests
import pandas as pd

# GitHub API setup
GITHUB_TOKEN = "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"  # Replace with your GitHub token
headers = {"Authorization": f"token {GITHUB_TOKEN}"}

# Step 1: Search for users in Moscow with >50 followers
def fetch_users():
    url = "https://api.github.com/search/users"
    params = {"q": "location:Moscow followers:>50", "per_page": 100}  # Request 100 per page
    users = []
    page = 1

    while True:
        params["page"] = page
        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            fetched_users = response.json().get("items", [])
            if not fetched_users:
                break  # Exit loop if no more users are fetched

            users.extend(fetched_users)
            print(f"Fetched page {page}: {len(fetched_users)} users.")
            page += 1
        else:
            print(f"Failed to fetch users: {response.status_code} - {response.text}")
            break

    print(f"Total users fetched: {len(users)}.")
    return [user["login"] for user in users]

# Step 2: Get user details
def get_user_details(username):
    url = f"https://api.github.com/users/{username}"
    response = requests.get(url, headers=headers)
    data = response.json()

    # Clean company name
    company = data.get("company", "")
    if company:
        company = company.replace("@", "").strip().upper()

    # Prepare user details using the SAME values as in the API response
    return {
        "login": data.get("login", ""),
        "name": data.get("name", ""),
        "company": company,
        "location": data.get("location", ""),
        "email": data.get("email", ""),
        "hireable": str(data.get("hireable", False)).lower(),  # Convert to 'true' or 'false'
        "bio": data.get("bio", ""),
        "public_repos": data.get("public_repos", 0),
        "followers": data.get("followers", 0),
        "following": data.get("following", 0),
        "created_at": data.get("created_at", "")
    }

# Step 3: Fetch user repositories
def get_user_repos(username):
    url = f"https://api.github.com/users/{username}/repos"
    response = requests.get(url, headers=headers)
    repos = response.json()[:500]  # Limit to 500 repos

    repo_data = []
    for repo in repos:
        repo_data.append({
            "login": username,  # User's login
            "full_name": repo.get("full_name", ""),
            "created_at": repo.get("created_at", ""),
            "stargazers_count": repo.get("stargazers_count", 0),
            "watchers_count": repo.get("watchers_count", 0),
            "language": repo.get("language", ""),
            "has_projects": str(repo.get("has_projects", False)).lower(),  # Convert to 'true' or 'false'
            "has_wiki": str(repo.get("has_wiki", False)).lower(),  # Convert to 'true' or 'false'
            "license_name": repo.get("license", {}).get("key", "") if repo.get("license") is not None else ""
        })

    return repo_data

# Step 4: Save data to CSV files
def save_to_csv(users, repos):
    users_df = pd.DataFrame(users)
    repos_df = pd.DataFrame(repos)

    # Replace None with empty string for all string columns in users_df and repos_df
    users_df.fillna("", inplace=True)
    repos_df.fillna("", inplace=True)

    # Save to CSV
    users_df.to_csv("users.csv", index=False)
    repos_df.to_csv("repositories.csv", index=False)
    print(f"Saved {len(users)} users to users.csv and {len(repos)} repositories to repositories.csv.")

# Step 5: Create README.md
def create_readme():
    with open("README.md", "w") as f:
        f.write("- Data on GitHub users in Moscow with over 50 followers was scraped via GitHub API.\n")
        f.write("- Analyzing the data showed an unexpectedly high number of JavaScript repositories.\n")
        f.write("- Developers should consider making their projects hireable to attract more followers.\n")
        f.write("\n## About This Project\n")
        f.write("This project collects data on GitHub users in Moscow who have over 50 followers and provides insights into their repositories, programming languages, and affiliations. This analysis helps uncover trends among active GitHub users in the region.\n")

# Main function
def main():
    users_data = []
    repos_data = []

    # Fetch users and details
    usernames = fetch_users()
    for username in usernames:
        user_details = get_user_details(username)
        users_data.append(user_details)

        # Fetch repositories for each user
        user_repos = get_user_repos(username)
        repos_data.extend(user_repos)

    # Save data to CSV files
    save_to_csv(users_data, repos_data)
    create_readme()

if __name__ == "__main__":
    main()

Fetched page 1: 100 users.
Fetched page 2: 100 users.
Fetched page 3: 100 users.
Fetched page 4: 100 users.
Fetched page 5: 60 users.
Total users fetched: 460.
Saved 460 users to users.csv and 10615 repositories to repositories.csv.


In [None]:
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv("repositories.csv")

# Ensure the relevant columns are in boolean format (True/False)
repos_df['has_projects'] = repos_df['has_projects'].map({'true': True, 'false': False})
repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': True, 'false': False})

# Calculate the correlation
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between projects enabled and wiki enabled: {correlation:.3f}")


Correlation between projects enabled and wiki enabled: nan


In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the users data
users_df = pd.read_csv("users.csv")

# Filter out users without a bio
users_with_bios = users_df.dropna(subset=['bio'])

# Calculate word count of each bio, using Unicode words (split by whitespace)
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)

# Prepare the data for regression: bio word count and followers
X = users_with_bios[['bio_word_count']].values  # Feature: bio word count
y = users_with_bios['followers'].values  # Target: followers

# Fit the linear regression model
model = LinearRegression().fit(X, y)

# Get the regression slope (impact of each additional word in bio on followers)
slope = model.coef_[0]

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 0.523


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)


In [4]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the users data
users_df = pd.read_csv("users.csv")

# Filter out users without a bio
users_with_bios = users_df[users_df['bio'].notna()]

# Calculate the word count of each bio (splitting by whitespace)
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)

# Extract the relevant columns for regression
X = users_with_bios[['bio_word_count']]  # Predictor: bio word count
y = users_with_bios['followers']         # Target: followers count

# Fit a linear regression model
model = LinearRegression().fit(X, y)

# Get the slope (impact of bio word count on followers)
slope = model.coef_[0]

# Print the slope rounded to 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)


In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Load the users data
users_df = pd.read_csv("users.csv")

# Filter out rows where 'bio' is missing or empty
users_with_bios = users_df[users_df['bio'].notna() & (users_df['bio'].str.strip() != "")]

# Calculate the bio word count (in Unicode words, split by whitespace)
users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)

# Prepare data for regression: bio word count (X) and followers (y)
X = users_with_bios[['bio_word_count']]  # Independent variable
y = users_with_bios['followers']         # Dependent variable

# Initialize and fit the linear regression model
model = LinearRegression().fit(X, y)

# Get the regression slope (impact of each additional bio word on follower count)
slope = model.coef_[0]

# Print the result rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 0.523


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_word_count'] = users_with_bios['bio'].str.split().apply(len)


In [6]:
import pandas as pd

# Load the users data
users_df = pd.read_csv("users.csv")

# Filter for users who have an email
users_with_email = users_df[users_df['email'] != ""]

# Calculate fraction of users with email for hireable = true
hireable_with_email_fraction = users_with_email[users_with_email['hireable'] == 'true'].shape[0] / users_df[users_df['hireable'] == 'true'].shape[0]

# Calculate fraction of users with email for hireable = false
non_hireable_with_email_fraction = users_with_email[users_with_email['hireable'] == 'false'].shape[0] / users_df[users_df['hireable'] == 'false'].shape[0]

# Calculate the difference
email_fraction_difference = hireable_with_email_fraction - non_hireable_with_email_fraction

# Print the result to 3 decimal places
print(f"{email_fraction_difference:.3f}")


ZeroDivisionError: division by zero

In [7]:
import pandas as pd

# Load the users data
users_df = pd.read_csv("users.csv")

# Filter for users who have an email
users_with_email = users_df[users_df['email'] != ""]

# Calculate fraction of users with email for hireable = true, checking if there are any hireable users
if users_df[users_df['hireable'] == 'true'].shape[0] > 0:
    hireable_with_email_fraction = users_with_email[users_with_email['hireable'] == 'true'].shape[0] / users_df[users_df['hireable'] == 'true'].shape[0]
else:
    hireable_with_email_fraction = 0  # Set to 0 if no hireable users

# Calculate fraction of users with email for hireable = false, checking if there are any non-hireable users
if users_df[users_df['hireable'] == 'false'].shape[0] > 0:
    non_hireable_with_email_fraction = users_with_email[users_with_email['hireable'] == 'false'].shape[0] / users_df[users_df['hireable'] == 'false'].shape[0]
else:
    non_hireable_with_email_fraction = 0  # Set to 0 if no non-hireable users

# Calculate the difference
email_fraction_difference = hireable_with_email_fraction - non_hireable_with_email_fraction

# Print the result to 3 decimal places
print(f"{email_fraction_difference:.3f}")


1.000


In [8]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv('users.csv')  # Adjust the file name if needed

# Calculate the correlation between 'followers' and 'public_repos'
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the result rounded to 3 decimal places
print(f"Correlation between followers and public_repos: {correlation:.3f}")

Correlation between followers and public_repos: 0.051


In [9]:
import pandas as pd

# Load the users data from CSV
users_df = pd.read_csv("users.csv")

# Step 1: Handle empty emails by replacing empty strings with NaN
users_df['email'] = users_df['email'].replace('', pd.NA)

# Step 2: Separate users into hireable and non-hireable groups
hireable_users = users_df[users_df['hireable'] == 'true']
non_hireable_users = users_df[users_df['hireable'] == 'false']

# Step 3: Count users with valid email addresses in each group
hireable_with_email = hireable_users['email'].notna().sum()  # Count non-null emails for hireable
non_hireable_with_email = non_hireable_users['email'].notna().sum()  # Count non-null emails for non-hireable

# Step 4: Calculate the number of users in each group
total_hireable_users = len(hireable_users)
total_non_hireable_users = len(non_hireable_users)

# Step 5: Calculate the fractions of users with email addresses
fraction_hireable_with_email = (
    hireable_with_email / total_hireable_users if total_hireable_users > 0 else 0
)
fraction_non_hireable_with_email = (
    non_hireable_with_email / total_non_hireable_users if total_non_hireable_users > 0 else 0
)

# Step 6: Calculate the difference in fractions
email_difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Step 7: Print the results rounded to three decimal places
print(f"Fraction of users with email when hireable = true: {fraction_hireable_with_email:.3f}")
print(f"Fraction of users with email when hireable = false: {fraction_non_hireable_with_email:.3f}")
print(f"Difference in email sharing: {email_difference:.3f}")

# Step 8: Validate the results
if total_hireable_users == 0:
    print("No hireable users found.")
if total_non_hireable_users == 0:
    print("No non-hireable users found.")


Fraction of users with email when hireable = true: 0.672
Fraction of users with email when hireable = false: 0.000
Difference in email sharing: 0.672
No non-hireable users found.
