In [None]:
import pandas as pd

# Read the two CSV exports into DataFrames: users_df from users.csv and
# repos_df from repositories.csv.
# NOTE(review): the absolute /content/ paths look like a Colab upload location — confirm
# before running elsewhere.
users_df, repos_df = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/users.csv", "/content/repositories.csv")
)


Question 1: Top 5 Users with the Highest Number of Followers


In [None]:
# Rank all users by follower count, largest first, and keep the first five rows
top_followers = users_df.sort_values("followers", ascending=False).iloc[:5]
# Comma-separated logins, in ranking order
top_followers_logins = ",".join(top_followers["login"])
print("Top 5 users by followers:", top_followers_logins)


Top 5 users by followers: midudev,ai,raysan5,vfarcic,spite


Question 2: 5 Earliest Registered GitHub Users

In [None]:
# Order users by account creation time (oldest first) and keep the first five.
# NOTE(review): created_at is sorted exactly as loaded from the CSV; if it is a string
# column, the order is chronological only for a sortable format such as ISO-8601 — confirm.
earliest_users = users_df.sort_values("created_at", ascending=True).iloc[:5]
earliest_users_logins = ",".join(earliest_users["login"])
print("Earliest registered users:", earliest_users_logins)


Earliest registered users: oleganza,gravityblast,fesplugas,fxn,pauek


Question 3: Top 3 Most Popular Licenses

In [None]:
# Repositories without a license are ignored; the rest are tallied per license name
known_licenses = repos_df["license_name"].dropna()
# value_counts() orders by frequency (descending), so the first three are the most used
popular_licenses = known_licenses.value_counts().iloc[:3]
popular_licenses_names = ",".join(popular_licenses.index)
print("Top 3 licenses:", popular_licenses_names)


Top 3 licenses: mit,apache-2.0,other


Question 4: Company with the Majority of Developers

In [None]:
# Most frequent non-null value of the `company` column.
# NOTE(review): no normalisation (case, whitespace, leading "@") happens here; this
# presumably relies on the CSV already holding cleaned company names — confirm.
company_modes = users_df["company"].dropna().mode()
# mode() returns every tied value in sorted order; take the first one
majority_company = company_modes[0]
print("Company with majority of developers:", majority_company)


Company with majority of developers: FREELANCE


Question 5: Most Popular Programming Language

In [None]:
# Most frequent non-null repository language across all repositories
language_modes = repos_df["language"].dropna().mode()
# If several languages tie, mode() lists them sorted; the first entry is reported
popular_language = language_modes[0]
print("Most popular programming language:", popular_language)


Most popular programming language: JavaScript


Question 6: Second Most Popular Language Among Users Joined After 2020

In [None]:
# Users whose created_at is on or after 2020-01-01.
# NOTE(review): the cutoff is inclusive of the year 2020, while the question says
# "after 2020" — confirm whether 2021-01-01 was intended.
joined_recently = users_df["created_at"] >= "2020-01-01"
recent_users = users_df.loc[joined_recently]

# Repositories owned by those users, matched on login
recent_user_logins = list(recent_users["login"])
owned_by_recent_user = repos_df["login"].isin(recent_user_logins)
recent_repos = repos_df.loc[owned_by_recent_user]

# Rank languages by repository count (missing languages dropped); position 1 is the runner-up
language_ranking = recent_repos["language"].dropna().value_counts()
second_popular_language = language_ranking.index[1]
print("Second most popular language after 2020:", second_popular_language)


Second most popular language after 2020: Python


Question 7: Language with the Highest Average Stars

In [None]:
# Mean stargazers_count for each language (one value per language label)
mean_stars_by_language = repos_df.groupby("language")["stargazers_count"].mean()
# The label of the largest mean is the answer
avg_stars_language = mean_stars_by_language.idxmax()
print("Language with highest average stars:", avg_stars_language)


Language with highest average stars: Vim Script


Question 8: Top 5 in Terms of Leader Strength

In [None]:
# leader_strength = followers / (1 + following); the +1 keeps the denominator non-zero
# for users who follow nobody. The column is stored on users_df.
users_df["leader_strength"] = users_df["followers"].div(users_df["following"].add(1))
# Five users with the largest leader strength, strongest first
top_leader_strength = users_df.sort_values("leader_strength", ascending=False).iloc[:5]
top_leader_strength_logins = ",".join(top_leader_strength["login"])
print("Top 5 by leader strength:", top_leader_strength_logins)


Top 5 by leader strength: midudev,vfarcic,spite,amix,cfenollosa


Question 9: Correlation Between Followers and Public Repos

In [None]:
from scipy.stats import pearsonr

# Pearson correlation between follower count and number of public repositories.
# pearsonr returns (coefficient, p-value); only the coefficient is reported.
followers_vs_repos = pearsonr(users_df["followers"], users_df["public_repos"])
corr_followers_repos = followers_vs_repos[0]
print(f"Correlation between followers and public repos: {corr_followers_repos:.3f}")


Correlation between followers and public repos: 0.071


Question 10: Regression Slope of Followers on Public Repos

In [None]:
from scipy.stats import linregress

# Least-squares fit with public_repos as x and followers as y; the slope is the
# change in followers per additional public repository.
followers_on_repos = linregress(users_df["public_repos"], users_df["followers"])
slope = followers_on_repos.slope
print(f"Regression slope of followers on public repos: {slope:.3f}")


Regression slope of followers on public repos: 1.027


Question 11: Correlation Between Projects and Wiki Enabled

In [None]:
from scipy.stats import pearsonr

# Cast both feature flags to integers (True/False -> 1/0), overwriting the
# columns on repos_df
for flag_column in ("has_projects", "has_wiki"):
    repos_df[flag_column] = repos_df[flag_column].astype(int)

# Pearson correlation of the two 0/1 columns; index 0 of the result is the coefficient
corr_projects_wiki = pearsonr(repos_df["has_projects"], repos_df["has_wiki"])[0]

print(f"Correlation between having projects enabled and having wikis enabled: {corr_projects_wiki}")


Correlation between having projects enabled and having wikis enabled: 0.3174612492863113


Question 12: Difference in Following Count for Hireable Users

In [None]:
# Mask that is True only where `hireable` equals True; every other value,
# including a missing one, lands in the non-hireable group.
is_hireable = users_df["hireable"] == True
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[~is_hireable]

# Mean number of accounts followed within each group
avg_following_hireable = hireable_users["following"].mean()
avg_following_non_hireable = non_hireable_users["following"].mean()

# Positive result => hireable users follow more accounts on average
following_difference = avg_following_hireable - avg_following_non_hireable
print(f"Average difference in following between hireable and non-hireable users: {following_difference:.3f}")


Average difference in following between hireable and non-hireable users: 294.549


Question 13: Regression Slope of Followers on Bio Word Count

In [None]:
from scipy.stats import linregress

# Keep only users who have a bio. The explicit .copy() makes this an independent
# frame, so adding a column below does not write to a slice of users_df (the
# cause of pandas' SettingWithCopyWarning).
users_with_bio_df = users_df.dropna(subset=["bio"]).copy()

# Word count = number of whitespace-separated tokens in each bio
users_with_bio_df["bio_word_count"] = users_with_bio_df["bio"].str.split().str.len()

# Least-squares fit with bio_word_count as x and followers as y; the slope is the
# change in followers per additional bio word.
followers_on_bio_words = linregress(
    users_with_bio_df["bio_word_count"], users_with_bio_df["followers"]
)
slope = followers_on_bio_words.slope

print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 13.488


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio_df['bio_word_count'] = users_with_bio_df['bio'].apply(lambda x: len(x.split()))


Question 14: Most Repositories Created on Weekends

In [None]:
# Parse repository creation timestamps (overwrites repos_df["created_at"]) and
# flag rows created on Saturday (5) or Sunday (6).
# NOTE(review): the weekday is taken in whatever timezone the parsed values carry —
# presumably UTC for GitHub timestamps; confirm.
repos_df["created_at"] = pd.to_datetime(repos_df["created_at"])
repos_df["is_weekend"] = repos_df["created_at"].dt.dayofweek.isin([5, 6])

# Number of weekend-created repositories per owner, five largest counts first
weekend_repos = repos_df.loc[repos_df["is_weekend"]]
weekend_repos_count = (
    weekend_repos.groupby("login").size().sort_values(ascending=False).head(5)
)
weekend_top_users = ",".join(weekend_repos_count.index)
print("Top 5 users by weekend repositories:", weekend_top_users)


Top 5 users by weekend repositories: nilportugues,kinow,ajsb85,vfarcic,wlsf82


Question 15: Email Sharing Difference Between Hireable and Non-Hireable

In [None]:
# Treat a missing `hireable` flag as False and a missing email as an empty string.
# Both columns are overwritten on users_df.
users_df["hireable"] = users_df["hireable"].fillna(False)
users_df["email"] = users_df["email"].fillna("")

# True where the user has a non-empty email; the mean of a boolean Series is the
# fraction of True values, computed separately for each group.
has_email = users_df["email"].ne("")
email_hireable_fraction = has_email[users_df["hireable"] == True].mean()
email_non_hireable_fraction = has_email[users_df["hireable"] == False].mean()

# Positive result => hireable users share an email address more often
email_sharing_difference = email_hireable_fraction - email_non_hireable_fraction
print(f"Difference in email sharing between hireable and non-hireable users: {email_sharing_difference}")




Difference in email sharing between hireable and non-hireable users: 0.09537037037037038


Question 16: Most Common Surname

In [None]:
def _last_name_token(full_name):
    """Return the last whitespace-separated word of `full_name`, or None if it has none.

    Returning None (rather than indexing an empty list) keeps empty or
    whitespace-only names from raising IndexError; such rows simply get no surname.
    """
    words = str(full_name).split()
    return words[-1] if words else None


# Surname = last word of the display name. Missing names are skipped, so their
# rows stay NaN in the new `surname` column on users_df.
users_df["surname"] = users_df["name"].dropna().map(_last_name_token)

# Tally surnames (missing values are not counted) and keep every surname that
# reaches the highest count, so ties are all reported.
most_common_surnames = users_df["surname"].value_counts()
max_count = most_common_surnames.max()
top_surnames = most_common_surnames[most_common_surnames == max_count].index.tolist()
# Alphabetical order makes the tie output deterministic
top_surnames_sorted = ",".join(sorted(top_surnames))

print("Most common surname(s):", top_surnames_sorted)


Most common surname(s): Martínez,Ortiz
