In [None]:
import json
import os
import time

import requests

# GitHub API token, read from the GITHUB_TOKEN environment variable so the
# secret is never stored in the notebook. Set it before starting the kernel.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# Only send an Authorization header when a token is actually available;
# an empty token would otherwise be sent as a malformed credential.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}

# Base URL for the GitHub API
GITHUB_API_URL = 'https://api.github.com'


def search_users(city, followers_min, per_page=30, page=1):
    """Search GitHub users by location and minimum follower count.

    Args:
        city: Value used for the ``location:`` search qualifier.
        followers_min: Users must have strictly more followers than this
            (the qualifier sent is ``followers:>N``).
        per_page: Number of results requested per page.
        page: 1-based page number to fetch.

    Returns:
        The ``items`` list of the JSON response on HTTP 200. On any other
        status, or when the request itself fails, the error is printed and an
        empty list is returned.
    """
    query = f"location:{city} followers:>{followers_min}"
    # Let requests build the query string so the space, ':' and '>' in the
    # search expression are percent-encoded instead of pasted into the URL.
    params = {'q': query, 'per_page': per_page, 'page': page}
    try:
        response = requests.get(
            f'{GITHUB_API_URL}/search/users',
            headers=HEADERS,
            params=params,
            timeout=30,  # never hang the whole scrape on one stalled call
        )
    except requests.RequestException as exc:
        print(f"Error: request failed, {exc}")
        return []

    if response.status_code == 200:
        return response.json()['items']

    print(f"Error: {response.status_code}, {response.text}")
    return []


def get_user_repos(username):
    """Get all repositories of a GitHub user.

    The endpoint is paginated, so pages are requested one after another until
    a page comes back with fewer entries than were asked for.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of repository dicts as returned by the API. If a request fails
        or returns a non-200 status, the error is printed and whatever was
        collected so far (possibly an empty list) is returned.
    """
    page_size = 100
    repos = []
    url = f'{GITHUB_API_URL}/users/{username}/repos'
    page = 1

    while True:
        try:
            response = requests.get(
                url,
                headers=HEADERS,
                params={'per_page': page_size, 'page': page},
                timeout=30,
            )
        except requests.RequestException as exc:
            print(f"Error: request failed, {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        batch = response.json()
        repos.extend(batch)
        # A short (or empty) page means there is nothing left to fetch.
        if len(batch) < page_size:
            break
        page += 1

    return repos


def _get_user_profile(username):
    """Fetch the full profile of one GitHub user.

    Returns the decoded JSON dict on HTTP 200; otherwise prints the error and
    returns None.
    """
    try:
        response = requests.get(
            f'{GITHUB_API_URL}/users/{username}', headers=HEADERS, timeout=30
        )
    except requests.RequestException as exc:
        print(f"Error: request failed, {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error: {response.status_code}, {response.text}")
    return None


def main():
    """Collect Barcelona users with more than 100 followers and their repos.

    Pages through the user search, fetches each matching user's full profile
    and repositories, and writes the results to 'barcelona_users.json' and
    'barcelona_repos.json' in the working directory.
    """
    city = "Barcelona"
    followers_min = 100
    per_page = 30  # You can set a maximum of 100 per page
    current_page = 1
    all_users = []

    # Keep requesting search pages until one comes back empty (or errors).
    while True:
        users = search_users(city, followers_min, per_page, current_page)
        if not users:
            break
        all_users.extend(users)
        current_page += 1
        time.sleep(1)  # To avoid rate-limiting

    users_data = []
    repos_data = []
    for user in all_users:
        username = user['login']

        # Search hits do not carry profile fields such as 'name' (reading them
        # raised KeyError: 'name'), so the full profile is fetched per user.
        profile = _get_user_profile(username)
        if profile is None:
            continue  # error already printed; skip this user

        users_data.append({
            'login': username,
            'name': profile.get('name'),
            # Stored as returned by the API. The original wrapped this value in
            # a call on the user dict, which cannot work; if a clean-up step
            # was intended, apply it here.
            'company': profile.get('company'),
            'location': profile.get('location'),
            'email': profile.get('email'),
            'hireable': profile.get('hireable'),
            'bio': profile.get('bio'),
            'public_repos': profile.get('public_repos'),
            'followers': profile.get('followers'),
            'following': profile.get('following'),
            'created_at': profile.get('created_at'),
        })

        # get_user_repos returns a list, so emit one row per repository.
        for repo in get_user_repos(username):
            license_info = repo.get('license')
            repos_data.append({
                'login': username,
                'full_name': repo.get('full_name'),
                'created_at': repo.get('created_at'),
                'stargazers_count': repo.get('stargazers_count'),
                'watchers_count': repo.get('watchers_count'),
                'language': repo.get('language'),
                'has_projects': repo.get('has_projects'),
                'has_wiki': repo.get('has_wiki'),
                'license_name': license_info['key'] if license_info else None,
            })
        time.sleep(1)  # To avoid rate-limiting

    # Save the data to JSON files
    with open('barcelona_users.json', 'w') as f:
        json.dump(users_data, f, indent=4)

    with open('barcelona_repos.json', 'w') as f:
        json.dump(repos_data, f, indent=4)

    print(f"Data for {len(users_data)} users saved to 'barcelona_users.json'.")

    print(f"Data for {len(repos_data)} repos saved to 'barcelona_repos.json'.")


if __name__ == '__main__':
    main()


KeyError: 'name'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Who are the top 5 users in Barcelona with the highest number of followers? List their login in order, comma-separated.

In [None]:
# Read the scraped user table from users_git.csv
users_df = pd.read_csv('users_git.csv')

# Boolean mask: location mentions "Barcelona" (case-insensitive); rows with a
# missing location count as non-matches.
is_barcelona = users_df['location'].str.contains("Barcelona", case=False, na=False)
barcelona_users_df = users_df[is_barcelona]

# Five Barcelona users with the most followers, highest first
top_5_users = barcelona_users_df.sort_values(by='followers', ascending=False).head(5)

# Comma-separated logins; the bare name on the last line displays the result
top_5_logins = ', '.join(top_5_users['login'].tolist())
top_5_logins


'midudev, ai, raysan5, vfarcic, spite'

Who are the 5 earliest registered GitHub users in Barcelona? List their login in ascending order of created_at, comma-separated.

In [None]:
# Parse the registration timestamps into datetimes (stored back on users_df)
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Oldest accounts first, keep the first five rows
earliest_users = users_df.sort_values('created_at').head(5)

# Logins of those five accounts, in registration order
earliest_logins = list(earliest_users['login'])

print(', '.join(earliest_logins))

oleganza, gravityblast, fesplugas, fxn, pauek


What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [None]:
# Read the scraped repository table from repositories_git.csv
repositories_df = pd.read_csv('repositories_git.csv')

# Tally licenses, leaving out repositories without a license_name
license_names = repositories_df['license_name'].dropna()
licenses_count = license_names.value_counts()

# value_counts sorts by frequency, so the first three labels are the top 3
top_3_licenses = list(licenses_count.index[:3])

# Comma-separated result, displayed as the cell output
", ".join(top_3_licenses)

'mit, apache-2.0, other'

Which company do the majority of these developers work at?

In [None]:
# How many users list each company (users without a company are left out)
company_names = users_df['company'].dropna()
company_count = company_names.value_counts()

# Label with the highest count
majority_company = company_count.idxmax()

majority_company


'FREELANCE'

Which programming language is most popular among these users?

In [None]:
# Number of repositories per language (repositories with no language are left out)
repo_languages = repositories_df['language'].dropna()
language_count = repo_languages.value_counts()

# Language that appears on the most repositories
most_popular_language = language_count.idxmax()

most_popular_language


'JavaScript'

Which programming language is the second most popular among users who joined after 2020?

In [None]:
# Logins of users whose created_at is later than 2020-01-01.
# NOTE(review): this cutoff also keeps accounts created during 2020 — confirm
# that is the intended reading of "after 2020".
joined_after_cutoff = users_df['created_at'] > '2020-01-01'
users_after_2020 = users_df.loc[joined_after_cutoff, 'login']

# Repositories owned by those users
owned_by_recent_users = repositories_df['login'].isin(users_after_2020)
repositories_after_2020 = repositories_df[owned_by_recent_users]

# Language frequencies among those repositories, most common first
language_count_after_2020 = repositories_after_2020['language'].dropna().value_counts()

# Position 1 is the runner-up
second_most_popular_language = language_count_after_2020.index[1]

second_most_popular_language


'Python'

Which language has the highest average number of stars per repository?

In [None]:
# Mean stargazers_count per language, ignoring repositories with no language
repos_with_language = repositories_df.dropna(subset=['language'])
stars_by_language = repos_with_language.groupby('language')['stargazers_count']
avg_stars_per_language = stars_by_language.mean()

# Language whose repositories average the most stars
language_highest_avg_stars = avg_stars_per_language.idxmax()

language_highest_avg_stars


'Vim Script'

Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.


In [None]:
# leader_strength = followers / (1 + following); the +1 keeps the denominator
# non-zero for users who follow nobody. The column is added to users_df.
following_plus_one = 1 + users_df['following']
users_df['leader_strength'] = users_df['followers'] / following_plus_one

# Rank every user from strongest to weakest
top_users = users_df.sort_values(by='leader_strength', ascending=False)

# Logins of the five strongest users, joined for display
top_5_logins = top_users['login'].head(5).tolist()
top_5_logins_str = ', '.join(top_5_logins)
print(f"Top 5 Users by Leader Strength: {top_5_logins_str}")


Top 5 Users by Leader Strength: midudev, vfarcic, spite, amix, cfenollosa


What is the correlation between the number of followers and the number of public repositories among users in Barcelona?
Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# Rows whose location mentions Barcelona (case-insensitive, missing -> excluded)
in_barcelona = users_df['location'].str.contains('Barcelona', case=False, na=False)
barcelona_users = users_df[in_barcelona]

# Correlation between follower count and public repository count
# (Series.corr with its default method)
follower_counts = barcelona_users['followers']
public_repo_counts = barcelona_users['public_repos']
correlation_followers_repos = follower_counts.corr(public_repo_counts)

# Show the value to three decimal places
round(correlation_followers_repos, 3)


0.071

Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.
Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
import statsmodels.api as sm

# Keep only rows where both regression variables are present
repos_followers = users_df[['public_repos', 'followers']].dropna()

# Response: followers. Predictor: public_repos plus an intercept column.
Y = repos_followers['followers']
X = sm.add_constant(repos_followers['public_repos'])

# Ordinary least squares fit of followers on public_repos
model = sm.OLS(Y, X).fit()

# Full results table; the public_repos coefficient is the slope
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.706
Date:                Wed, 30 Oct 2024   Prob (F-statistic):              0.192
Time:                        08:30:01   Log-Likelihood:                -2962.9
No. Observations:                 336   AIC:                             5930.
Df Residuals:                     334   BIC:                             5938.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          341.0215    111.390      3.062   

Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?
Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [None]:
# Correlation between the has_projects and has_wiki flags across repositories
projects_enabled = repositories_df['has_projects']
wiki_enabled = repositories_df['has_wiki']
correlation = projects_enabled.corr(wiki_enabled)
correlation

0.3323124084332627

Do hireable users follow more people than those who are not hireable?
Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
# Convert 'hireable' column to boolean, where NaN is considered False
users_df['hireable'] = users_df['hireable'].fillna(False).astype(bool)

# Calculate the average following for hireable and non-hireable users
average_following_hireable = users_df[users_df['hireable']]['following'].mean()
average_following_non_hireable = users_df[~users_df['hireable']]['following'].mean()

# Difference between the two averages, rounded to three decimal places
difference = round(average_following_hireable - average_following_non_hireable, 3)
average_following_hireable, average_following_non_hireable, difference


  users_df['hireable'] = users_df['hireable'].fillna(False).astype(bool)


(394.56302521008405, 96.34562211981567, 298.217)

Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)
Regression slope of followers on bio length in characters (to 3 decimal places, e.g. 12.345 or -12.345)

In [None]:
from sklearn.linear_model import LinearRegression

# Users that have a bio: not missing and not the empty string.
# Work on a copy so the new column stays off users_df.
has_bio = users_df['bio'].notna() & (users_df['bio'] != '')
users_with_bio = users_df[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# Single-feature design matrix (n rows, 1 column) and the response
X = users_with_bio['bio_len'].to_numpy().reshape(-1, 1)
y = users_with_bio['followers']

# Fit followers ~ bio_len and display the slope
lr2 = LinearRegression()
lr2.fit(X, y)
lr2.coef_[0]

1.6185526415090765

In [None]:
import statsmodels.api as sm

# Users with a bio. The explicit .copy() makes this an independent frame, so
# adding a column below no longer triggers SettingWithCopyWarning.
users_with_bio = users_df[users_df['bio'].notna()].copy()

# Length of each bio in characters (vectorised string length)
users_with_bio['bio_length'] = users_with_bio['bio'].str.len()

# Predictor: bio_length plus an intercept column. Response: followers.
X = sm.add_constant(users_with_bio['bio_length'])
Y = users_with_bio['followers']

# Ordinary least squares fit of followers on bio_length
model = sm.OLS(Y, X).fit()

# Full results table; the bio_length coefficient is the slope
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.003
Method:                 Least Squares   F-statistic:                    0.3221
Date:                Wed, 30 Oct 2024   Prob (F-statistic):              0.571
Time:                        08:30:46   Log-Likelihood:                -2181.3
No. Observations:                 243   AIC:                             4367.
Df Residuals:                     241   BIC:                             4374.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        391.8983    230.027      1.704      0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_length'] = users_with_bio['bio'].apply(len)


Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [None]:
import csv
from collections import Counter
from datetime import datetime

# login -> number of repositories whose created_at falls on a Saturday or Sunday
weekend_repo_counts = Counter()

with open('repositories_git.csv', 'r', encoding='utf-8') as file:
    for row in csv.DictReader(file):
        created_at = row.get('created_at', '')
        if not created_at:
            continue  # no timestamp recorded for this repository

        # The last character is dropped before parsing — presumably a trailing
        # 'Z' (UTC marker); verify against the CSV.
        created_date = datetime.fromisoformat(created_at[:-1])

        # weekday(): Monday is 0, so 5 and 6 are Saturday and Sunday
        if created_date.weekday() >= 5:
            weekend_repo_counts[row['login']] += 1

# Five logins with the most weekend-created repositories, highest first
top_users = weekend_repo_counts.most_common(5)
top_logins = [login for login, _ in top_users]

print(','.join(top_logins))

Hamadabcn,vieron,julianxhokaxhiu,domini-code,ctford


Do people who are hireable share their email addresses more often?


In [None]:
# Convert 'hireable' to a plain boolean column where a missing value counts as
# False. Casting to the nullable 'boolean' dtype before fillna avoids filling an
# object-dtype column directly, which pandas deprecates with a FutureWarning;
# the conversion is a no-op when the column is already boolean.
# NOTE(review): assumes the column holds only True/False/missing — confirm.
users_df['hireable'] = users_df['hireable'].astype('boolean').fillna(False).astype(bool)

# A user "has an email" when the field is present and not the empty string
users_df['has_email'] = users_df['email'].notna() & (users_df['email'] != '')

# Fraction of users with an email among hireable users and among the rest
hireable_with_email = users_df[users_df['hireable']]['has_email'].mean()
non_hireable_with_email = users_df[~users_df['hireable']]['has_email'].mean()

# Hireable fraction minus non-hireable fraction, to three decimal places
difference = round(hireable_with_email - non_hireable_with_email, 3)

print(f"Difference: {difference}")


Difference: 0.089


Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)
Number of users with the most common surname

In [None]:
# Treat the last whitespace-separated word of each name as the surname.
# Users with a missing name are dropped first. .assign builds a new frame, so
# no column is written onto a slice of users_df (this previously raised
# SettingWithCopyWarning). str.split() with no separator also discards
# leading and trailing whitespace, which covers the "trim" requirement.
users_with_names = users_df.dropna(subset=['name']).assign(
    surname=lambda named: named['name'].str.split().str[-1]
)

# Count the occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Every surname that reaches the highest count (there may be ties)
max_count = surname_counts.max()
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Tied surnames listed alphabetically, shown together with the count
most_common_surnames_sorted = ', '.join(sorted(most_common_surnames))

most_common_surnames_sorted, max_count



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.split().str[-1]


('Martínez, Ortiz', 3)