In [1]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict

class GitHubScraper:
    def __init__(self, token: str):
        self.headers = {
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_url = 'https://api.github.com'

        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def _make_request(self, url: str, params: dict = None) -> Dict:
        while True:
            response = requests.get(url, headers=self.headers, params=params)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                reset_time = int(response.headers.get('X-RateLimit-Reset', 0))
                sleep_time = max(reset_time - time.time(), 0) + 1
                self.logger.warning(f"Rate limit hit. Sleeping for {sleep_time} seconds")
                time.sleep(sleep_time)
            else:
                self.logger.error(f"Error {response.status_code}: {response.text}")
                response.raise_for_status()

    def clean_company_name(self, company: str) -> str:
        if not company:
            return ""
        cleaned = company.strip().lstrip('@')
        return cleaned.upper()

    def search_users(self, location: str, min_followers: int) -> List[Dict]:
        users = []
        page = 1

        while True:
            self.logger.info(f"Fetching users page {page}")
            query = f"location:{location} followers:>={min_followers}"
            params = {'q': query, 'per_page': 100, 'page': page}
            url = f"{self.base_url}/search/users"
            response = self._make_request(url, params)

            if not response['items']:
                break

            self.logger.info(f"Found {len(response['items'])} users on page {page}")

            for user in response['items']:
                user_data = self._make_request(user['url'])
                cleaned_data = {
                    'login': user_data['login'],
                    'name': user_data['name'] if user_data['name'] else "",
                    'company': self.clean_company_name(user_data.get('company')),
                    'location': user_data['location'] if user_data['location'] else "",
                    'email': user_data['email'] if user_data['email'] else "",
                    'hireable': user_data['hireable'] if user_data['hireable'] is not None else False,
                    'bio': user_data['bio'] if user_data['bio'] else "",
                    'public_repos': user_data['public_repos'],
                    'followers': user_data['followers'],
                    'following': user_data['following'],
                    'created_at': user_data['created_at']
                }
                users.append(cleaned_data)

            page += 1

        self.logger.info(f"Total users fetched: {len(users)}")
        return users

    def get_user_repositories(self, username: str, max_repos: int = 500) -> List[Dict]:
        repos = []
        page = 1

        while len(repos) < max_repos:
            self.logger.info(f"Fetching repositories for {username}, page {page}")
            params = {'sort': 'pushed', 'direction': 'desc', 'per_page': 100, 'page': page}
            url = f"{self.base_url}/users/{username}/repos"
            response = self._make_request(url, params)

            if not response:
                break

            for repo in response:
                repo_data = {
                    'login': username,
                    'full_name': repo['full_name'],
                    'created_at': repo['created_at'],
                    'stargazers_count': repo['stargazers_count'],
                    'watchers_count': repo['watchers_count'],
                    'language': repo['language'] if repo['language'] else "",
                    'has_projects': repo['has_projects'],
                    'has_wiki': repo['has_wiki'],
                    'license_name': repo['license']['key'] if repo.get('license') else ""
                }
                repos.append(repo_data)

            if len(response) < 100:
                break

            page += 1

        return repos[:max_repos]

def main():
    token = "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"  # Replace with your actual GitHub token

    # Initialize scraper
    scraper = GitHubScraper(token)

    # Search for users in Stockholm with >100 followers
    users = scraper.search_users(location='Stockholm', min_followers=100)

    # Save users to CSV
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Get repositories for each user
    all_repos = []
    for user in users:
        repos = scraper.get_user_repositories(user['login'])
        all_repos.extend(repos)

    # Save repositories to CSV
    repos_df = pd.DataFrame(all_repos)
    repos_df.to_csv('repositories.csv', index=False)

    print(f"Scraped {len(users)} users and {len(all_repos)} repositories")

    # Create README.md
    with open('README.md', 'w') as f:
        f.write(f"""# GitHub Users in Stockholm

This repository contains data about GitHub users in Stockholm with over 100 followers and their repositories.

## Files

1. `users.csv`: Contains information about {len(users)} GitHub users in Stockholm with over 100 followers
2. `repositories.csv`: Contains information about {len(all_repos)} public repositories from these users
3. `fetch_users.py`: Python script used to collect this data

## Data Collection

- Data collected using GitHub API
- Date of collection: {time.strftime('%Y-%m-%d')}
- Only included users with 100+ followers
- Up to 500 most recently pushed repositories per user
""")

if __name__ == "__main__":
    main()


Scraped 411 users and 35520 repositories


In [None]:
from google.colab import files

# Download the users.csv and repositories.csv files
files.download('users.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

# Download the repositories.csv file
files.download('repositories.csv')


Q1.

In [None]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Sort the users by followers in descending order and get the top 5
top_users = users_df.sort_values(by='followers', ascending=False).head(5)

# Get the logins of the top users
top_user_logins = ', '.join(top_users['login'].tolist())

print("Top 5 users in Stockholm with the highest number of followers:")
print(top_user_logins)


Top 5 users in Stockholm with the highest number of followers:
emmabostian, emilk, mpj, hrydgard, eriklindernoren


Q2.

In [None]:
import pandas as pd

# Load the users.csv file
users = pd.read_csv('users.csv')

# Sort by followers in descending order and get the top 5
top_users = users.sort_values(by='followers', ascending=False).head(5)

# Get the login of the top 5 users
top_user_logins = ', '.join(top_users['login'].tolist())
print(top_user_logins)


emmabostian, emilk, mpj, hrydgard, eriklindernoren


Q3.

In [None]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Convert 'created_at' to datetime
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Get the top 5 earliest registered users
earliest_users = users_df.nsmallest(5, 'created_at')

# Get the login names in ascending order
earliest_user_logins = ', '.join(earliest_users['login'].tolist())

# Output the result
print("Earliest registered users in Stockholm:", earliest_user_logins)


Earliest registered users in Stockholm: Mange, kallepersson, fesplugas, etnt, pirelenito


Q4.

In [None]:
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv('repositories.csv')

# Filter out missing licenses
licenses = repos_df['license_name'].dropna()

# Count the occurrences of each license
license_counts = licenses.value_counts()

# Get the top 3 most popular licenses
top_3_licenses = license_counts.head(3).index.tolist()

# Output the result
print("The 3 most popular licenses are:", ', '.join(top_3_licenses))



The 3 most popular licenses are: mit, apache-2.0, other


Q5.

In [None]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Clean up the company names
users_df['company'] = users_df['company'].str.strip().str.lstrip('@').str.upper()

# Count the occurrences of each company
company_counts = users_df['company'].value_counts()

# Get the company with the highest count
most_common_company = company_counts.idxmax()
most_common_count = company_counts.max()

# Output the result
print("The majority of developers work at:", most_common_company)
print("Number of developers at this company:", most_common_count)


In [None]:
import pandas as pd

# Load the repositories.csv file
repos = pd.read_csv('repositories.csv')

# Drop rows where the language is NaN (no language specified)
repos = repos.dropna(subset=['language'])

# Count the occurrences of each language
language_counts = repos['language'].value_counts()

# Get the most popular language
most_popular_language = language_counts.idxmax()
print(f"The most popular programming language is: {most_popular_language}")


The most popular programming language is: JavaScript


Q6.

In [None]:
import pandas as pd

# Load users and repositories data
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# Filter users who joined after 2020
users_after_2020 = users_df[pd.to_datetime(users_df['created_at']) > '2020-01-01']

# Merge with repositories data to get repos only for users who joined after 2020
repos_after_2020 = repos_df[repos_df['login'].isin(users_after_2020['login'])]

# Filter out missing languages
languages_after_2020 = repos_after_2020['language'].dropna()

# Count occurrences of each language
language_counts_after_2020 = languages_after_2020.value_counts()

# Get the second most popular language
second_most_popular_language = language_counts_after_2020.index[1]

# Output the result
print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)


The second most popular programming language among users who joined after 2020 is: TypeScript


Q7.

In [None]:
import pandas as pd

# Load repositories data
repos_df = pd.read_csv('repositories.csv')

# Filter out rows with missing language or zero stars
repos_with_stars = repos_df[repos_df['stargazers_count'] > 0].dropna(subset=['language'])

# Group by language and calculate the average star count per language
avg_stars_per_language = repos_with_stars.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average stars
top_language_by_avg_stars = avg_stars_per_language.idxmax()
top_avg_stars = avg_stars_per_language.max()

# Output the result
print("The language with the highest average number of stars per repository is:", top_language_by_avg_stars)
print("Average stars per repository:", top_avg_stars)


The language with the highest average number of stars per repository is: RAML
Average stars per repository: 981.0


Q8.

In [None]:
import pandas as pd

# Load users data
users_df = pd.read_csv('users.csv')

# Calculate leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and get the top 5 users
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Output the top 5 users' login in a comma-separated format
top_leader_logins = ", ".join(top_leaders['login'])
print("Top 5 users by leader_strength:", top_leader_logins)


Top 5 users by leader_strength: spotify, Mojang, fornwall, joearms, EmbarkStudios


Q9.

In [None]:
import pandas as pd

# Load the users data
users_df = pd.read_csv('users.csv')

# Calculate the correlation between followers and public_repos
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the correlation rounded to 3 decimal places
print("Correlation between followers and public repositories:", round(correlation, 3))


Correlation between followers and public repositories: 0.035


Q10.

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the users data
users_df = pd.read_csv('users.csv')

# Define the independent (X) and dependent (Y) variables
X = users_df['public_repos']
Y = users_df['followers']

# Add a constant to the independent variable for the intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(Y, X).fit()

# Get the slope (coefficient of public_repos)
slope = model.params['public_repos']

print("Estimated increase in followers per additional repository:", round(slope, 3))


Estimated increase in followers per additional repository: 0.228


Q11.

In [None]:

repos = pd.read_csv('repositories.csv')

if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': 'TRUE', 'false': 'FALSE'})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': 'TRUE', 'false': 'FALSE'})

correlation = repos['has_projects'].corr(repos['has_wiki'])

print(round(correlation, 3))

0.375


In [None]:
import pandas as pd
# Load the CSV file
csv_file = 'repositories.csv'  # Replace with the correct path

# Load the CSV into a DataFrame
data = pd.read_csv(csv_file)
# Assuming 'data' is your DataFrame with binary columns 'projects_enabled' and 'wiki_enabled'
correlation = data['has_projects'].corr(data['has_wiki'])
print(f"The correlation between projects and wiki enabled is: {correlation:.4f}")


The correlation between projects and wiki enabled is: 0.3749


Q12.

In [None]:
import pandas as pd
data = pd.read_csv('users.csv')

# Assuming 'data' is your DataFrame
average_hireable = data[data['hireable'] == True]['following'].mean()
average_non_hireable = data[data['hireable'] == False]['following'].mean()

# Calculate the difference
difference = average_hireable - average_non_hireable

print(f"The average following for hireable users minus the average following for non-hireable users is: {difference:.3f}")


The average following for hireable users minus the average following for non-hireable users is: 48.389


Q13.

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the CSV file
csv_file = 'users.csv'  # Ensure this path is correct

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Check the first few rows and the data types of the DataFrame
print("DataFrame Overview:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())

# Filter out users without bios
df = df[df['bio'].notnull()]

# Calculate the length of each bio in words
df['bio_word_count'] = df['bio'].str.split().str.len()

# Prepare the independent variable (X) and dependent variable (y)
X = df['bio_word_count']
y = df['followers']  # Adjust the column name as per your dataset

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope (coefficient of the bio_word_count)
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")

DataFrame Overview:
             login                      name           company  \
0      emmabostian              Emma Bostian           SPOTIFY   
1            emilk           Emil Ernerfeldt    RERUN.IO, EGUI   
2              mpj  Mattias Petter Johansson  FUN FUN FUNCTION   
3         hrydgard            Henrik Rydgård               NaN   
4  eriklindernoren         Erik Linder-Norén               NaN   

            location                      email  hireable  \
0  Stockholm, Sweden                        NaN     False   
1  Stockholm, Sweden  emil.ernerfeldt@gmail.com     False   
2  Stockholm, Sweden                        NaN      True   
3  Stockholm, Sweden         hrydgard@gmail.com     False   
4  Stockholm, Sweden  eriklindernoren@gmail.com     False   

                                                 bio  public_repos  followers  \
0          Front-end Software Engineer @ Spotify\r\n            61       6473   
1       Rust coder, creator of egui, CTO of rerun.io  

Q14.

In [2]:
import pandas as pd

# Load the repositories data
repos_df = pd.read_csv('repositories.csv')

# Convert the 'created_at' column to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter to keep only weekend entries (Saturday: 5, Sunday: 6)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]

# Count repositories created by each user on weekends
top_weekend_users = weekend_repos['login'].value_counts().head(5)

# List of top 5 users' logins
top_users_logins = ', '.join(top_weekend_users.index)
print(f"Top 5 users who created the most repositories on weekends: {top_users_logins}")


Top 5 users who created the most repositories on weekends: HaraldNordgren, Nyholm, lydell, linhduongtuan, LinusU


Q15.

In [None]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.16870967741935483

Q16.

In [3]:
import pandas as pd
users = pd.read_csv('users.csv')

new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))


Gustafsson,Persson
