In [None]:
import requests
import pandas as pd
from google.colab import userdata

# Never embed a personal access token in the notebook: it is saved with the
# file and leaks to anyone who can read it. Read it from Colab's secret store
# instead (sidebar "Secrets" panel). The secret name GITHUB_TOKEN is a
# convention chosen here - create a secret with that name and grant this
# notebook access. Any token that was previously pasted into this cell must
# be revoked on GitHub.
api_token = userdata.get('GITHUB_TOKEN')

# GitHub API URL
base_url = 'https://api.github.com'
# Authorization header sent with every request made by the cells below.
headers = {'Authorization': f'token {api_token}'}

# Page through the user search endpoint and collect the raw result items.
users_data = []
page = 1
while True:
    users_url = f"{base_url}/search/users?q=location:Tokyo+followers:>=200&page={page}&per_page=100"
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(users_url, headers=headers, timeout=30)
    # The search API only exposes the first 1000 results; a page beyond that
    # is answered with HTTP 422, which here simply means "no more pages".
    if response.status_code == 422:
        break
    # Any other failure (bad token, rate limit, ...) must stop the run instead
    # of silently producing a truncated user list.
    response.raise_for_status()
    data = response.json()
    if not data.get('items'):
        break
    users_data.extend(data['items'])
    page += 1

# Extract user info: the search results only carry a summary, so each user's
# full profile is fetched from the `url` field of the search item.
users = []
for user in users_data:
    user_response = requests.get(user['url'], headers=headers, timeout=30)
    # Fail with the real HTTP error (rate limit, 404, ...) rather than with a
    # KeyError on 'login' when the body is an error message.
    user_response.raise_for_status()
    user_info = user_response.json()

    # Clean up company name: trim whitespace, drop leading '@', upper-case.
    # A missing/null company is left as-is (falsy) and ends up blank in the CSV.
    company = user_info.get('company', '')
    if company:
        company = company.strip().lstrip('@').upper()

    users.append({
        'login': user_info['login'],
        'name': user_info['name'],
        'company': company,
        'location': user_info['location'],
        'email': user_info['email'],
        # Stored as the strings 'true'/'false'; a null hireable counts as 'false'.
        'hireable': 'true' if user_info['hireable'] else 'false',
        'bio': user_info['bio'],
        'public_repos': user_info['public_repos'],
        'followers': user_info['followers'],
        'following': user_info['following'],
        'created_at': user_info['created_at']
        })

# Convert to DataFrame and save as CSV
users_df = pd.DataFrame(users)
users_df.to_csv('users.csv', index=False)

# Fetch repositories for each user (most recently pushed first), keeping at
# most MAX_REPOS_PER_USER repositories per user.
MAX_REPOS_PER_USER = 500
REPOS_PER_PAGE = 100

repos = []
for user in users:
    page = 1
    user_repos = []
    # Checking the cap before requesting avoids one wasted call per user that
    # has already reached it.
    while len(user_repos) < MAX_REPOS_PER_USER:
        repos_url = f"{base_url}/users/{user['login']}/repos?sort=pushed&direction=desc&page={page}&per_page={REPOS_PER_PAGE}"
        repos_response = requests.get(repos_url, headers=headers, timeout=30)
        # An error body is a dict, not a list of repos; iterating it would
        # raise a confusing TypeError, so surface the HTTP error instead.
        repos_response.raise_for_status()
        repos_data = repos_response.json()
        if not repos_data:
            break
        # Only take as many repos from this page as the cap still allows.
        for repo in repos_data[:MAX_REPOS_PER_USER - len(user_repos)]:
            user_repos.append({
                'login': user['login'],
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': 'true' if repo['has_projects'] else 'false',
                'has_wiki': 'true' if repo['has_wiki'] else 'false',
                'license_name': repo['license']['name'] if repo['license'] else None
            })
        # A page shorter than the requested size is the last one; skip the
        # extra request that would only return an empty list.
        if len(repos_data) < REPOS_PER_PAGE:
            break
        page += 1
    repos.extend(user_repos)

# Convert to DataFrame and save as CSV
repos_df = pd.DataFrame(repos)
repos_df.to_csv('repositories.csv', index=False)



print("Data scraping and file creation completed.")
users_df.shape

KeyboardInterrupt: 

In [None]:
import csv
from datetime import datetime

# Load users from CSV and filter by Tokyo location
def load_users(file_path):
    """Read a users CSV and return the rows whose location is exactly Tokyo.

    The location is compared after trimming whitespace and lower-casing, so
    " TOKYO " matches while "Tokyo, Japan" does not.

    Args:
        file_path: Path to a UTF-8 CSV with at least the columns
            ``login``, ``location`` and ``created_at``.

    Returns:
        A list of dicts with keys ``login`` (str) and ``created_at``
        (``datetime`` parsed from the ``%Y-%m-%dT%H:%M:%SZ`` format), in file order.
    """
    def _is_tokyo(record):
        return record["location"].strip().lower() == "tokyo"

    with open(file_path, mode='r', encoding='utf-8') as handle:
        # The timestamp is only parsed for rows that pass the location filter.
        return [
            {
                "login": record["login"],
                "created_at": datetime.strptime(record["created_at"], "%Y-%m-%dT%H:%M:%SZ"),
            }
            for record in csv.DictReader(handle)
            if _is_tokyo(record)
        ]

def find_earliest_users(users, top_n=5):
    """Return the logins of the earliest-created users as one string.

    Args:
        users: Iterable of dicts carrying ``login`` and ``created_at``.
        top_n: How many of the earliest users to include.

    Returns:
        The selected logins, oldest first, joined with ", ".
    """
    by_age = list(users)
    # list.sort is stable, so users with equal timestamps keep input order.
    by_age.sort(key=lambda entry: entry["created_at"])
    return ", ".join(entry["login"] for entry in by_age[:top_n])

# Main execution: read users.csv from the working directory, keep the rows
# load_users accepts, and print the logins of the five earliest accounts
# (find_earliest_users is called with its default top_n=5).
# Note: this rebinds the notebook-global name `users`, which the scraping
# cell also assigns.
users = load_users("users.csv")
earliest_users_logins = find_earliest_users(users)
print("Earliest registered users in Tokyo:", earliest_users_logins)


Earliest registered users in Tokyo: mootoh, lhl, proppy, takuma104, javascripter


In [None]:
import csv
from collections import Counter

def load_and_count_companies(file_path):
    """Return the company that the most Tokyo users in the CSV work at.

    Only rows whose location is exactly "tokyo" (after trimming and
    lower-casing) are considered. Company names are trimmed and upper-cased
    before counting. Rows with a missing or blank company are skipped: they
    carry no information about an employer and would otherwise be lumped into
    a placeholder bucket that outvotes every real company.

    Args:
        file_path: Path to a UTF-8 CSV with ``location`` and ``company`` columns.

    Returns:
        The most frequent normalized company name (ties resolved by first
        appearance in the file), or "UNKNOWN" when no matching row names a company.
    """
    company_counts = Counter()
    with open(file_path, mode='r', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            # Check if the user is located in Tokyo (case insensitive)
            if row["location"].strip().lower() != "tokyo":
                continue
            # `or ""` also covers a None value from a short row.
            company = (row["company"] or "").strip().upper()
            if company:
                company_counts[company] += 1

    # Get the company with the most users
    majority_company = company_counts.most_common(1)
    return majority_company[0][0] if majority_company else "UNKNOWN"

# Main execution: count companies in users.csv (working directory) and print
# the name returned by load_and_count_companies.
majority_company = load_and_count_companies("users.csv")
print("The company with the most developers in Tokyo is:", majority_company)


The company with the most developers in Tokyo is: UNKNOWN


In [None]:
import csv
from datetime import datetime
from collections import Counter

def load_users_joined_after_2020(file_path):
    """Collect the logins of users whose account was created in 2021 or later.

    Args:
        file_path: Path to a UTF-8 CSV with ``login`` and ``created_at``
            columns; ``created_at`` uses the ``%Y-%m-%dT%H:%M:%SZ`` format.

    Returns:
        A set of login strings.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return {
            record["login"]
            for record in csv.DictReader(handle)
            # "After 2020" is decided on the calendar year alone.
            if datetime.strptime(record["created_at"], timestamp_format).year > 2020
        }

def count_languages_for_users(users, file_path):
    """Count repository languages for the given set of users.

    Repositories with a missing or blank language are skipped. Counting them
    under a placeholder name would let "no language" be ranked as if it were
    a programming language when the result is used for a popularity ranking.

    Args:
        users: Container of logins (a set is expected for fast membership tests).
        file_path: Path to a UTF-8 CSV with ``login`` and ``language`` columns.

    Returns:
        A ``Counter`` mapping language name (whitespace-trimmed) to the number
        of repositories owned by ``users`` written in it.
    """
    language_counts = Counter()
    with open(file_path, mode='r', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            # Only consider repositories of the requested users
            if row["login"] not in users:
                continue
            # `or ""` also covers a None value from a short row.
            language = (row["language"] or "").strip()
            if language:
                language_counts[language] += 1
    return language_counts

def find_second_most_popular_language(language_counts):
    """Return the language with the second-highest count.

    Args:
        language_counts: A ``Counter`` of language name to occurrences.

    Returns:
        The runner-up language name, or "Unknown" when fewer than two
        distinct languages were counted.
    """
    ranked = language_counts.most_common(2)
    if len(ranked) < 2:
        return "Unknown"
    _winner, runner_up = ranked
    return runner_up[0]

# Main execution: collect the logins created after 2020 from users.csv, count
# the languages of their repositories in repositories.csv, then print the
# second entry of that ranking. Both files are read from the working directory.
users_after_2020 = load_users_joined_after_2020("users.csv")
language_counts = count_languages_for_users(users_after_2020, "repositories.csv")
second_most_popular_language = find_second_most_popular_language(language_counts)

print("The second most popular programming language among users who joined after 2020 is:", second_most_popular_language)


The second most popular programming language among users who joined after 2020 is: TypeScript


In [None]:
import csv
from datetime import datetime
from collections import Counter

def count_languages_in_recent_repositories(file_path):
    """Count the languages of repositories created in 2021 or later.

    Repositories with a missing or blank language are skipped. Counting them
    under a placeholder name would let "no language" be ranked as if it were
    a programming language when the result is used for a popularity ranking.

    Args:
        file_path: Path to a UTF-8 CSV with ``created_at`` (format
            ``%Y-%m-%dT%H:%M:%SZ``) and ``language`` columns.

    Returns:
        A ``Counter`` mapping language name (whitespace-trimmed) to the number
        of matching repositories.
    """
    language_counts = Counter()
    with open(file_path, mode='r', encoding='utf-8') as file:
        for row in csv.DictReader(file):
            # Check if the repository was created after 2020 (calendar year only)
            created_at = datetime.strptime(row["created_at"], "%Y-%m-%dT%H:%M:%SZ")
            if created_at.year <= 2020:
                continue
            # `or ""` also covers a None value from a short row.
            language = (row["language"] or "").strip()
            if language:
                language_counts[language] += 1
    return language_counts

def find_second_most_popular_language(language_counts):
    """Pick the runner-up language from a ``Counter`` of language counts.

    Returns "Unknown" when the counter holds fewer than two languages.
    """
    # Dropping the first of the top-two entries leaves either the runner-up or nothing.
    after_winner = language_counts.most_common(2)[1:]
    return after_winner[0][0] if after_winner else "Unknown"

# Main execution: count the languages of repositories created after 2020 in
# repositories.csv (working directory) and print the second entry of the ranking.
# Note: this rebinds `language_counts` and `second_most_popular_language`,
# which the previous cell also assigns.
language_counts = count_languages_in_recent_repositories("repositories.csv")
second_most_popular_language = find_second_most_popular_language(language_counts)

print("The second most popular programming language among repositories created after 2020 is:", second_most_popular_language)


The second most popular programming language among repositories created after 2020 is: TypeScript


In [None]:
import csv
from collections import Counter

def analyze_projects_and_wikis(file_path):
    """Tally repositories by which of projects / wiki they have enabled.

    A flag counts as enabled when its CSV cell, trimmed and lower-cased,
    equals "true".

    Args:
        file_path: Path to a UTF-8 CSV with ``has_projects`` and ``has_wiki`` columns.

    Returns:
        A dict of plain counts (not correlation values) with the keys
        ``both``, ``only_projects``, ``only_wiki``, ``neither`` and ``total_repos``.
    """
    # Keyed by (projects enabled, wiki enabled).
    tally = {
        (True, True): 0,
        (True, False): 0,
        (False, True): 0,
        (False, False): 0,
    }

    with open(file_path, mode='r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            projects_on = record["has_projects"].strip().lower() == "true"
            wiki_on = record["has_wiki"].strip().lower() == "true"
            tally[projects_on, wiki_on] += 1

    return {
        "both": tally[True, True],
        "only_projects": tally[True, False],
        "only_wiki": tally[False, True],
        "neither": tally[False, False],
        "total_repos": sum(tally.values()),
    }

# Main execution: tally repositories.csv (working directory) by projects/wiki
# flags and print each count. Despite the heading printed below, the values
# are raw counts returned by analyze_projects_and_wikis, not a correlation
# coefficient.
correlation_results = analyze_projects_and_wikis("repositories.csv")

print("Correlation Analysis of Projects and Wikis:")
print(f"Repositories with both projects and wikis: {correlation_results['both']}")
print(f"Repositories with only projects enabled: {correlation_results['only_projects']}")
print(f"Repositories with only wikis enabled: {correlation_results['only_wiki']}")
print(f"Repositories with neither enabled: {correlation_results['neither']}")
print(f"Total repositories analyzed: {correlation_results['total_repos']}")


Correlation Analysis of Projects and Wikis:
Repositories with both projects and wikis: 56192
Repositories with only projects enabled: 7907
Repositories with only wikis enabled: 129
Repositories with neither enabled: 2278
Total repositories analyzed: 66506


In [None]:
import csv
import numpy as np

def load_boolean_data(file_path):
    """Read the projects/wiki flags of every repository as 0/1 integers.

    A cell is 1 when, trimmed and lower-cased, it equals "true"; otherwise 0.

    Args:
        file_path: Path to a UTF-8 CSV with ``has_projects`` and ``has_wiki`` columns.

    Returns:
        A tuple ``(has_projects, has_wiki)`` of two equally long lists of ints,
        in file order.
    """
    def _as_bit(cell):
        return 1 if cell.strip().lower() == "true" else 0

    with open(file_path, mode='r', encoding='utf-8') as handle:
        pairs = [
            (_as_bit(record["has_projects"]), _as_bit(record["has_wiki"]))
            for record in csv.DictReader(handle)
        ]

    # Split the row-wise pairs back into two column lists.
    has_projects = [projects_bit for projects_bit, _ in pairs]
    has_wiki = [wiki_bit for _, wiki_bit in pairs]
    return has_projects, has_wiki

def calculate_correlation(has_projects, has_wiki):
    """Return the Pearson correlation coefficient of two equally long sequences.

    Args:
        has_projects: Sequence of numeric values (0/1 flags in this notebook).
        has_wiki: Sequence of numeric values of the same length.

    Returns:
        The off-diagonal entry of ``np.corrcoef``'s 2x2 matrix, or ``None``
        when either input is empty. The empty-input guard matches the
        same-named helper used for the bio/followers analysis and avoids
        NumPy warnings plus a NaN result on an empty CSV.
    """
    if len(has_projects) == 0 or len(has_wiki) == 0:
        return None
    correlation = np.corrcoef(has_projects, has_wiki)[0, 1]  # Get the correlation coefficient
    return correlation

# Main execution: load the 0/1 projects and wiki flags from repositories.csv
# (working directory) and print their correlation coefficient.
# Note: this rebinds the notebook-global name `correlation_coefficient`,
# which a later cell also assigns.
has_projects, has_wiki = load_boolean_data("repositories.csv")
correlation_coefficient = calculate_correlation(has_projects, has_wiki)

print("Correlation coefficient between has_projects and has_wiki:", correlation_coefficient)


Correlation coefficient between has_projects and has_wiki: 0.42684906204332856


In [None]:
import csv
from datetime import datetime, timedelta
from collections import Counter

def load_repository_data(file_path):
    """Count, per user, the repositories whose creation date is a weekend.

    The ``created_at`` timestamp is parsed as written (format
    ``%Y-%m-%dT%H:%M:%SZ``) with no timezone conversion; Saturday and Sunday
    are ``weekday()`` values 5 and 6.

    Args:
        file_path: Path to a UTF-8 CSV with ``login`` and ``created_at`` columns.

    Returns:
        A ``Counter`` mapping login to its number of weekend-created
        repositories; users without any are absent.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return Counter(
            record["login"]
            for record in csv.DictReader(handle)
            if datetime.strptime(record["created_at"], timestamp_format).weekday() >= 5
        )

def get_top_users(user_repo_count, top_n=5):
    """Return the logins with the highest counts, best first.

    Args:
        user_repo_count: A ``Counter`` mapping login to a count.
        top_n: Maximum number of logins to return.

    Returns:
        A list of at most ``top_n`` logins in ``most_common`` order.
    """
    return [login for login, _count in user_repo_count.most_common(top_n)]

# Main execution: count weekend-created repositories per user from
# repositories.csv (working directory) and print the five logins with the
# highest counts (get_top_users is called with its default top_n=5).
user_repo_count = load_repository_data("repositories.csv")
top_users = get_top_users(user_repo_count)

# The label says UTC; the loader applies no timezone conversion to the
# 'Z'-suffixed timestamps it parses.
print("Top 5 users who created the most repositories on weekends (UTC):")
print(", ".join(top_users))


Top 5 users who created the most repositories on weekends (UTC):
azu, suzuki-shunsuke, yuiseki, xuwei-k, zchee


In [None]:
import csv
import numpy as np

def load_bio_and_followers(file_path):
    """Read bio word counts and follower counts for users that have a bio.

    Users whose bio is empty after trimming whitespace are skipped entirely.

    Args:
        file_path: Path to a UTF-8 CSV with ``bio`` and ``followers`` columns.

    Returns:
        A tuple ``(bio_lengths, followers_counts)`` of two parallel lists:
        the number of whitespace-separated words in each bio, and the
        follower count as ``int``.
    """
    bio_lengths, followers_counts = [], []

    with open(file_path, mode='r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            bio_text = record["bio"].strip()
            if not bio_text:
                continue  # no bio, nothing to measure
            bio_lengths.append(len(bio_text.split()))
            followers_counts.append(int(record["followers"]))

    return bio_lengths, followers_counts

def calculate_correlation(bio_lengths, followers_counts):
    """Return the Pearson correlation of bio lengths and follower counts.

    Args:
        bio_lengths: Sequence of numbers.
        followers_counts: Sequence of numbers of the same length.

    Returns:
        The off-diagonal entry of ``np.corrcoef``'s matrix, or ``None`` when
        either sequence is empty.
    """
    if not (len(bio_lengths) and len(followers_counts)):
        return None
    matrix = np.corrcoef(bio_lengths, followers_counts)
    return matrix[0, 1]

# Main execution: load bio word counts and follower counts from users.csv
# (working directory) and print their correlation coefficient.
# calculate_correlation returns None when no user has a bio, in which case
# "None" is printed.
bio_lengths, followers_counts = load_bio_and_followers("users.csv")
correlation_coefficient = calculate_correlation(bio_lengths, followers_counts)

print("Correlation coefficient between the length of bios and number of followers:", correlation_coefficient)


Correlation coefficient between the length of bios and number of followers: 0.10938474824006604


In [None]:
import csv

def analyze_hireable_emails(file_path):
    """Compare how often hireable and non-hireable users publish an email.

    A user is hireable when the ``hireable`` cell, trimmed and lower-cased,
    equals "true"; every other value counts as not hireable. A user has an
    email when the ``email`` cell is non-empty after trimming.

    Args:
        file_path: Path to a UTF-8 CSV with ``email`` and ``hireable`` columns.

    Returns:
        (share of hireable users with an email) minus (share of non-hireable
        users with an email), rounded to three decimals. A group with no
        members contributes a share of 0.
    """
    # Both tallies are keyed by the hireable flag.
    group_sizes = {True: 0, False: 0}
    with_email = {True: 0, False: 0}

    with open(file_path, mode='r', encoding='utf-8') as handle:
        for record in csv.DictReader(handle):
            has_email = bool(record["email"].strip())
            is_hireable = record["hireable"].strip().lower() == "true"

            group_sizes[is_hireable] += 1
            if has_email:
                with_email[is_hireable] += 1

    def _share(group):
        size = group_sizes[group]
        return with_email[group] / size if size > 0 else 0

    return round(_share(True) - _share(False), 3)

# Main execution: compute, from users.csv in the working directory, the
# difference between the email-sharing fractions of hireable and
# non-hireable users (already rounded to 3 decimals by the function) and print it.
email_difference = analyze_hireable_emails("users.csv")

print("Difference in email sharing between hireable and not hireable users:", email_difference)


Difference in email sharing between hireable and not hireable users: 0.13


In [None]:
# Load the users CSV into a pandas DataFrame from an absolute path.
# NOTE(review): `data` is not referenced by any cell visible here, and the name
# was used for JSON responses in the scraping cell - confirm this cell is still
# needed. It also relies on `pd` having been imported by an earlier cell.
data=pd.read_csv('/content/users.csv')

In [None]:
import pandas as pd
import statsmodels.api as sm

# Load the CSV file
csv_file = '/content/users.csv'  # Ensure this path is correct

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Check the first few rows and the data types of the DataFrame
print("DataFrame Overview:")
print(df.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
df.info()

# Filter out users without bios. The .copy() makes the filtered frame
# independent of the original, so adding a column below neither triggers
# SettingWithCopyWarning nor risks writing through to a view.
df = df[df['bio'].notnull()].copy()

# Calculate the length of each bio in words (whitespace-separated tokens)
df['bio_word_count'] = df['bio'].str.split().str.len()

# Prepare the independent variable (X) and dependent variable (y)
X = df['bio_word_count']
y = df['followers']  # Adjust the column name as per your dataset

# Add a constant to the independent variable (for the intercept)
X = sm.add_constant(X)

# Fit the regression model (ordinary least squares: followers ~ const + bio_word_count)
model = sm.OLS(y, X).fit()

# Get the slope (coefficient of the bio_word_count)
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")

DataFrame Overview:
       login               name       company         location  \
0   tiangolo  Sebastián Ramírez           NaN  Berlin, Germany   
1    schacon       Scott Chacon  GITBUTLERAPP  Berlin, Germany   
2   rwieruch      Robin Wieruch           NaN    Berlin/Remote   
3    shuding           Shu Ding        VERCEL           Berlin   
4  android10     Fernando Cejas      PEPPR-IO  Berlin, Germany   

                         email hireable  \
0           tiangolo@gmail.com     True   
1            schacon@gmail.com      NaN   
2                          NaN     True   
3                    g@shud.in      NaN   
4  android10@fernandocejas.com     True   

                                                 bio  public_repos  followers  \
0  Creator of FastAPI, Typer, SQLModel, Asyncer, ...            73      26445   
1                                                NaN           215      13757   
2  React & Next.js • JavaScript & TypeScript • Fr...           151       8618   
