In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# User table: read it from disk, then parse the account-creation timestamps
users_df = pd.read_csv('users.csv')
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Repository table: same treatment for the repository-creation timestamps
repos_df = pd.read_csv('repositories.csv')
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Question 1: Top 5 users with the highest followers in Melbourne
# Sort descending by follower count and keep the first five logins.
users_by_followers = users_df.sort_values(by='followers', ascending=False)
top_followers_users = users_by_followers['login'].head(5).tolist()
print("1. Top 5 users with highest followers:", ", ".join(top_followers_users))

# Question 2: 5 earliest registered GitHub users in Melbourne
# Ascending sort on the parsed creation date puts the oldest accounts first.
users_by_signup = users_df.sort_values(by='created_at')
earliest_users = users_by_signup['login'].head(5).tolist()
print("2. 5 earliest registered users:", ", ".join(earliest_users))

# Question 3: 3 most popular licenses among these users
# Repositories without a license are dropped before counting.
license_counts = repos_df['license_name'].dropna().value_counts()
popular_licenses = license_counts.index[:3].tolist()
print("3. Most popular licenses:", ", ".join(popular_licenses))

# Question 4: Company with the majority of developers
# mode() yields every value tied for most frequent; the first one is reported.
most_common_company = users_df['company'].dropna().mode().values
if most_common_company.size > 0:
    company_answer = most_common_company[0]
else:
    company_answer = "No majority company"
print("4. Most common company:", company_answer)

# Question 5: Most popular programming language
# Same approach as above, applied to the repository language column.
popular_language = repos_df['language'].dropna().mode().values
if popular_language.size > 0:
    language_answer = popular_language[0]
else:
    language_answer = "No popular language"
print("5. Most popular programming language:", language_answer)

# Question 6: Second most popular language among users who joined after 2020
# The question is about *users* who joined after the cutoff, so select those
# accounts first and then look at all of their repositories (filtering on the
# repository creation date would answer a different question).
# The cutoff carries the same timezone as the parsed column: pandas raises
# TypeError when timezone-aware timestamps are compared with a naive datetime.
join_cutoff = pd.Timestamp('2020-01-01', tz=users_df['created_at'].dt.tz)
post_2020_logins = users_df.loc[users_df['created_at'] > join_cutoff, 'login']
post_2020_repos = repos_df[repos_df['login'].isin(post_2020_logins)]
# Rank languages by repository count; the second entry (if any) is the answer.
second_popular_language = post_2020_repos['language'].dropna().value_counts().nlargest(2).index.tolist()
print("6. Second most popular language for post-2020 users:", second_popular_language[1] if len(second_popular_language) > 1 else "Not available")

# Question 7: Language with the highest average stars per repository
# Average the star counts within each language, then take the label of the maximum.
mean_stars_by_language = repos_df.groupby('language')['stargazers_count'].mean()
avg_stars_per_language = mean_stars_by_language.idxmax()
print("7. Language with highest avg stars per repo:", avg_stars_per_language)

# Question 8: Top 5 users by leader_strength (followers / (1 + following))
# The +1 in the denominator keeps the ratio defined for users who follow nobody.
users_df['leader_strength'] = users_df['followers'].div(users_df['following'] + 1)
users_by_strength = users_df.sort_values(by='leader_strength', ascending=False)
top_leader_strength_users = users_by_strength['login'].head(5).tolist()
print("8. Top 5 by leader_strength:", ", ".join(top_leader_strength_users))

# Question 9: Correlation between followers and public repositories
# Read the off-diagonal entry of the 2x2 correlation matrix by label.
followers_repos_matrix = users_df[['followers', 'public_repos']].corr()
followers_repos_corr = followers_repos_matrix.loc['followers', 'public_repos']
print(f"9. Correlation between followers and repos: {followers_repos_corr:.3f}")

# Question 10: Regression slope of followers on public repositories
# A degree-1 polynomial fit returns (slope, intercept); only the slope is reported.
slope, _ = np.polyfit(x=users_df['public_repos'], y=users_df['followers'], deg=1)
print(f"10. Regression slope of followers on repos: {slope:.3f}")

# Question 11: Correlation between projects enabled and wiki enabled
# Same label-based lookup as Question 9, on the two repository feature flags.
projects_wiki_matrix = repos_df[['has_projects', 'has_wiki']].corr()
projects_wiki_corr = projects_wiki_matrix.loc['has_projects', 'has_wiki']
print(f"11. Correlation between projects and wiki enabled: {projects_wiki_corr:.3f}")

# Question 12: Do hireable users follow more people?
# Rows whose hireable value is anything other than True (including missing
# values) form the comparison group.
is_hireable = users_df['hireable'] == True
mean_following_hireable = users_df.loc[is_hireable, 'following'].mean()
mean_following_others = users_df.loc[~is_hireable, 'following'].mean()
hireable_following_diff = mean_following_hireable - mean_following_others
print(f"12. Average difference in following for hireable users: {hireable_following_diff:.3f}")

# Question 13: Regression slope of followers on bio word count
# A missing bio counts as zero words; words are separated by whitespace.
bio_words = users_df['bio'].fillna('').str.split()
users_df['bio_word_count'] = bio_words.str.len()
bio_slope, _ = np.polyfit(x=users_df['bio_word_count'], y=users_df['followers'], deg=1)
print(f"13. Regression slope of followers on bio word count: {bio_slope:.3f}")

# Question 14: Top 5 users by number of repositories created on weekends
# dayofweek numbers Monday as 0, so Saturday and Sunday are 5 and 6.
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek.isin([5, 6])
weekend_repos = repos_df[repos_df['is_weekend']]
weekend_repos_per_user = weekend_repos.groupby('login').size()
weekend_repos_count = weekend_repos_per_user.nlargest(5).index.tolist()
print("14. Top 5 users by weekend repos:", ", ".join(weekend_repos_count))

# Question 15: Do hireable users share email more often?
# Compare the fraction of non-missing emails between the same two groups as in Question 12.
hireable_with_email = users_df.loc[is_hireable, 'email'].notna().mean()
non_hireable_with_email = users_df.loc[~is_hireable, 'email'].notna().mean()
email_diff = hireable_with_email - non_hireable_with_email
print(f"15. Difference in email sharing between hireable users: {email_diff:.3f}")

# Question 16: Most common surname (last word in name)
# Users without a name are skipped; the surname is the last whitespace-separated token.
name_tokens = users_df['name'].dropna().str.split()
users_df['surname'] = name_tokens.str.get(-1)

# Count each surname and keep every one that reaches the highest count (ties included).
most_common_surnames = users_df['surname'].value_counts()
max_surname_count = most_common_surnames.max()
common_surnames = [
    surname
    for surname, occurrences in most_common_surnames.items()
    if occurrences == max_surname_count
]
print("16. Most common surname(s):", ", ".join(sorted(common_surnames)))


In [None]:
import os
import time

import pandas as pd
import requests

# GitHub API request headers.
# The personal access token is read from the GITHUB_TOKEN environment variable
# so that no credential is stored in the notebook. When the variable is unset
# or blank, no Authorization header is sent and requests go out unauthenticated.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '').strip()
headers = {'Accept': 'application/vnd.github.v3+json'}
if GITHUB_TOKEN:
    headers['Authorization'] = f'token {GITHUB_TOKEN}'

def get_users():
    """Collect profile records for the users matched by the search query.

    Pages through the GitHub user-search endpoint (100 hits per page, at most
    10 pages) and resolves every hit through ``get_user_details``.

    Returns:
        list[dict]: one record per user for which ``get_user_details`` returned
        a non-empty result. If a page request fails, the records gathered so
        far are returned and a message is printed.
    """
    users_data = []
    url = "https://api.github.com/search/users"
    params = {
        'q': 'location:Melbourne followers:>100',
        'per_page': 100,
        'page': 1
    }

    while params['page'] <= 10:  # Get first 1000 users (GitHub search API limitation)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"User search failed on page {params['page']}: {exc}")
            break
        if response.status_code != 200:
            print(f"User search stopped on page {params['page']}: HTTP {response.status_code}")
            break
        users = response.json().get('items', [])
        if not users:
            # An empty page means the results are exhausted; requesting the
            # remaining pages would only spend requests and sleep time.
            break
        for user in users:
            user_detail = get_user_details(user['login'])
            if user_detail:
                users_data.append(user_detail)
        params['page'] += 1
        time.sleep(2)  # Pause to avoid rate limiting
    return users_data

def get_user_details(username):
    """Fetch one user's profile and reduce it to the fields stored in users.csv.

    Args:
        username: GitHub login to look up.

    Returns:
        dict | None: a record with the keys login, name, company, location,
        email, hireable, bio, public_repos, followers, following and
        created_at, or None when the request fails or does not return HTTP 200.
    """
    url = f"https://api.github.com/users/{username}"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Same "no result" signal as a non-200 response, so one network
        # failure does not abort the caller's loop over all users.
        print(f"Could not fetch details for {username}: {exc}")
        return None
    if response.status_code != 200:
        return None

    user = response.json()
    # Clean and format company name: trim, drop leading '@', trim again so
    # that values such as "@ Acme" do not keep a leading space, then upper-case.
    company = user.get('company', '')
    if company:
        company = company.strip().lstrip('@').strip().upper()
    return {
        'login': user['login'],
        'name': user.get('name', ''),
        'company': company,
        'location': user.get('location', ''),
        'email': user.get('email', ''),
        'hireable': user.get('hireable', ''),
        'bio': user.get('bio', ''),
        'public_repos': user.get('public_repos', 0),
        'followers': user.get('followers', 0),
        'following': user.get('following', 0),
        'created_at': user.get('created_at', '')
    }

def get_user_repositories(username):
    """Fetch up to 500 repositories of a user, reduced to the stored fields.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        list[dict]: one record per repository with the keys login, full_name,
        created_at, stargazers_count, watchers_count, language, has_projects,
        has_wiki and license_name (the license key, or '' when the repository
        has no license). If a page request fails, the records gathered so far
        are returned.
    """
    repos_data = []
    url = f"https://api.github.com/users/{username}/repos"
    params = {'per_page': 100, 'page': 1}

    while params['page'] <= 5:  # Up to 500 repositories
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Could not fetch repositories for {username}: {exc}")
            break
        if response.status_code != 200:
            break
        repos = response.json()
        for repo in repos:
            repos_data.append({
                'login': username,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else ''
            })
        if len(repos) < params['per_page']:
            # A page that is not full is the last one; skipping the remaining
            # requests avoids spending API quota on empty pages for every user.
            break
        params['page'] += 1
    return repos_data

# Download the matching users, then gather every repository record of each user
users_data = get_users()
repos_data = [
    repo_record
    for user_record in users_data
    for repo_record in get_user_repositories(user_record['login'])
]

# Write both tables to CSV, leaving out the pandas row index
users_table = pd.DataFrame(users_data)
users_table.to_csv('users.csv', index=False)
repos_table = pd.DataFrame(repos_data)
repos_table.to_csv('repositories.csv', index=False)

# README.md content: three bullet points, preceded and followed by a newline
readme_bullets = [
    "- This project collects GitHub user and repository data for developers in Melbourne with over 100 followers.",
    "- Surprisingly, many top users are self-taught developers with diverse backgrounds, showing Melbourne's vibrant tech community.",
    "- Recommendation: Developers should focus on collaborative projects as they tend to gain more followers and engagement.",
]
readme_content = "\n" + "\n".join(readme_bullets) + "\n"

# Overwrite README.md in the working directory with that text
with open("README.md", "w") as readme_file:
    readme_file.write(readme_content)
