**CODE**

In [28]:
import requests
import pandas as pd
import time  # To handle rate limiting

# GitHub Personal Access Token (replace with your actual token)
TOKEN = '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'
HEADERS = {"Authorization": f"token {TOKEN}"}

def get_all_users_in_basel():
    users = []
    url = 'https://api.github.com/search/users'
    params = {"q": "location:Basel followers:>10", "per_page": 100}
    page = 1

    while True:
        params["page"] = page
        response = requests.get(url, headers=HEADERS, params=params)
        data = response.json()

        if "items" not in data:
            print("Rate limit exceeded or no results found. Try again later.")
            break

        for user in data["items"]:
            user_details = requests.get(user["url"], headers=HEADERS).json()
            users.append({
                "login": user_details.get("login", ""),
                "name": user_details.get("name", ""),
                "company": clean_company_name(user_details.get("company", "")),
                "location": user_details.get("location", ""),
                "email": user_details.get("email", ""),
                "hireable": user_details.get("hireable", ""),
                "bio": user_details.get("bio", ""),
                "public_repos": user_details.get("public_repos", 0),
                "followers": user_details.get("followers", 0),
                "following": user_details.get("following", 0),
                "created_at": user_details.get("created_at", "")
            })

        if "next" not in response.links:
            break  # Exit the loop when there are no more pages
        page += 1
        time.sleep(1)  # Optional delay to avoid rate limits

    return users

def clean_company_name(company):
    if company:
        company = company.strip().upper()
        if company.startswith("@"):
            company = company[1:]
    return company

def main():
    # Fetch all users
    users_data = get_all_users_in_basel()
    users_df = pd.DataFrame(users_data)
    users_df.to_csv("users.csv", index=False)

    # Fetch repositories for each user
    all_repositories = []
    for user in users_data:
        all_repositories.extend(get_user_repositories(user["login"]))
    repos_df = pd.DataFrame(all_repositories)
    repos_df.to_csv("repositories.csv", index=False)

def get_user_repositories(user_login):
    repositories = []
    url = f"https://api.github.com/users/{user_login}/repos"
    params = {"per_page": 100, "sort": "pushed"}
    while url:
        response = requests.get(url, headers=HEADERS, params=params)
        data = response.json()

        for repo in data:
            license_name = repo.get("license", {}).get("key") if repo.get("license") else ""
            repositories.append({
                "login": user_login,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_name
            })

        if "next" not in response.links:
            break  # Exit loop when there are no more pages
        url = response.links["next"]["url"]
        time.sleep(2)  # Optional delay to avoid rate limits

    return repositories

if __name__ == "__main__":
    main()

**Q1**


In [29]:
import pandas as pd

users_df = pd.read_csv('users.csv')
top_5_followers = users_df.sort_values(by=['followers'], ascending=False).head(5)['login'].tolist()
print(','.join(top_5_followers))

tarsius,aalmiray,marcoroth,klmr,MrNeRF


**Q2**

In [30]:
earliest_5_users = users_df.sort_values(by=['created_at']).head(5)['login'].tolist()
print(','.join(earliest_5_users))

bennyzen,aalmiray,pvillega,tarsius,amaunz


**Q3**

In [31]:
import pandas as pd

repos_df = pd.read_csv('repositories.csv')
top_3_licenses = repos_df['license_name'].value_counts().head(3).index.tolist()
print(','.join(top_3_licenses))

mit,apache-2.0,other


**Q4**

In [46]:
company_majority = users_df['company'].mode()[0]
print(company_majority)

ADOBE


**Q5**

In [33]:
language_popular = repos_df['language'].mode()[0]
print(language_popular)

JavaScript


**Q6**

In [34]:
filtered_repos = repos_df[repos_df['login'].isin(users_df[pd.to_datetime(users_df['created_at']) > '2020-01-01']['login'])]
second_popular_language = filtered_repos['language'].value_counts().index[1]
print(second_popular_language)

HTML


**Q7**

In [35]:
average_stars = repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)
highest_avg_stars_language = average_stars.index[0]
print(highest_avg_stars_language)

PureScript


**Q8**

In [36]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.sort_values(by=['leader_strength'], ascending=False).head(5)['login'].tolist()
print(','.join(top_5_leader_strength))

dpryan79,wasserth,ravage84,elanmart,quadbiolab


**Q9**

In [37]:
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f'{correlation:.3f}')

0.345


**Q10**

In [38]:
from scipy import stats

# Calculate the regression slope
slope, intercept, r_value, p_value, std_err = stats.linregress(
    users_df['public_repos'], users_df['followers']
)

# Print the slope to 3 decimal places
print(f'{slope:.3f}')

0.674


**Q11**

In [49]:
# Calculate the correlation
correlation_projects_wiki = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation to 3 decimal places
print(f'{correlation_projects_wiki:.3f}')

0.260


**Q12**

In [48]:
# Calculate the average following for hireable and non-hireable users
hireable_following_mean = users_df[users_df['hireable'] == True]['following'].mean()
non_hireable_following_mean = users_df[users_df['hireable'] != True]['following'].mean()


# Calculate the difference
difference = hireable_following_mean - non_hireable_following_mean

# Print the difference to 3 decimal places
print(f'{difference:.3f}')

45.914


**Q13**

In [42]:
from scipy import stats

users_df['bio_word_count'] = users_df['bio'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)
filtered_users = users_df[users_df['bio_word_count'] > 0]  # Ignore users without bios
slope, _, _, _, _ = stats.linregress(filtered_users['bio_word_count'], filtered_users['followers'])
print(f'{slope:.3f}')

2.403


**Q14**

In [43]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['dayofweek'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['dayofweek'].isin([5, 6])]  # Saturday (5) and Sunday (6)
top_5_weekend_users = weekend_repos['login'].value_counts().head(5).index.tolist()
print(','.join(top_5_weekend_users))

dpryan79,syzer,ioolkos,maysam,pvillega


**Q15**

In [50]:
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notna().mean()
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notna().mean()
email_diff = hireable_with_email - non_hireable_with_email
print("Difference in email sharing (hireable vs non-hireable):", round(email_diff, 3))

Difference in email sharing (hireable vs non-hireable): nan


**Q16**

In [45]:
users_df['surname'] = users_df['name'].str.strip().str.split().str[-1]
surname_counts = users_df['surname'].value_counts()
max_count = surname_counts.max()
most_common_surnames = ','.join(sorted(surname_counts[surname_counts == max_count].index.tolist()))
print(most_common_surnames)

Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
