#### Scraping the data

In [None]:
import requests
import pandas as pd
import time

def make_github_request(url, headers, params=None, timeout=30):
    """Send a GET request to the GitHub API, waiting out rate limits.

    Args:
        url: Full endpoint URL to request.
        headers: HTTP headers (including authorization) sent with the request.
        params: Optional query-string parameters.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot hang the scrape indefinitely.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` on a network failure, an
        undecodable body, or any error status that is not a rate limit.
    """
    fallback_wait = 60  # used when GitHub signals a rate limit without timing headers

    while True:
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request error: {str(e)}")
            return None

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:  # body was not valid JSON
                print(f"Request error: {str(e)}")
                return None

        if response.status_code in (403, 429):
            # A 403 is only retried when the response actually looks like a
            # rate limit; a plain "forbidden" would otherwise loop forever.
            wait_time = None
            retry_after = response.headers.get('Retry-After')
            reset_at = response.headers.get('X-RateLimit-Reset')
            try:
                if retry_after is not None:
                    wait_time = max(float(retry_after), 0)
                elif response.headers.get('X-RateLimit-Remaining') == '0':
                    if reset_at is not None:
                        wait_time = max(int(reset_at) - time.time(), 0)
                    else:
                        wait_time = fallback_wait
            except ValueError:
                # Malformed header value: fall back to a fixed pause.
                wait_time = fallback_wait

            if wait_time is None and 'rate limit' in response.text.lower():
                wait_time = fallback_wait

            if wait_time is not None:
                print(f"Rate limit exceeded. Waiting {wait_time:.0f} seconds...")
                time.sleep(wait_time + 1)
                continue

        print(f"Error {response.status_code}: {response.text}")
        return None

def clean_company_name(company):
    """Normalize a company string for grouping.

    Missing values (``None`` / NaN) become an empty string. Otherwise the
    value is stringified, surrounding whitespace is trimmed, a single
    leading ``@`` is dropped, and the result is upper-cased.
    """
    if company is None or pd.isna(company):
        return ""

    trimmed = str(company).strip()
    without_at = trimmed[1:] if trimmed.startswith('@') else trimmed
    return without_at.upper()

def safe_get(data, *keys, default=''):
    """Look up a value through nested dictionaries without raising.

    Args:
        data: The outermost mapping.
        *keys: Keys to follow, outermost first.
        default: Value returned when the path cannot be resolved.

    Returns:
        The value at the end of the key path. ``default`` is returned when a
        key is missing, when a value along the path is ``None``, or when a
        value along the path has no ``.get`` (e.g. a string or number where
        a dictionary was expected).
    """
    for key in keys:
        try:
            data = data.get(key)
        except AttributeError:
            # Current level is not dict-like, so the path cannot continue.
            return default
        if data is None:
            return default
    return data

def _user_row(user_data):
    """Flatten one GitHub user payload into a row for users.csv."""
    return {
        'login': safe_get(user_data, 'login'),
        'name': safe_get(user_data, 'name'),
        'company': clean_company_name(safe_get(user_data, 'company')),
        'location': safe_get(user_data, 'location'),
        'email': safe_get(user_data, 'email'),
        'hireable': str(safe_get(user_data, 'hireable')).lower(),
        'bio': safe_get(user_data, 'bio'),
        'public_repos': safe_get(user_data, 'public_repos', default=0),
        'followers': safe_get(user_data, 'followers', default=0),
        'following': safe_get(user_data, 'following', default=0),
        'created_at': safe_get(user_data, 'created_at')
    }


def _repo_row(login, repo):
    """Flatten one GitHub repository payload into a row for repositories.csv."""
    return {
        'login': login,
        'full_name': safe_get(repo, 'full_name'),
        'created_at': safe_get(repo, 'created_at'),
        'stargazers_count': safe_get(repo, 'stargazers_count', default=0),
        'watchers_count': safe_get(repo, 'watchers_count', default=0),
        'language': safe_get(repo, 'language'),
        'has_projects': str(safe_get(repo, 'has_projects')).lower(),
        'has_wiki': str(safe_get(repo, 'has_wiki')).lower(),
        # The column is called license_name but stores the license *key*.
        'license_name': safe_get(repo, 'license', 'key', default='')
    }


def _iter_user_repos(login, headers, per_page=100):
    """Yield the repositories of *login*, most recently pushed first."""
    repos_url = f"https://api.github.com/users/{login}/repos"
    repo_page = 1
    while True:
        repo_params = {
            'sort': 'pushed',
            'direction': 'desc',
            'per_page': per_page,
            'page': repo_page
        }
        repos = make_github_request(repos_url, headers, repo_params)
        if not repos:
            break
        yield from repos
        if len(repos) < per_page:
            # A short page is the last one; skip the extra empty request.
            break
        repo_page += 1
    print(f"No more repositories for {login}.")


def get_github_data(output_dir='/content'):
    """Scrape Hyderabad users with more than 50 followers and their repos.

    Prompts for a GitHub token, pages through the user search, fetches each
    user's profile and repositories, and writes ``users.csv`` and
    ``repositories.csv`` into ``output_dir``.

    Args:
        output_dir: Directory that receives the two CSV files. It is created
            if it does not exist.

    Returns:
        ``(users_df, repos_df)`` on success. ``(None, None)`` when no token
        is supplied or when no users or no repositories were collected.
    """
    # Local imports keep this block self-contained.
    from getpass import getpass
    from pathlib import Path

    # getpass avoids echoing the secret into the terminal / notebook output.
    token = getpass("Please enter your GitHub token: ").strip()
    if not token:
        print("GitHub token is required.")
        return None, None

    headers = {
        'Authorization': f'token {token}',
        'Accept': 'application/vnd.github.v3+json'
    }

    users_data = []
    repos_data = []
    search_url = "https://api.github.com/search/users"
    per_page = 100
    # GitHub's search API serves at most 1,000 results per query; asking for
    # a page beyond that only produces an error response.
    max_search_results = 1000
    params = {
        'q': 'location:Hyderabad followers:>50',
        'per_page': per_page
    }

    page = 1  # Start pagination for user search

    while True:
        params['page'] = page
        search_results = make_github_request(search_url, headers, params)

        if not search_results:
            print("No more results or error encountered in user search.")
            break

        items = search_results.get('items', [])
        if not items:
            break  # No more items to paginate

        for user in items:
            # Best effort per user: one bad profile must not abort the scrape.
            try:
                print(f"\n👤 Fetching data for user: {user.get('login', 'unknown')}")

                # Get user details
                user_data = make_github_request(user['url'], headers)
                if not user_data:
                    print(f"⚠️ Could not fetch details for user {user.get('login', 'unknown')}. Skipping.")
                    continue

                users_data.append(_user_row(user_data))

                # Rows are appended one by one so that repositories fetched
                # before a mid-user failure are still kept.
                row_login = safe_get(user_data, 'login')
                for repo in _iter_user_repos(user['login'], headers, per_page):
                    repos_data.append(_repo_row(row_login, repo))

            except Exception as e:
                print(f"⚠️ Error processing user {user.get('login', 'unknown')}: {str(e)}")

        if len(items) < per_page or page * per_page >= max_search_results:
            break  # Last page reached; avoid a pointless extra request

        # Proceed to the next page for users
        page += 1

    if not users_data:
        print("❌ No user data collected.")
        return None, None

    if not repos_data:
        print("❌ No repository data collected.")
        return None, None

    print(f"\n💾 Creating CSV files for {len(users_data)} users and {len(repos_data)} repositories...")

    # Create DataFrames and save to CSV
    users_df = pd.DataFrame(users_data)
    repos_df = pd.DataFrame(repos_data)
    out_dir = Path(output_dir)
    # Create the directory so the scrape is not lost when it is missing.
    out_dir.mkdir(parents=True, exist_ok=True)
    users_df.to_csv(out_dir / 'users.csv', index=False)
    repos_df.to_csv(out_dir / 'repositories.csv', index=False)

    return users_df, repos_df

def _top_counts(series, limit=5):
    """Return the *limit* most frequent non-blank values of *series* with counts."""
    values = series.dropna().astype(str).str.strip()
    return values[values != ''].value_counts().head(limit)


def _count_bullets(counts):
    """Render a value-count Series as markdown bullet lines."""
    if counts.empty:
        return ["- No data available"]
    return [f"- {label}: {count}" for label, count in counts.items()]


def create_readme(users_df, repos_df, output_path='/content/README.md'):
    """Write a markdown summary of the scraped data.

    The README contains the user and repository totals, the average follower
    count, the five most common repository languages and the five most common
    companies. Blank and missing values are excluded from both rankings.

    Args:
        users_df: Users table; must provide ``followers`` and ``company``.
        repos_df: Repositories table; must provide ``language``.
        output_path: Destination file for the README.

    If building the summary fails, the error is printed and a short
    placeholder README is written instead.
    """
    print("📝 Generating README.md...")

    try:
        total_users = len(users_df)
        total_repos = len(repos_df)
        avg_followers = users_df['followers'].mean()
        top_languages = _top_counts(repos_df['language'])
        companies = _top_counts(users_df['company'])

        lines = [
            "# GitHub Users Analysis - Hyderabad",
            "## Analysis Details",
            "",
            "### User Statistics",
            f"- Total Users Analyzed: {total_users}",
            f"- Total Repositories: {total_repos}",
            f"- Average Followers per User: {avg_followers:.1f}",
            "",
            "### Top Programming Languages",
            *_count_bullets(top_languages),
            "",
            "### Most Common Companies",
            *_count_bullets(companies),
        ]
        readme_content = "\n".join(lines) + "\n"

        # Explicit UTF-8: names may be non-ASCII and the platform default varies.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(readme_content)

    except Exception as e:
        print(f"⚠️ Error generating README: {str(e)}")
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("# GitHub Users Analysis\n\nError occurred during analysis.")

def main():
    """Run the scrape and, if it produced data, generate the README."""
    print("🚀 Starting GitHub Analysis...")
    users_df, repos_df = get_github_data()

    # get_github_data signals failure by returning None for the frames.
    if users_df is None or repos_df is None:
        print("\n❌ Analysis failed!")
        return

    create_readme(users_df, repos_df)
    print("\n✅ Analysis complete! Check users.csv, repositories.csv, and README.md in /content directory.")


# Run the pipeline only when executed directly, not when imported.
if __name__ == "__main__":
    main()


🚀 Starting GitHub Analysis...
Please enter your GitHub token: <REDACTED>
🔍 Searching for users in Hyderabad...
📊 Processing page 1 with 100 users found...

👤 Fetching data for user: iam-veeramalla
No more repositories for iam-veeramalla.

👤 Fetching data for user: in28minutes
No more repositories for in28minutes.

👤 Fetching data for user: stacksimplify
No more repositories for stacksimplify.

👤 Fetching data for user: thenaveensaggam
No more repositories for thenaveensaggam.

👤 Fetching data for user: MadhavBahl
No more repositories for MadhavBahl.

👤 Fetching data for user: sivaprasadreddy
No more repositories for sivaprasadreddy.

👤 Fetching data for user: ashokitschool
No more repositories for ashokitschool.

👤 Fetching data for user: Shahzaib-D-Memon
No more repositories for Shahzaib-D-Memon.

👤 Fetching data for user: codewithdev
No more repositories for codewithdev.

👤 Fetching data for user: NotHarshhaa
No more repositories for NotHarshhaa.

👤 Fetc

##### Solutions to questions based on analysis

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import stats
import re

# Load the scraped tables and parse their created_at columns as datetimes
print("Reading CSV files...")
users_df = pd.read_csv(r'C:\Users\kadal\Downloads\users.csv').assign(
    created_at=lambda table: pd.to_datetime(table['created_at'])
)
repos_df = pd.read_csv(r'C:\Users\kadal\Downloads\repositories.csv').assign(
    created_at=lambda table: pd.to_datetime(table['created_at'])
)

print(f"Loaded {len(users_df)} users and {len(repos_df)} repositories")

# Analysis functions
def get_weekend_repo_count(created_at):
    """Return True when *created_at* falls on a Saturday or Sunday.

    Despite the name, this returns a boolean for one timestamp; the caller
    does the counting.
    """
    saturday, sunday = 5, 6  # weekday() numbers Monday as 0
    return created_at.weekday() in (saturday, sunday)

def get_surname(name):
    """Return the last whitespace-separated word of *name*.

    Returns None for missing, empty or whitespace-only names.
    """
    if not name:
        return None
    if pd.isna(name):
        return None

    tokens = str(name).split()
    if tokens:
        return tokens[-1]
    return None

def bio_word_count(bio):
    """Return the number of whitespace-separated words in *bio*.

    Missing or empty bios count as zero words.
    """
    if not bio:
        return 0
    if pd.isna(bio):
        return 0

    words = str(bio).split()
    return len(words)

print("\nCalculating answers...")

# read_csv may turn 'true'/'false' text into booleans, so flag columns are
# normalized through lower-cased strings before any comparison.
is_hireable = users_df['hireable'].astype(str).str.lower() == 'true'
has_email = users_df['email'].notna() & (users_df['email'] != '')

# 1. Top 5 users by followers
top_followers = users_df.nlargest(5, 'followers')['login'].tolist()
answer1 = ','.join(top_followers)
print("\n1. Top 5 users by followers:", answer1)

# 2. 5 earliest registered users (now works with datetime)
earliest_users = users_df.nsmallest(5, 'created_at')['login'].tolist()
answer2 = ','.join(earliest_users)
print("\n2. 5 earliest registered users:", answer2)

# 3. Top 3 licenses
top_licenses = repos_df[repos_df['license_name'].notna()]['license_name'].value_counts().head(3).index.tolist()
answer3 = ','.join(top_licenses)
print("\n3. Top 3 licenses:", answer3)

# 4. Most common company
answer4 = users_df[users_df['company'].notna() & (users_df['company'] != '')]['company'].mode()[0]
print("\n4. Most common company:", answer4)

# 5. Most popular language
answer5 = repos_df[repos_df['language'].notna() & (repos_df['language'] != '')]['language'].mode()[0]
print("\n5. Most popular language:", answer5)

# 6. Second most popular language for users after 2020
cutoff_date = pd.Timestamp('2020-01-01')
user_tz = users_df['created_at'].dt.tz
if user_tz is not None:
    # Timestamps with a zone suffix parse as tz-aware; comparing them with a
    # naive cutoff raises TypeError, so give the cutoff the same zone.
    cutoff_date = cutoff_date.tz_localize(user_tz)
recent_users = users_df[users_df['created_at'] > cutoff_date]['login'].tolist()
recent_repos = repos_df[repos_df['login'].isin(recent_users)]
valid_langs = recent_repos[recent_repos['language'].notna() & (recent_repos['language'] != '')]
lang_counts = valid_langs['language'].value_counts()
answer6 = lang_counts.index[1] if len(lang_counts) > 1 else "No second language found"
print("\n6. Second most popular language for recent users:", answer6)

# 7. Language with highest average stars
lang_stats = repos_df[repos_df['language'].notna() & (repos_df['language'] != '')].groupby('language').agg({
    'stargazers_count': ['mean', 'count']
})
lang_stats.columns = ['mean_stars', 'count']

min_repos = 5  # Minimum number of repositories required
filtered_langs = lang_stats[lang_stats['count'] >= min_repos]
answer7 = filtered_langs.nlargest(1, 'mean_stars').index[0] if not filtered_langs.empty else "No qualifying languages"
print("\n7. Language with highest average stars:", answer7)

# 8. Top 5 by leader_strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leaders = users_df.nlargest(5, 'leader_strength')['login'].tolist()
answer8 = ','.join(top_leaders)
print("\n8. Top 5 by leader_strength:", answer8)

# 9. Correlation between followers and public_repos
answer9 = f"{users_df['followers'].corr(users_df['public_repos']):.3f}"
print("\n9. Correlation followers-repos:", answer9)

# 10. Regression slope of followers on repos
valid_data = users_df[['public_repos', 'followers']].dropna()
if len(valid_data) >= 2:  # Need at least 2 points for regression
    slope, _, _, _, _ = stats.linregress(valid_data['public_repos'], valid_data['followers'])
    answer10 = f"{slope:.3f}"
else:
    answer10 = "Insufficient data"
print("\n10. Regression slope followers-repos:", answer10)

# 11. Correlation between projects and wiki
# Lower-case first: a boolean column stringifies to 'True'/'False', which the
# lower-case mapping keys would otherwise miss, leaving every value NaN.
flag_to_int = {'true': 1, 'false': 0}
repos_df['has_projects_bool'] = repos_df['has_projects'].astype(str).str.lower().map(flag_to_int)
repos_df['has_wiki_bool'] = repos_df['has_wiki'].astype(str).str.lower().map(flag_to_int)
answer11 = f"{repos_df['has_projects_bool'].corr(repos_df['has_wiki_bool']):.3f}"
print("\n11. Correlation projects-wiki:", answer11)

# 12. Hireable users following difference
hireable_following = users_df[is_hireable]['following'].mean()
non_hireable_following = users_df[~is_hireable]['following'].mean()
answer12 = f"{hireable_following - non_hireable_following:.3f}"
print("\n12. Hireable following difference:", answer12)

# 13. Regression slope of followers on bio word count (non-empty bios only)
users_df['bio_words'] = users_df['bio'].apply(bio_word_count)
users_with_bio = users_df[users_df['bio_words'] > 0]
if len(users_with_bio) >= 2:
    slope, _, _, _, _ = stats.linregress(users_with_bio['bio_words'], users_with_bio['followers'])
    answer13 = f"{slope:.3f}"
else:
    answer13 = "Insufficient data"
print("\n13. Bio length correlation:", answer13)

# 14. Most weekend repositories
repos_df['is_weekend'] = repos_df['created_at'].apply(get_weekend_repo_count)
weekend_repos = repos_df[repos_df['is_weekend']].groupby('login').size()
top_weekend = weekend_repos.nlargest(5).index.tolist()
answer14 = ','.join(top_weekend)
print("\n14. Top weekend repository creators:", answer14)

# 15. Hireable email sharing difference
hireable_email = has_email[is_hireable].mean()
non_hireable_email = has_email[~is_hireable].mean()
answer15 = f"{hireable_email - non_hireable_email:.3f}"
print("\n15. Hireable email sharing difference:", answer15)

# 16. Most common surname
users_df['surname'] = users_df['name'].apply(get_surname)
valid_surnames = users_df['surname'].dropna()
surname_counts = valid_surnames.value_counts()
max_count = surname_counts.max() if not surname_counts.empty else 0
answer16 = str(max_count)
print("\n16. Number of users with most common surname:", answer16)


# Create a detailed summary with additional insights
print("\nDetailed Summary:")
print("-" * 50)

# Summary for users
print("\nUser Statistics:")
print(f"Total users analyzed: {len(users_df)}")
print(f"Average followers: {users_df['followers'].mean():.1f}")
print(f"Average public repos: {users_df['public_repos'].mean():.1f}")
# Uses the normalized mask; comparing the raw column to 'true' reports 0%
# whenever the column holds booleans.
print(f"Percentage hireable: {is_hireable.mean()*100:.1f}%")

# Summary for repositories
print("\nRepository Statistics:")
print(f"Total repositories analyzed: {len(repos_df)}")
print(f"Average stars per repo: {repos_df['stargazers_count'].mean():.1f}")
print(f"Number of different languages: {repos_df['language'].nunique()}")

# Export answers to a file
answers = {
    "1. Top followers": answer1,
    "2. Earliest users": answer2,
    "3. Top licenses": answer3,
    "4. Most common company": answer4,
    "5. Most popular language": answer5,
    "6. Second most popular recent language": answer6,
    "7. Highest average stars language": answer7,
    "8. Top leader strength": answer8,
    "9. Followers-repos correlation": answer9,
    "10. Followers-repos slope": answer10,
    "11. Projects-wiki correlation": answer11,
    "12. Hireable following difference": answer12,
    "13. Bio length correlation": answer13,
    "14. Weekend repository creators": answer14,
    "15. Hireable email difference": answer15,
    "16. Most common surname count": answer16
}

# Save answers to a text file for better viewing.
# Explicit UTF-8 because logins and company names may be non-ASCII.
with open('analysis_results.txt', 'w', encoding='utf-8') as f:
    f.write("GitHub Hyderabad Developer Analysis Results\n")
    f.write("=" * 40 + "\n\n")
    for question, answer in answers.items():
        f.write(f"{question}:\n{answer}\n\n")

print("\nResults have been saved to 'analysis_results.txt'")