In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# --- Display Settings ---
pd.set_option('display.max_colwidth', None)

# 1. Load and Initial Cleaning
def process_github_data(file_path):
    """Load GitHub candidate rows and compute per-user role scores.

    Rows are grouped by ``username`` and combined into one profile per
    user, heavy-tailed popularity metrics are log-compressed, all features
    are min-max scaled to [0, 1], and three weighted role scores are
    derived from the scaled features.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the candidates CSV. The file must provide the columns
            ``username``, ``dominant_language``, ``public_repos``,
            ``total_stars``, ``total_forks``, ``commits_12m`` and
            ``has_cicd``.

    Returns:
        A DataFrame with one row per username containing the aggregated raw
        metrics, ``log_stars`` / ``log_forks``, and the columns
        ``Developer_Score``, ``Senior_Developer_Score`` and
        ``Solution_Architect_Score``.
    """
    # Read from the caller-supplied path rather than a hard-coded file name,
    # so the function works for any candidates file.
    df = pd.read_csv(file_path)

    # Cleaning Language noise
    df['dominant_language'] = df['dominant_language'].str.strip()

    # AGGREGATION: Handling repeated usernames.
    # We treat the history as a cumulative engineering profile.
    candidates = df.groupby('username').agg({
        'public_repos': 'sum',
        'total_stars': 'sum',
        'total_forks': 'sum',
        'commits_12m': 'sum',
        'has_cicd': 'max',  # If they use CI/CD in any repo, they understand the practice
    }).reset_index()

    # HANDLING OUTLIERS (Log Scaling).
    # GitHub stats are heavily skewed; log1p keeps a handful of very popular
    # accounts from squashing everyone else towards 0 after min-max scaling.
    candidates['log_stars'] = np.log1p(candidates['total_stars'])
    candidates['log_forks'] = np.log1p(candidates['total_forks'])

    # ML SCALING: bring every feature into [0, 1] so the weights below are
    # comparable across features.
    scaler = MinMaxScaler()
    cols_to_scale = ['public_repos', 'log_stars', 'log_forks', 'commits_12m', 'has_cicd']
    scaled_data = scaler.fit_transform(candidates[cols_to_scale])
    # Reuse the candidates index so the score columns align row-for-row.
    df_s = pd.DataFrame(scaled_data, columns=cols_to_scale, index=candidates.index)

    # ==========================================
    # ROLE-BASED RELATIONSHIP LOGIC
    # ==========================================

    # DEVELOPER: Focuses on Activity (Commits) + Core Programming
    # Basis: Is this person actively pushing code?
    candidates['Developer_Score'] = (df_s['commits_12m'] * 0.7) + (df_s['public_repos'] * 0.3)

    # SENIOR DEVELOPER: Focuses on Impact (Stars) + Experience (Repos)
    # Basis: Has the community validated their expertise over time?
    candidates['Senior_Developer_Score'] = (df_s['log_stars'] * 0.6) + (df_s['public_repos'] * 0.4)

    # SOLUTION ARCHITECT: Focuses on Scale (Forks) + Best Practices (CI/CD)
    # Basis: Is their code architected for reuse and automated delivery?
    candidates['Solution_Architect_Score'] = (
        (df_s['log_forks'] * 0.5)
        + (df_s['has_cicd'] * 0.4)
        + (df_s['public_repos'] * 0.1)
    )

    return candidates

# 6. Reasoning Engine
def get_github_reason(role, row):
    """Build the human-readable justification shown for a ranked candidate.

    Args:
        role: Role label; "Developer", "Senior Developer" or
            "Solution Architect".
        row: Mapping-like record (e.g. a DataFrame row) holding the
            aggregated candidate metrics. Only the fields relevant to
            ``role`` are read.

    Returns:
        The explanation sentence for the role, or ``None`` when ``role``
        is not one of the recognised labels.
    """
    if role == "Developer":
        commit_count = int(row['commits_12m'])
        repo_count = int(row['public_repos'])
        return (
            f"Selected for high shipping frequency. Contributed {commit_count} commits "
            f"across {repo_count} repositories in the last year."
        )

    if role == "Senior Developer":
        star_count = int(row['total_stars'])
        return (
            f"Selected for community authority. Their work has earned {star_count} stars, "
            f"indicating a high level of industry trust and senior-level impact."
        )

    if role == "Solution Architect":
        # The CI/CD flag is read before the fork count, as a 0/1 indicator.
        if row['has_cicd'] == 1:
            cicd_status = "Integrated"
        else:
            cicd_status = "Not Found"
        fork_count = int(row['total_forks'])
        return (
            f"Selected for architectural reuse and rigor. Their systems are forked {fork_count} times "
            f"and they utilize CI/CD ({cicd_status}) for automated delivery."
        )

    # Unrecognised role labels produce no explanation.
    return None

# --- Execution ---
# Score every candidate once, then report the three best per role.
github_results = process_github_data('github_candidates_1.csv')

# (display name, score column) pairs, in reporting order.
roles = [
    ("Developer", "Developer_Score"),
    ("Senior Developer", "Senior_Developer_Score"),
    ("Solution Architect", "Solution_Architect_Score"),
]

banner = '#' * 80

for role_name, score_col in roles:
    print(f"\n{banner}\nTOP 3 {role_name.upper()} CANDIDATES (GitHub Pool)\n{banner}")

    # Highest score first; keep only the leading three rows.
    ranked = github_results.sort_values(score_col, ascending=False)
    top_3 = ranked.head(3)

    for rank, (_, row) in enumerate(top_3.iterrows(), start=1):
        reason = get_github_reason(role_name, row)
        print(f"{rank}. {row['username']}")
        print(f"   BASIS: {reason}\n")


################################################################################
TOP 3 DEVELOPER CANDIDATES (GitHub Pool)
################################################################################
1. mxstbr
   BASIS: Selected for high shipping frequency. Contributed 95 commits across 288 repositories in the last year.

2. JakeWharton
   BASIS: Selected for high shipping frequency. Contributed 95 commits across 149 repositories in the last year.

3. necolas
   BASIS: Selected for high shipping frequency. Contributed 95 commits across 80 repositories in the last year.


################################################################################
TOP 3 SENIOR DEVELOPER CANDIDATES (GitHub Pool)
################################################################################
1. getify
   BASIS: Selected for community authority. Their work has earned 226289 stars, indicating a high level of industry trust and senior-level impact.

2. evanphx
   BASIS: Selected for community author