In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# --- Fix Truncation Issue ---
# Display settings: never truncate long cell strings, and print wide frames
# on a single line instead of wrapping them across several blocks.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

# 1. Load Data
# The first CSV column is used as the DataFrame index.
try:
    df = pd.read_csv('kaggle-preprocessed.csv', index_col=0)
except FileNotFoundError:
    print("Error: 'kaggle-preprocessed.csv' not found. Please ensure the file is in the same directory.")
    # `exit()` is an interactive-shell helper injected by the `site` module
    # (and overridden by IPython); it is not guaranteed to exist and it
    # reports success. Raise SystemExit with a non-zero status instead so the
    # failure is visible to whatever launched the script.
    raise SystemExit(1) from None

# ==========================================
# 2. DATA CLEANING & NOISE REDUCTION
# ==========================================

# Standardizing 'Size' (Handles noise from mixed units KB, MB, GB, TB, B)
# Multi-letter unit suffixes and how many megabytes one unit represents.
# They are matched before the bare 'B' suffix so that "MB"/"KB"/"GB"/"TB"
# are never mistaken for plain bytes.
_SIZE_SUFFIX_TO_MB = (
    ('TB', 1024.0 * 1024.0),
    ('GB', 1024.0),
    ('MB', 1.0),
    ('KB', 1.0 / 1024.0),
)


def standardize_size_to_mb(val):
    """Convert a human-readable size such as ``"3 kB"`` to megabytes.

    Args:
        val: Any value; it is converted with ``str()`` and matched
            case-insensitively. A number with no unit suffix is taken to be
            megabytes already.

    Returns:
        The size in megabytes as a float. Byte-sized values ("512 B") are
        reported as a nominal ``0.01`` MB. Anything that cannot be parsed as
        a finite number (including NaN / missing values) yields ``0.0``.
    """
    import math

    text = str(val).strip().upper()
    factor = 1.0
    is_bytes = False

    for suffix, mb_per_unit in _SIZE_SUFFIX_TO_MB:
        if text.endswith(suffix):
            text = text[:-len(suffix)]
            factor = mb_per_unit
            break
    else:
        # Only reached when no multi-letter unit matched, so a trailing 'B'
        # here really does mean plain bytes.
        if text.endswith('B'):
            text = text[:-1]
            is_bytes = True

    try:
        magnitude = float(text.strip())
    except ValueError:
        return 0.0

    # str(NaN) parses back to NaN; treat it like any other unusable value.
    if not math.isfinite(magnitude):
        return 0.0

    if is_bytes:
        # Byte-sized datasets are negligible at MB scale; keep a small
        # non-zero placeholder rather than an exact conversion.
        return 0.01
    return magnitude * factor

# Numeric size column: every raw 'size' entry converted to megabytes.
size_in_mb = df['size'].apply(standardize_size_to_mb)
df['size_mb'] = size_in_mb

# Weighted Medals: each medal tier is mapped to a point value
# (Gold=5, Silver=3, Bronze=1, No Medal=0); unmapped or missing labels score 0.
medal_map = {
    'Gold': 5,
    'Silver': 3,
    'Bronze': 1,
    'No Medal': 0,
}
mapped_medals = df['Medals'].map(medal_map)
df['medal_points'] = mapped_medals.fillna(0)

# Missing Usability values are imputed with the column median.
usability_median = df['Usability'].median()
df['Usability'] = df['Usability'].fillna(usability_median)

# ==========================================
# 3. PORTFOLIO AGGREGATION (Repeated Authors)
# ==========================================
# One row per author: their whole publishing history is collapsed into a
# single "resume" record.
portfolio_spec = {
    'Dataset_name': 'count',   # number of datasets published
    'No_of_files': 'sum',      # total files across all datasets
    'Upvotes': 'sum',          # total upvotes received
    'Usability': 'mean',       # average usability rating
    'medal_points': 'sum',     # total weighted medal points
    'size_mb': 'sum',          # total data volume in MB
}
per_author = df.groupby('Author_name')
authors = per_author.agg(portfolio_spec)
# Turn 'Author_name' back into a regular column with a fresh 0..n-1 index.
authors = authors.reset_index()

# ==========================================
# 4. ML FEATURE SCALING & OUTLIER HANDLING
# ==========================================
# Heavily skewed totals are compressed with log(1 + x) so that a single
# extreme value does not dominate the min-max scaling below.
for source_col, target_col in (('Upvotes', 'log_upvotes'), ('size_mb', 'log_size')):
    authors[target_col] = np.log1p(authors[source_col])

# Rescale every scoring feature independently to the [0, 1] range.
cols_to_scale = [
    'Dataset_name',
    'No_of_files',
    'log_upvotes',
    'Usability',
    'medal_points',
    'log_size',
]
scaler = MinMaxScaler()
scaled_matrix = scaler.fit_transform(authors[cols_to_scale])
authors_scaled = pd.DataFrame(scaled_matrix, columns=cols_to_scale)

# ==========================================
# 5. ROLE-BASED RELATIONSHIP LOGIC
# ==========================================

# Scaled feature columns used by the role formulas below.
scaled_usability = authors_scaled['Usability']
scaled_volume = authors_scaled['Dataset_name']
scaled_medals = authors_scaled['medal_points']
scaled_upvotes = authors_scaled['log_upvotes']
scaled_files = authors_scaled['No_of_files']
scaled_size = authors_scaled['log_size']

# DEVELOPER Index: 70% average usability + 30% number of datasets.
authors['Developer_Score'] = scaled_usability.mul(0.7).add(scaled_volume.mul(0.3))

# SENIOR DEVELOPER Index: 60% medal points + 40% log-upvotes.
authors['Senior_Developer_Score'] = scaled_medals.mul(0.6).add(scaled_upvotes.mul(0.4))

# SOLUTION ARCHITECT Index: 40% file count + 40% log data size + 20% usability.
architect_score = scaled_files.mul(0.4).add(scaled_size.mul(0.4))
authors['Solution_Architect_Score'] = architect_score.add(scaled_usability.mul(0.2))

# ==========================================
# 6. FINAL OUTPUT & REASONING ENGINE
# ==========================================

def get_reasoning(role, row):
    """Build the one-sentence justification printed for a ranked candidate.

    Args:
        role: One of "Developer", "Senior Developer" or "Solution Architect".
        row: Mapping-like record (e.g. a DataFrame row) holding the author's
            aggregated metrics, accessed by column name.

    Returns:
        The explanation string for the given role, or ``None`` when the role
        is not one of the three recognised labels.
    """
    if role == "Developer":
        project_count = int(row['Dataset_name'])
        return (f"Selected for high documentation standards. Maintained an average Usability score "
                f"of {row['Usability']:.1f}/10 across {project_count} projects, ensuring clean code handovers.")
    elif role == "Senior Developer":
        medal_total = int(row['medal_points'])
        upvote_total = int(row['Upvotes'])
        return (f"Selected for community authority and peer vetting. Earned {medal_total} medal points "
                f"and {upvote_total} peer upvotes, establishing them as a gold-standard reference in the industry.")
    elif role == "Solution Architect":
        # Footprints strictly above 1024 MB are reported in GB, the rest in MB.
        total_mb = row['size_mb']
        if total_mb > 1024:
            size_val = total_mb / 1024
            unit = "GB"
        else:
            size_val = total_mb
            unit = "MB"
        file_total = int(row['No_of_files'])
        return (f"Selected for scale mastery and structural complexity. Successfully managed {file_total} "
                f"individual files across a total footprint of {size_val:.1f} {unit} data.")
    return None

# (display label, score column) for every role that gets a leaderboard.
roles = [
    ("Developer", "Developer_Score"),
    ("Senior Developer", "Senior_Developer_Score"),
    ("Solution Architect", "Solution_Architect_Score"),
]

for role_label, score_col in roles:
    banner = '#' * 80
    print(f"\n{banner}\nTOP 3 {role_label.upper()} CANDIDATES\n{banner}")
    # Highest score first; keep only the three best authors.
    ranked = authors.sort_values(score_col, ascending=False)
    top_3 = ranked.head(3)

    for rank, (_, row) in enumerate(top_3.iterrows(), start=1):
        basis = get_reasoning(role_label, row)
        print(f"{rank}. {row['Author_name']}")
        print(f"   BASIS: {basis}\n")


################################################################################
TOP 3 DEVELOPER CANDIDATES
################################################################################
1. The Devastator
   BASIS: Selected for high documentation standards. Maintained an average Usability score of 9.4/10 across 504 projects, ensuring clean code handovers.

2. fedesoriano
   BASIS: Selected for high documentation standards. Maintained an average Usability score of 10.0/10 across 23 projects, ensuring clean code handovers.

3. Marília Prata
   BASIS: Selected for high documentation standards. Maintained an average Usability score of 10.0/10 across 23 projects, ensuring clean code handovers.


################################################################################
TOP 3 SENIOR DEVELOPER CANDIDATES
################################################################################
1. Larxel
   BASIS: Selected for community authority and peer vetting. Earned 129 medal points and 9