# Logistic Regression for Sponsorship Success

This notebook fits a logistic regression model to predict "sponsorship success".

**Definition (configurable):**
- success = 1 if `total_sponsors` is at or above the 75th percentile of the sample (i.e., in the top quartile)
- success = 0 otherwise

**Predictors include GitHub activity and basic demographics:**
- followers, public_repos, estimated_earnings, total_sponsoring
- account type (User / Organization)
- gender (where available)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split

from data_loader import load_sample_data

In [None]:
def make_target(df: pd.DataFrame, quantile: float = 0.75) -> pd.Series:
    """Label each row as a sponsorship success (1) or not (0).

    A row is a success when its ``total_sponsors`` value is greater than or
    equal to the requested quantile of that column.

    Args:
        df: Dataset containing a ``total_sponsors`` column.
        quantile: Quantile of ``total_sponsors`` used as the success cutoff.

    Returns:
        An integer Series of 0/1 labels sharing ``df``'s index.

    Raises:
        ValueError: If ``df`` has no ``total_sponsors`` column.
    """
    target_column = "total_sponsors"
    if target_column in df.columns:
        sponsor_counts = df[target_column]
        cutoff = sponsor_counts.quantile(quantile)
        # Values equal to the cutoff count as successes.
        is_success = sponsor_counts >= cutoff
        return is_success.astype(int)

    raise ValueError("Column 'total_sponsors' not found in dataset.")


def make_features(df: pd.DataFrame, quantile: float = 0.75):
    """Build the feature matrix and binary target for the sponsorship model.

    Args:
        df: Raw dataset. Must contain ``total_sponsors``. The numeric
            predictor columns and the ``type`` / ``gender`` columns are
            optional.
        quantile: Quantile of ``total_sponsors`` forwarded to ``make_target``
            as the success cutoff. The default (0.75) matches
            ``make_target``'s own default, so existing callers are unaffected.

    Returns:
        A tuple ``(X, y)``: ``X`` is a DataFrame of numeric predictors plus
        dummy-encoded categorical predictors, and ``y`` is the 0/1 target
        Series. Both share ``df``'s index.

    Raises:
        ValueError: Propagated from ``make_target`` when ``total_sponsors``
            is missing.
    """
    # Work on a copy so the placeholder columns added below never leak into
    # the caller's frame.
    df = df.copy()

    y = make_target(df, quantile=quantile)

    feature_cols = [
        "followers",
        "public_repos",
        "estimated_earnings",
        "total_sponsoring",
    ]
    # A numeric predictor absent from the dataset is added as an all-zero
    # column so the feature matrix always has the same numeric columns.
    for col in feature_cols:
        if col not in df.columns:
            df[col] = 0.0

    # Missing numeric values are treated as zero.
    X_num = df[feature_cols].fillna(0.0)

    # Only encode the categorical columns that are actually present.
    cat_cols = [col for col in ("type", "gender") if col in df.columns]

    if cat_cols:
        # Missing categories get an explicit "Unknown" level; drop_first
        # removes one level per column so the dummies are not collinear.
        X_cat = pd.get_dummies(df[cat_cols].fillna("Unknown"), drop_first=True)
    else:
        # Empty frame on the same index keeps the concat below uniform.
        X_cat = pd.DataFrame(index=df.index)

    X = pd.concat([X_num, X_cat], axis=1)
    return X, y

## Load Data and Prepare Features

In [None]:
# Load the sample dataset through the project's data_loader helper, then
# derive the feature matrix X and the 0/1 target y with make_features.
df = load_sample_data()
X, y = make_features(df)

## Train-Test Split and Model Training

In [None]:
# Hold out 30% of the rows for evaluation. stratify=y keeps the proportion of
# successes the same in both splits, and the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# n_jobs is intentionally not passed: None is already the default, so the
# fitted model is unchanged.
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

## Model Evaluation

In [None]:
# Score the held-out rows: probability of the positive class (column 1 of
# predict_proba) for ROC-AUC, and hard 0/1 labels for the report.
y_proba = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

print("\nClassification report (predicting sponsorship success):\n")
print(classification_report(y_test, y_pred, digits=3))

# Only the scoring call is guarded; the success message is printed in the
# else branch once a score is available.
try:
    auc = roc_auc_score(y_test, y_proba)
except ValueError:
    print("ROC-AUC could not be computed (likely due to only one class in y_test).")
else:
    print(f"ROC-AUC: {auc:.3f}")

## Model Coefficients

In [None]:
# Label each fitted coefficient with its feature name (clf.coef_[0] is the
# single coefficient row of this binary model) and sort from most positive
# to most negative.
# NOTE(review): no feature scaling is visible in this notebook, so each
# coefficient is per raw unit of its feature and magnitudes are not directly
# comparable across features — confirm before ranking predictors by size.
coef = pd.Series(clf.coef_[0], index=X.columns).sort_values(ascending=False)
print("\nEstimated logistic regression coefficients (higher => stronger positive association):\n")
# Bare expression: presumably shown as the cell's output when run in a notebook.
coef