# Multinomial Logit: Solutions

**Tutorial Series**: Discrete Choice Econometrics with PanelBox

**Notebook**: 06 - Multinomial Logit (Solutions)

**Author**: PanelBox Contributors

**Date**: 2026-02-17

---

This notebook contains complete solutions for the exercises in `06_multinomial_logit.ipynb`.

In [None]:
# Setup (same as main notebook)
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, chi2

from panelbox.models.discrete.multinomial import MultinomialLogit

# --- Notebook-wide configuration -------------------------------------------
# Suppress library warnings and seed NumPy so repeated runs print the same
# numbers.
warnings.filterwarnings('ignore')
np.random.seed(42)

# pandas display: never truncate columns, show four decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 4)

# Matplotlib defaults shared by every figure in this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 11})

# --- Paths (relative to the notebook's parent directory) -------------------
_PROJECT_ROOT = Path("..")
DATA_DIR = _PROJECT_ROOT / "data"
OUTPUT_DIR = _PROJECT_ROOT / "outputs"
FIG_DIR, TABLE_DIR = OUTPUT_DIR / "figures", OUTPUT_DIR / "tables"
for _out_dir in (FIG_DIR, TABLE_DIR):
    # Create the output folders up front; a no-op when they already exist.
    _out_dir.mkdir(parents=True, exist_ok=True)

# --- Career coding used throughout the exercises ---------------------------
CAREER_LABELS = dict(enumerate(['Manual', 'Technical', 'Managerial']))
CAREER_COLORS = dict(Manual='#e74c3c', Technical='#3498db', Managerial='#2ecc71')

# --- Data -------------------------------------------------------------------
full_vars = ['educ', 'exper', 'age', 'female', 'urban']
data = pd.read_csv(DATA_DIR / "career_choice.csv")
y = data['career'].values
X_full = data[full_vars].values

# --- Full model (every exercise below reuses model_full / results_full) ----
model_full = MultinomialLogit(
    endog=y,
    exog=X_full,
    n_alternatives=3,
    base_alternative=0,
)
model_full.exog_names = full_vars
results_full = model_full.fit()

print(
    "Setup complete. Full model estimated.",
    f"Log-L: {results_full.llf:.2f}, Accuracy: {results_full.accuracy:.1%}",
    sep="\n",
)

---

## Exercise 1: Reference Category Invariance (Easy)

**Task**: Re-estimate the model with `base_alternative=1` and verify that the predicted probabilities are identical to those obtained with the original base (`base_alternative=0`).

In [None]:
# Exercise 1 Solution
#
# Fit the same specification three times, changing only the reference (base)
# alternative, then list the estimated coefficients of every fit.


def _fit_with_base(base):
    """Fit the full-covariate MNL with `base` as the reference alternative.

    Returns the (model, results) pair.
    """
    model = MultinomialLogit(
        endog=y, exog=X_full, n_alternatives=3, base_alternative=base
    )
    return model, model.fit()


model_base0, results_base0 = _fit_with_base(0)  # base = Manual
model_base1, results_base1 = _fit_with_base(1)  # base = Technical
model_base2, results_base2 = _fit_with_base(2)  # base = Managerial

print("=== Coefficients with Different Base Categories ===")
fits_by_base = {
    0: (model_base0, results_base0),
    1: (model_base1, results_base1),
    2: (model_base2, results_base2),
}
for base, (fitted_model, fitted_res) in fits_by_base.items():
    base_name = CAREER_LABELS[base]
    print(f"\nBase = {base} ({base_name}):")
    # One coefficient row per non-base alternative, in the model's own order.
    for row, alt in enumerate(fitted_model.non_base_alts):
        named_coefs = zip(full_vars, fitted_res.params_matrix[row])
        coefs = ', '.join(f'{name}={value:+.4f}' for name, value in named_coefs)
        print(f"  {CAREER_LABELS[alt]:10s} vs {base_name:10s}: {coefs}")

In [None]:
# Compare predicted probabilities
#
# The three fits differ only in their normalization, so their predicted
# probability matrices should agree up to numerical noise.
probs_base0 = results_base0.predict_proba()
probs_base1 = results_base1.predict_proba()
probs_base2 = results_base2.predict_proba()

career_columns = ['Manual', 'Technical', 'Managerial']

print("=== First 5 Predicted Probability Vectors ===")
previews = (
    ("Base = 0 (Manual)", probs_base0),
    ("Base = 1 (Technical)", probs_base1),
)
for heading, probs in previews:
    print(f"\n{heading}:")
    print(pd.DataFrame(probs[:5], columns=career_columns).round(4))

# Verify identity
print("\n=== Numerical Verification ===")
pairwise = (
    ("Base 0 vs Base 1", probs_base0, probs_base1),
    ("Base 0 vs Base 2", probs_base0, probs_base2),
    ("Base 1 vs Base 2", probs_base1, probs_base2),
)
for tag, lhs, rhs in pairwise:
    largest_gap = np.max(np.abs(lhs - rhs))
    print(f"{tag}: max diff = {largest_gap:.2e}")

# Agreement with base 0 for both alternatives implies all three agree.
all_identical = (
    np.allclose(probs_base0, probs_base1)
    and np.allclose(probs_base0, probs_base2)
)
print(f"\nAll identical: {all_identical}")

for line in (
    "\n=== Interpretation ===",
    "Coefficients CHANGE with the base category (they are relative to the base).",
    "Predictions do NOT change. The base is a normalization, not a substantive choice.",
    "Choose the base that makes interpretation most natural.",
):
    print(line)

---

## Exercise 2: IIA Test (Medium)

**Task**: Run the Hausman-McFadden test by omitting each non-base alternative in turn and re-estimating the model on the restricted choice set.

In [None]:
# Exercise 2 Solution
#
# Hausman-McFadden test of IIA. For each non-base alternative in turn:
#   1. drop the observations that chose it and re-estimate on the remaining
#      two alternatives (restricted model);
#   2. compare the restricted coefficients b_r with the full-model
#      coefficients b_f of the same remaining alternative via
#
#          H = (b_r - b_f)' [V_r - V_f]^+ (b_r - b_f)
#
#      referred to a chi-square with df = rank(V_r - V_f).
#
# In finite samples V_r - V_f need not be positive definite, so:
#   * a generalized inverse is used (it equals the ordinary inverse whenever
#     the difference is well-conditioned and full rank), and
#   * a NEGATIVE H is reported as such instead of being folded to |H|, which
#     could manufacture a spurious rejection. Following Hausman & McFadden,
#     a negative statistic is read as no evidence against IIA.

print("=" * 70)
print(" " * 15 + "HAUSMAN-MCFADDEN IIA TEST")
print("=" * 70)

iia_results = []

# Loop-invariant: K is used below as the size of one alternative's block when
# slicing cov_params.
# NOTE(review): assumes cov_params is an array whose rows/columns are ordered
# alternative-by-alternative, K entries each -- confirm against the model API.
K = model_full.K

for alt_to_remove in [1, 2]:  # Omit Technical, then Managerial
    # Restrict data to individuals who did not choose the omitted alternative
    mask = data['career'] != alt_to_remove
    data_sub = data[mask].copy()

    # Remap the two remaining codes to 0/1 (0 = Manual, 1 = remaining non-base)
    remaining_alts = sorted(set(range(3)) - {alt_to_remove})
    remap = {old: new for new, old in enumerate(remaining_alts)}
    y_sub = data_sub['career'].map(remap).values
    X_sub = data_sub[full_vars].values

    base_in_sub = remap[0]  # Manual stays as base

    model_sub = MultinomialLogit(
        endog=y_sub, exog=X_sub, n_alternatives=2, base_alternative=base_in_sub
    )
    results_sub = model_sub.fit()

    # Locate the coefficient row of the surviving non-base alternative in the
    # full model so that like is compared with like.
    remaining_nonbase = next(a for a in remaining_alts if a != 0)
    full_idx = list(model_full.non_base_alts).index(remaining_nonbase)

    beta_full = results_full.params_matrix[full_idx]
    beta_sub = results_sub.params_matrix[0]

    start = full_idx * K
    end = start + K
    vcov_full = results_full.cov_params[start:end, start:end]
    vcov_sub = results_sub.cov_params[:K, :K]

    diff = beta_sub - beta_full
    vcov_diff = vcov_sub - vcov_full

    try:
        H = float(diff @ np.linalg.pinv(vcov_diff) @ diff)
        df = int(np.linalg.matrix_rank(vcov_diff))
    except np.linalg.LinAlgError:
        # SVD failed (e.g. non-finite entries): the statistic is undefined.
        H, df = np.nan, 0

    if df == 0 or np.isnan(H):
        H, p_value = np.nan, np.nan
        conclusion = 'Singular matrix'
    elif H < 0:
        # The chi-square approximation does not apply to a negative value;
        # report no p-value rather than a misleading one.
        p_value = np.nan
        conclusion = 'Fail to reject IIA (H < 0)'
    else:
        # sf() is the upper-tail probability, more accurate than 1 - cdf().
        p_value = chi2.sf(H, df)
        conclusion = 'Fail to reject IIA' if p_value > 0.05 else 'Reject IIA'

    iia_results.append({
        'Removed': CAREER_LABELS[alt_to_remove],
        'N_obs': len(data_sub),
        'H': H,
        'df': df,
        'p_value': p_value,
        'Conclusion': conclusion
    })

    print(f"\nOmitting {CAREER_LABELS[alt_to_remove]}:")
    print(f"  Coefficient comparison ({CAREER_LABELS[remaining_nonbase]} vs Manual):")
    for i, var in enumerate(full_vars):
        print(f"    {var:8s}: Full = {beta_full[i]:+.4f}, Restricted = {beta_sub[i]:+.4f}, Diff = {diff[i]:+.4f}")
    print(f"  H = {H:.4f}, df = {df}, p = {p_value:.4f}")
    print(f"  -> {conclusion}")

print("\n=== Summary ===")
iia_df = pd.DataFrame(iia_results)
print(iia_df.to_string(index=False))

print("\nInterpretation: If both tests fail to reject, the IIA assumption")
print("appears reasonable for this career choice model.")

---

## Exercise 3: Marginal Effects Interpretation (Medium)

**Task**: Calculate the average marginal effects (AME) of education on each career probability and verify that they sum to zero across the alternatives.

In [None]:
# Exercise 3 Solution

# Step 1: AME for education
# Look the column up by name instead of hard-coding its position in full_vars.
me_educ_overall = results_full.marginal_effects(
    at='overall', variable=full_vars.index('educ')
)

print("=== AME for Education (at='overall') ===")
for j in range(3):
    print(f"  P({CAREER_LABELS[j]:10s}): {me_educ_overall[j]:+.4f} ({me_educ_overall[j]*100:+.2f} pp per extra year)")

# Step 2: Verify sum-to-zero
# Probabilities sum to one for every individual, so the marginal effects of
# any variable must cancel out across the alternatives.
total = me_educ_overall.sum()
print(f"\nSum of AME: {total:.8f}")
print(f"Sum-to-zero check: {'PASS' if abs(total) < 1e-4 else 'CHECK'}")

# Step 3: Interpretation
# The direction word is derived from the sign of the estimate and the
# magnitude is printed unsigned, so the text can never read as a double
# negative ("DECREASES ... by -x.xx pp") or contradict the number shown.
print("\n=== Interpretation ===")
print("One additional year of education:")
for j in (2, 0, 1):  # Managerial, Manual, Technical
    effect_pp = me_educ_overall[j] * 100
    direction = 'INCREASES' if effect_pp > 0 else 'DECREASES'
    print(f"  - {direction} P({CAREER_LABELS[j]}) by {abs(effect_pp):.2f} pp")
print("  - The gains in Managerial come mainly from losses in Manual and Technical.")
print("  - This is a direct consequence of the sum-to-zero constraint.")

In [None]:
# Step 4: Compare 'overall' vs 'mean'
#
# Same education effect as above, but evaluated at the sample-mean covariate
# vector instead of being averaged over individuals.
me_educ_mean = results_full.marginal_effects(at='mean', variable=0)

print("=== AME: Overall vs At-Mean ===")
career_names = [CAREER_LABELS[j] for j in range(3)]
columns = {
    'AME (overall)': me_educ_overall,
    'ME at mean': me_educ_mean,
    'Difference': me_educ_overall - me_educ_mean,
}
comparison = pd.DataFrame(columns, index=career_names)
print(comparison.round(4))

print("\nInterpretation:")
notes = (
    "  'overall' = average of individual-level MEs (accounts for full distribution)",
    "  'mean'    = ME evaluated at the average individual",
    "  The difference arises because the MNL probability function is nonlinear.",
    "  AME (overall) is generally preferred for policy analysis.",
)
print("\n".join(notes))

---

## Exercise 4: Multinomial vs Conditional Logit (Hard)

**Task**: Compare MNL and Conditional Logit using transportation data.

In [None]:
# Exercise 4 Solution

from panelbox.models.discrete.multinomial import ConditionalLogit

# Step 1: Load transportation data
transport = pd.read_csv(DATA_DIR / "transportation_choice.csv")

# Key identifying one choice occasion: "<id>_<year>".
id_part = transport['id'].astype(str)
year_part = transport['year'].astype(str)
transport['choice_id'] = id_part + '_' + year_part

available_modes = sorted(transport['mode'].unique())
print(f"Transportation data: {transport.shape}")
print(f"Modes: {available_modes}")

# Step 2: Conditional Logit (alternative-specific attributes)
cl_spec = dict(
    data=transport,
    choice_col='choice_id',
    alt_col='mode',
    chosen_col='choice',
    alt_varying_vars=['cost', 'time', 'reliability', 'comfort'],
)
model_cl = ConditionalLogit(**cl_spec)
results_cl = model_cl.fit()

print("\n=== Conditional Logit Results ===")
print(results_cl.summary())

In [None]:
# Step 3: Multinomial Logit (individual-specific attributes)
# Keep only the rows flagged as chosen (choice == 1), together with the
# individual-level covariates the MNL will use.
# NOTE(review): presumably exactly one chosen row per choice_id -- confirm.
is_chosen = transport['choice'] == 1
keep_cols = ['choice_id', 'mode', 'income', 'distance']
chosen = transport.loc[is_chosen, keep_cols].copy()

# Encode each mode as its position in the sorted list of distinct modes.
mode_names = sorted(transport['mode'].unique())
mode_map = dict(zip(mode_names, range(len(mode_names))))
chosen['mode_num'] = chosen['mode'].map(mode_map)

# Outcome and design matrix for the MNL.
y_transport = chosen['mode_num'].values
X_transport = chosen[['income', 'distance']].values

model_mnl = MultinomialLogit(
    endog=y_transport,
    exog=X_transport,
    n_alternatives=len(mode_map),
    base_alternative=0,
)
model_mnl.exog_names = ['income', 'distance']
results_mnl = model_mnl.fit()

print("\n=== Multinomial Logit Results ===")
print(results_mnl.summary())

In [None]:
# Step 4: Comparison
#
# Side-by-side fit statistics for the two models, followed by guidance on
# when each specification is appropriate.
banner = "=" * 70
print(banner)
print(" " * 10 + "CONDITIONAL LOGIT vs MULTINOMIAL LOGIT")
print(banner)

print(f"\n{'Metric':<25} {'Conditional Logit':<20} {'Multinomial Logit':<20}")
print("-" * 65)

# (row label, results attribute, numeric format) -- attributes are read row
# by row so the table prints progressively.
float_rows = (
    ('Log-likelihood', 'llf', '.2f'),
    ('Pseudo R-squared', 'pseudo_r2', '.4f'),
    ('Accuracy', 'accuracy', '.4f'),
)
for metric, attr, spec in float_rows:
    cl_value = getattr(results_cl, attr)
    mnl_value = getattr(results_mnl, attr)
    print(f"{metric:<25} {cl_value:<20{spec}} {mnl_value:<20{spec}}")

n_cl, n_mnl = len(results_cl.params), len(results_mnl.params)
print(f"{'N parameters':<25} {n_cl:<20d} {n_mnl:<20d}")

print("\n=== When to Use Each ===")
guidance = {
    "Conditional Logit": (
        "  - When alternatives differ in measurable attributes (cost, time, quality)",
        "  - Common coefficients: effect of cost is the SAME for all alternatives",
        "  - Ideal for policy simulations (what if bus cost decreases?)",
    ),
    "Multinomial Logit": (
        "  - When alternatives differ mainly in the eyes of the individual",
        "  - Alternative-specific coefficients: education affects each career differently",
        "  - Ideal for understanding WHO chooses WHAT",
    ),
    "Mixed (Generalized MNL)": (
        "  - Use both individual-specific AND alternative-specific variables",
        "  - Most flexible but requires more data",
    ),
}
for model_name, bullets in guidance.items():
    print(f"\n{model_name}:")
    for bullet in bullets:
        print(bullet)

---

## Exercise 5: Subgroup Analysis (Hard)

**Task**: Estimate career choice models separately for men and women.

In [None]:
# Exercise 5 Solution

# Step 1: Split by gender
is_male = data['female'] == 0
is_female = data['female'] == 1
data_male = data[is_male]
data_female = data[is_female]

print(f"Male observations: {len(data_male)}")
print(f"Female observations: {len(data_female)}")

# Career distribution by gender: share of each career code within the group.
print("\nCareer distribution:")
for label, sub in (('Male', data_male), ('Female', data_female)):
    shares = sub['career'].value_counts(normalize=True).sort_index()
    summary = ', '.join(
        f"{CAREER_LABELS[code]}={shares[code]:.1%}" for code in range(3)
    )
    print(f"  {label}: {summary}")

In [None]:
# Step 2: Estimate for each subgroup (drop 'female' from covariates)
# 'female' is constant within each subsample, so it is left out of the
# covariate list.
sub_vars = ['educ', 'exper', 'age', 'urban']


def _fit_career_model(frame):
    """Fit the career-choice MNL (base = 0) on `frame` using `sub_vars`.

    Returns the (model, results) pair.
    """
    model = MultinomialLogit(
        endog=frame['career'].values,
        exog=frame[sub_vars].values,
        n_alternatives=3,
        base_alternative=0,
    )
    model.exog_names = sub_vars
    return model, model.fit()


model_male, results_male = _fit_career_model(data_male)        # Male model
model_female, results_female = _fit_career_model(data_female)  # Female model

for title, fitted in (
    ("=== Male Model ===", results_male),
    ("\n=== Female Model ===", results_female),
):
    print(title)
    print(fitted.summary())

In [None]:
# Step 3: Compare coefficients
#
# One table row per (non-base alternative, covariate); Diff = Female - Male.
print("=== Coefficient Comparison: Male vs Female ===")
print(f"\n{'Variable':<10} {'Alt':<12} {'Male':>10} {'Female':>10} {'Diff':>10}")
print("-" * 55)

for row, alt in enumerate(model_male.non_base_alts):
    alt_name = CAREER_LABELS[alt]
    for col, var in enumerate(sub_vars):
        coef_m = results_male.params_matrix[row, col]
        coef_f = results_female.params_matrix[row, col]
        gap = coef_f - coef_m
        print(f"{var:<10} {alt_name:<12} {coef_m:+10.4f} {coef_f:+10.4f} {gap:+10.4f}")
    # Blank line between alternatives.
    print()

In [None]:
# Step 4: Compare AME
#
# The AME arrays are indexed below as [career, variable]; Gap = Female - Male.
ame_male = results_male.marginal_effects(at='overall')
ame_female = results_female.marginal_effects(at='overall')

print("=== AME Comparison ===")
print(f"\n{'Variable':<10} {'Career':<12} {'Male AME':>10} {'Female AME':>10} {'Gap':>10}")
print("-" * 55)

for col, var in enumerate(sub_vars):
    for alt in range(3):
        # Show the variable name only on the first row of its group.
        var_cell = var if alt == 0 else ''
        me_m = ame_male[alt, col]
        me_f = ame_female[alt, col]
        gap = me_f - me_m
        print(f"{var_cell:<10} {CAREER_LABELS[alt]:<12} {me_m:+10.4f} {me_f:+10.4f} {gap:+10.4f}")
    print()

In [None]:
# Step 5: Visualize differences
#
# Two side-by-side grouped bar charts (education on the left, urban location
# on the right), each comparing the male and female AME for every career.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

x = np.arange(3)
width = 0.35
labels_list = [CAREER_LABELS[j] for j in range(3)]

# (covariate, panel title) for each subplot, left to right.
panels = [
    ('educ', 'AME of Education by Gender'),
    ('urban', 'AME of Urban Location by Gender'),
]
# (legend label, AME array, horizontal bar offset, bar colour) for each group.
gender_series = [
    ('Male', ame_male, -width / 2, '#3498db'),
    ('Female', ame_female, width / 2, '#e74c3c'),
]

for ax, (var, title) in zip(axes, panels):
    var_idx = sub_vars.index(var)
    for gender, ame, offset, color in gender_series:
        ax.bar(x + offset, ame[:, var_idx], width, label=gender,
               color=color, alpha=0.8, edgecolor='black')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Career')
    ax.set_ylabel('AME')
    ax.set_xticks(x)
    ax.set_xticklabels(labels_list)
    ax.legend()
    # Zero reference line makes the sign of each effect easy to read.
    ax.axhline(y=0, color='k', linestyle='--', linewidth=0.8, alpha=0.5)
    ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Subgroup Analysis: Career Choice Determinants by Gender',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

print("\n=== Key Findings ===")
print("The subgroup analysis reveals whether career determinants")
print("operate differently for men and women. Differences in AME")
print("indicate that the same variable (e.g., education) may have")
print("different impacts on career selection for each gender.")

---

**End of Solutions**