# Complete Discrete Choice Case Study: Solutions

**Tutorial Series**: Discrete Choice Econometrics with PanelBox

**Notebook**: 09 - Complete Case Study (Solutions)

**Author**: PanelBox Contributors

**Date**: 2026-02-17

---

This notebook contains complete solutions for the exercises in `09_complete_case_study.ipynb`.

In [None]:
# Setup (same as main notebook)
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from scipy.stats import norm, chi2
from scipy.special import expit
import statsmodels.api as sm

from panelbox.models.discrete.binary import (
    PooledLogit, PooledProbit, FixedEffectsLogit, RandomEffectsProbit
)
from panelbox.models.discrete.multinomial import MultinomialLogit
from panelbox.models.discrete.dynamic import DynamicBinaryPanel
from panelbox.models.discrete.ordered import OrderedLogit

# Notebook-wide configuration: ignore all warnings, fix NumPy's global RNG
# seed, and set the pandas display options.
warnings.filterwarnings('ignore')
np.random.seed(42)
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.precision', 4),
):
    pd.set_option(option_name, option_value)

# Plot defaults shared by every figure below.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6), 'font.size': 11})

# Input/output locations, one level above the working directory.
DATA_DIR = Path("..") / "data"
OUTPUT_DIR = Path("..") / "outputs"
FIG_DIR = OUTPUT_DIR / "figures"
TABLE_DIR = OUTPUT_DIR / "tables"
REPORT_DIR = OUTPUT_DIR / "reports"

# Create the output folders up front so later saves find them in place.
for out_dir in (FIG_DIR, TABLE_DIR, REPORT_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Colour and label lookups keyed by the integer code stored in `mode`.
MODE_COLORS = {0: '#e74c3c', 1: '#f39c12', 2: '#2ecc71'}
MODE_LABELS = dict(enumerate(('On-site', 'Hybrid', 'Remote')))

# Load data and derive the two binary outcomes used by the exercises.
data = pd.read_csv(DATA_DIR / "work_mode_panel.csv")
mode_code = data['mode']
data['remote'] = mode_code.eq(2).astype(int)       # 1 only when mode == 2
data['flex_work'] = mode_code.ge(1).astype(int)    # 1 when mode is 1 or 2

exog_vars_mnl = ['prod_remote', 'commute', 'kids', 'age', 'educ', 'tech_job']

# Estimate base models (needed for exercises)
# The formula's right-hand side is the same regressor list the MNL uses.
remote_formula = "remote ~ " + " + ".join(exog_vars_mnl)
model_pooled = PooledLogit(remote_formula, data, "worker_id", "year")
results_pooled = model_pooled.fit(cov_type='cluster')

# MNL on the three-way `mode` outcome, with alternative 0 as the base.
mnl_spec = dict(
    endog=data['mode'].values,
    exog=data[exog_vars_mnl].values,
    base_alternative=0,
    method='pooled',
)
model_mnl = MultinomialLogit(**mnl_spec)
model_mnl.exog_names = exog_vars_mnl
np.random.seed(42)  # re-seed immediately before fitting
results_mnl = model_mnl.fit()

n_workers = data['worker_id'].nunique()
print("Setup complete.")
print(f"Dataset: {len(data)} obs, {n_workers} workers")

---

## Exercise 1: Replicate with Different Binary Cutoff (Medium)

**Task**: Apply the full pipeline (Pooled Logit, FE Logit, RE Probit) using
`flex_work = (mode >= 1)` instead of `remote = (mode == 2)`. How do the
results change?

In [None]:
# Exercise 1 Solution

# Step 1: flex_work already created in setup
# Compare how often each binary outcome equals 1 across all observations.
share_flex = data['flex_work'].mean()
share_remote = data['remote'].mean()

print("Outcome variable: flex_work (hybrid + remote vs on-site)")
print(f"P(flex_work=1): {share_flex:.3f}")
print(f"P(remote=1):    {share_remote:.3f}")
print("\nNote: flex_work has much higher prevalence than remote.")

In [None]:
# Step 2: Pooled Logit with flex_work
# Same regressors as the baseline remote model; only the outcome changes.
flex_formula = "flex_work ~ prod_remote + commute + kids + age + educ + tech_job"
model_flex_logit = PooledLogit(flex_formula, data, "worker_id", "year")
results_flex_logit = model_flex_logit.fit(cov_type='cluster')

rule = "=" * 60
print(rule, "POOLED LOGIT: Flex Work (hybrid+remote) vs On-site", rule, sep="\n")
print(results_flex_logit.summary())

In [None]:
# Step 3: FE Logit with flex_work
fe_formula = "flex_work ~ prod_remote + commute + kids"
model_flex_fe = FixedEffectsLogit(fe_formula, data, "worker_id", "year")
results_flex_fe = model_flex_fe.fit()

# Entities the estimator used plus those it dropped gives the full count.
n_used = model_flex_fe.n_used_entities
n_total = n_used + model_flex_fe.n_dropped_entities

rule = "=" * 60
print(rule, "FE LOGIT: Flex Work", rule, sep="\n")
print(results_flex_fe.summary())
print(f"\nSwitchers: {n_used} / {n_total}")

In [None]:
# Step 4: RE Probit with flex_work
# Note: RE Probit uses Gauss-Hermite quadrature over each entity,
# so we use a subsample for computational tractability
np.random.seed(42)  # re-seed so the same 300 workers are drawn on every run
worker_ids = data['worker_id'].unique()
re_workers = np.random.choice(worker_ids, size=300, replace=False)

# Keep every row belonging to a sampled worker.
in_subsample = data['worker_id'].isin(re_workers)
data_re_sub = data.loc[in_subsample].copy()

re_formula = "flex_work ~ prod_remote + commute + kids + age + educ + tech_job"
model_flex_re = RandomEffectsProbit(re_formula, data_re_sub, "worker_id", "year")
results_flex_re = model_flex_re.fit()

rule = "=" * 60
print(rule, "RE PROBIT: Flex Work (subsample N=300 workers)", rule, sep="\n")
print(results_flex_re.summary())

In [None]:
# Step 5: Compare remote vs flex_work results
wide_rule = "=" * 70
print(wide_rule, "COMPARISON: Remote vs Flex Work (Pooled Logit)", wide_rule, sep="\n")
print(f"\n{'Variable':>15s} {'Remote':>10s} {'Flex Work':>10s} {'Diff':>10s}")
print("-" * 50)

# One row per regressor: both pooled-logit coefficients and their gap.
for name in ('prod_remote', 'commute', 'kids', 'age', 'educ', 'tech_job'):
    b_remote = results_pooled.params[name]
    b_flex = results_flex_logit.params[name]
    print(f"{name:>15s} {b_remote:>10.4f} {b_flex:>10.4f} {b_flex - b_remote:>+10.4f}")

answer_lines = (
    "\n=== Answers ===",
    "1. The coefficients are generally smaller for flex_work because",
    "   the outcome is more common (P~0.82 vs P~0.40). The logit",
    "   model operates on a different part of the probability curve.",
    "",
    "2. Signs should be consistent: prod_remote and commute still",
    "   increase flexibility, tech_job still matters.",
    "",
    "3. The flex_work definition captures the hybrid-vs-onsite margin,",
    "   which may have different determinants than remote-vs-everything.",
)
print("\n".join(answer_lines))

---

## Exercise 2: Ordered Alternative (Medium)

**Task**: Treat work mode as ordered: on-site < hybrid < remote (by flexibility).
Estimate Ordered Logit and compare with MNL.

In [None]:
# Exercise 2 Solution

# Step 1: Estimate Ordered Logit
# Same regressors as the MNL; `mode` is the outcome and worker_id is passed
# as the grouping variable.
y_ordered = data['mode'].values
X_ordered = data[exog_vars_mnl].values
groups_ordered = data['worker_id'].values

ordered_spec = {
    'endog': y_ordered,
    'exog': X_ordered,
    'groups': groups_ordered,
    'n_categories': 3,
}
model_ordered = OrderedLogit(**ordered_spec)
model_ordered.exog_names = exog_vars_mnl

results_ordered = model_ordered.fit()

rule = "=" * 60
print(rule, "ORDERED LOGIT", rule, sep="\n")
print(results_ordered.summary())

In [None]:
# Step 2: Compare AIC/BIC
print("\n=== Model Comparison ===")
print(f"{'Metric':>15s} {'Ordered Logit':>15s} {'MNL':>15s}")
print("-" * 48)

# Ordered model metrics
# Parameter count = slope coefficients + cutpoints; AIC and BIC are then
# computed by hand as 2k - 2*logL and ln(n)*k - 2*logL.
n_obs = len(y_ordered)
llf_ordered = results_ordered.llf
n_params_ordered = len(results_ordered.beta) + len(results_ordered.cutpoints)
aic_ordered = 2 * n_params_ordered - 2 * llf_ordered
bic_ordered = np.log(n_obs) * n_params_ordered - 2 * llf_ordered

for label, ordered_value, mnl_value in (
    ('Log-L', llf_ordered, results_mnl.llf),
    ('AIC', aic_ordered, results_mnl.aic),
    ('BIC', bic_ordered, results_mnl.bic),
):
    print(f"{label:>15s} {ordered_value:>15.2f} {mnl_value:>15.2f}")

# NOTE(review): len() equals the MNL parameter count only if `params` is a
# flat (1-D) container - confirm against the MultinomialLogit result type.
print(f"{'# Parameters':>15s} {n_params_ordered:>15d} {len(results_mnl.params):>15d}")

In [None]:
# Step 3: Discussion
# The written answers, emitted one line per entry.
discussion_lines = (
    "\n=== Answers ===",
    "",
    "1. Ordered Logit is more parsimonious (K+J-1 vs (J-1)*K parameters).",
    "   If BIC favors Ordered, it suggests the ordering assumption is reasonable.",
    "",
    "2. The Ordered model assumes a SINGLE index (latent flexibility preference)",
    "   with parallel shifts. This means the effect of each variable is the SAME",
    "   for the on-site->hybrid and hybrid->remote transitions.",
    "",
    "3. MNL allows different coefficients for hybrid and remote. If their",
    "   determinants truly differ (e.g., kids affects hybrid differently than",
    "   remote), MNL is more appropriate.",
    "",
    "4. For this data, the ordering (on-site < hybrid < remote) is plausible",
    "   in terms of flexibility, but the parallel regression assumption is",
    "   restrictive. A Brant test could formally assess this.",
)
print(*discussion_lines, sep="\n")

---

## Exercise 3: Heterogeneous Effects by Sector (Hard)

**Task**: Estimate the Pooled Logit separately for tech vs non-tech workers.
Are the determinants of remote work different by sector?

In [None]:
# Exercise 3 Solution

# Step 1: Split data
# Each subsample is selected by an explicit tech_job value and copied so it
# is independent of `data`.
tech_data = data.loc[data['tech_job'] == 1].copy()
nontech_data = data.loc[data['tech_job'] == 0].copy()

for prefix, frame in (
    ("Tech workers:     ", tech_data),
    ("Non-tech workers: ", nontech_data),
):
    print(f"{prefix}{frame['worker_id'].nunique()} workers, {len(frame)} obs")

print("\nP(remote):")
for prefix, frame in (
    ("  Tech:     ", tech_data),
    ("  Non-tech: ", nontech_data),
):
    print(f"{prefix}{frame['remote'].mean():.3f}")

In [None]:
# Step 2: Estimate separate models
# tech_job is constant within each subsample, so it is left out of the formula.
sector_formula = "remote ~ prod_remote + commute + kids + age + educ"

model_tech = PooledLogit(sector_formula, tech_data, "worker_id", "year")
results_tech = model_tech.fit(cov_type='cluster')

model_nontech = PooledLogit(sector_formula, nontech_data, "worker_id", "year")
results_nontech = model_nontech.fit(cov_type='cluster')

rule = "=" * 60
print(rule, "TECH WORKERS", rule, sep="\n")
print(results_tech.summary())

# The second banner is preceded by a blank line to separate the two summaries.
print("\n" + rule, "NON-TECH WORKERS", rule, sep="\n")
print(results_nontech.summary())

In [None]:
# Step 3: Compare coefficients side by side
# Prints the tech and non-tech pooled-logit coefficients with standard errors,
# then a z-test per regressor for equality of the two coefficients.
compare_vars = ['prod_remote', 'commute', 'kids', 'age', 'educ']

# Blank line, then a 70-character rule. The newline must be concatenated, not
# multiplied: "\n=" * 70 would print seventy one-character lines.
print("\n" + "=" * 70)
print("HETEROGENEOUS EFFECTS: Tech vs Non-Tech")
print("=" * 70)
print(f"\n{'Variable':>15s} {'Tech':>10s} {'(SE)':>8s} {'Non-Tech':>10s} {'(SE)':>8s} {'Diff':>10s}")
print("-" * 65)

# Look up each estimate once and reuse it for the table and for the tests.
sector_rows = []
for var in compare_vars:
    b_t = results_tech.params[var]
    se_t = results_tech.std_errors[var]
    b_nt = results_nontech.params[var]
    se_nt = results_nontech.std_errors[var]
    diff = b_t - b_nt
    sector_rows.append((var, b_t, se_t, b_nt, se_nt))
    print(f"{var:>15s} {b_t:>10.4f} {se_t:>8.4f} {b_nt:>10.4f} {se_nt:>8.4f} {diff:>+10.4f}")

# Informal test for equality. The two models are fit on disjoint subsamples,
# so the standard error of the difference is taken as sqrt(se_t^2 + se_nt^2),
# i.e. the two estimates are treated as independent.
print("\nInformal equality tests (z = diff / sqrt(se_t^2 + se_nt^2)):")
for var, b_t, se_t, b_nt, se_nt in sector_rows:
    z = (b_t - b_nt) / np.sqrt(se_t**2 + se_nt**2)
    # Two-sided p-value from the survival function, which stays accurate in
    # the far tail where 1 - cdf rounds to zero.
    p = 2 * norm.sf(abs(z))
    sig = '***' if p < 0.01 else '**' if p < 0.05 else '*' if p < 0.1 else ''
    print(f"  {var:>15s}: z = {z:>6.2f}, p = {p:.4f} {sig}")

In [None]:
# Step 4: Visualization
def _by_variable(values):
    """Return the entries of `values` for compare_vars, in order, as a list."""
    return [values[name] for name in compare_vars]


tech_coefs = _by_variable(results_tech.params)
tech_ses = _by_variable(results_tech.std_errors)
nontech_coefs = _by_variable(results_nontech.params)
nontech_ses = _by_variable(results_nontech.std_errors)

fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(compare_vars))
width = 0.35

# Side-by-side bars per regressor; error bars span +/- 1.96 standard errors.
sector_bars = (
    (x - width/2, tech_coefs, tech_ses, 'Tech Workers', '#3498db'),
    (x + width/2, nontech_coefs, nontech_ses, 'Non-Tech Workers', '#e74c3c'),
)
for positions, coefs, ses, label, color in sector_bars:
    ax.bar(positions, coefs, width, yerr=[1.96*s for s in ses],
           label=label, color=color, alpha=0.8, capsize=5)

ax.set_xticks(x)
ax.set_xticklabels(compare_vars, rotation=15)
ax.set_ylabel('Coefficient')
ax.set_title('Heterogeneous Effects: Tech vs Non-Tech Workers', fontweight='bold')
ax.legend()
ax.axhline(y=0, color='black', linestyle='--', alpha=0.5)  # zero reference line
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

answer_lines = (
    "\n=== Answers ===",
    "The determinants of remote work differ by sector:",
    "  - prod_remote may have a larger effect for non-tech workers",
    "    (tech workers already have high baseline remote capability)",
    "  - commute matters for both groups but may differ in magnitude",
    "  - These differences justify sector-specific policy recommendations",
)
print("\n".join(answer_lines))

---

## Exercise 4: Temporal Cross-Validation (Medium)

**Task**: Implement temporal cross-validation for the MNL model.
Train on 2019-2021, predict mode choice for 2022-2023.

In [None]:
# Exercise 4 Solution

# Step 1: Split data
# Rows up to and including 2021 form the training set; 2022 onward is held out.
year_col = data['year']
train_data = data.loc[year_col <= 2021].copy()
test_data = data.loc[year_col >= 2022].copy()

for prefix, frame in (("Training: ", train_data), ("Testing:  ", test_data)):
    years_present = sorted(frame['year'].unique())
    print(f"{prefix}{len(frame)} obs (years {years_present})")

In [None]:
# Step 2: Estimate MNL on training data
y_train = train_data['mode'].values
X_train = train_data[exog_vars_mnl].values

# Same specification as the full-sample MNL, fit on the training rows only.
train_spec = dict(endog=y_train, exog=X_train, base_alternative=0, method='pooled')
model_mnl_train = MultinomialLogit(**train_spec)
model_mnl_train.exog_names = exog_vars_mnl

np.random.seed(42)  # re-seed immediately before fitting
results_mnl_train = model_mnl_train.fit()

print("Training MNL:")
print(f"  Log-L: {results_mnl_train.llf:.2f}")
print(f"  Accuracy (in-sample): {results_mnl_train.accuracy:.3f}")

In [None]:
# Step 3: Predict on test data
y_test = test_data['mode'].values
X_test = test_data[exog_vars_mnl].values

# Predict probabilities using trained model
fitted_params = results_mnl_train.params
test_probs = model_mnl_train.predict_proba(fitted_params, X_test)
# The predicted mode is the column with the highest probability
# (assumes columns are ordered by mode code 0/1/2 - TODO confirm).
test_pred = np.argmax(test_probs, axis=1)

# Accuracy
accuracy_oos = (test_pred == y_test).mean()
accuracy_is = results_mnl_train.accuracy
print(f"\nOut-of-sample accuracy (2022-2023): {accuracy_oos:.3f}")
print(f"In-sample accuracy (2019-2021):     {accuracy_is:.3f}")

In [None]:
# Step 4: Confusion matrix
# Rows index the true mode, columns the predicted mode.
n_modes = 3
mode_names = [MODE_LABELS[code] for code in range(n_modes)]

conf_matrix = np.zeros((n_modes, n_modes), dtype=int)
for actual, predicted in zip(y_test, test_pred):
    conf_matrix[int(actual), int(predicted)] += 1

conf_df = pd.DataFrame(
    conf_matrix,
    index=[f'True {label}' for label in mode_names],
    columns=[f'Pred {label}' for label in mode_names],
)

print("\nOut-of-sample Confusion Matrix:")
print(conf_df)

# Per-class accuracy
print("\nPer-class accuracy:")
for code in range(n_modes):
    is_class = y_test == code
    if not is_class.any():
        continue  # no test observations in this class
    class_acc = np.mean(test_pred[is_class] == code)
    print(f"  {MODE_LABELS[code]:>10s}: {class_acc:.3f} ({is_class.sum()} obs)")

# Visualize
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=mode_names,
    yticklabels=mode_names,
    ax=ax,
)
ax.set_xlabel('Predicted Mode')
ax.set_ylabel('Actual Mode')
ax.set_title(f'Out-of-sample Confusion Matrix (Accuracy: {accuracy_oos:.1%})',
             fontweight='bold')
plt.tight_layout()
plt.show()

print("\n=== Answers ===")
print(f"1. Out-of-sample accuracy ({accuracy_oos:.1%}) vs in-sample ({results_mnl_train.accuracy:.1%})")
print("   There is some drop, which is expected.")
print("2. The model may struggle with on-site predictions since")
print("   on-site share declined post-pandemic (distribution shift).")
print("3. This suggests the model captures persistent relationships")
print("   but may miss temporal shifts not captured by covariates.")

---

## Exercise 5: Executive Report (Hard)

**Task**: Generate a comprehensive HTML report for non-technical managers.

In [None]:
# Exercise 5 Solution

# Step 1: Select top 3 models
# For executives: Pooled Logit (simple), MNL (comprehensive), Dynamic (policy)

# Step 2: Compute counterfactual for commute increase
def _mean_shares(probs):
    """Average each probability column and key the result by mode label."""
    return {MODE_LABELS[j]: probs[:, j].mean() for j in range(3)}


# Baseline regressors, plus a copy with 20 added to the commute column for
# every row.
X_base = data[exog_vars_mnl].values.copy()
commute_idx = exog_vars_mnl.index('commute')
X_commute_up = X_base.copy()
X_commute_up[:, commute_idx] += 20

p_base = results_mnl.predict_proba(X_base)
p_commute = results_mnl.predict_proba(X_commute_up)

base_shares = _mean_shares(p_base)
commute_shares = _mean_shares(p_commute)
changes = {
    label: commute_shares[label] - share
    for label, share in base_shares.items()
}

# Key coefficients from pooled logit
key_coefs = {
    name: {
        'coef': results_pooled.params[name],
        'se': results_pooled.std_errors[name],
    }
    for name in ('prod_remote', 'commute', 'kids', 'tech_job')
}

In [None]:
# Step 3: Create visualization for report
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Panel A: Current mode distribution
mode_counts = data['mode'].value_counts().sort_index()
labels_list = [MODE_LABELS[code] for code in mode_counts.index]
colors_list = [MODE_COLORS[code] for code in mode_counts.index]
pie_ax.pie(
    mode_counts,
    labels=labels_list,
    colors=colors_list,
    autopct='%1.1f%%',
    startangle=90,
    pctdistance=0.85,
)
pie_ax.set_title('Current Work Mode Distribution', fontweight='bold')

# Panel B: Counterfactual
# Baseline and +20 commute shares drawn as paired bars, one pair per mode.
mode_names = [MODE_LABELS[j] for j in range(3)]
x = np.arange(3)
width = 0.35
base_vals = [base_shares[name] for name in mode_names]
cf_vals = [commute_shares[name] for name in mode_names]
for positions, heights, label, color in (
    (x - width/2, base_vals, 'Current', '#3498db'),
    (x + width/2, cf_vals, 'If Commute +20 min', '#e74c3c'),
):
    bar_ax.bar(positions, heights, width, label=label, color=color, alpha=0.8)
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(mode_names)
bar_ax.set_ylabel('Share of Workers')
bar_ax.set_title('Impact of 20-min Commute Increase', fontweight='bold')
bar_ax.legend()
bar_ax.grid(True, alpha=0.3)

plt.tight_layout()
figure_path = FIG_DIR / '09_executive_summary.png'
plt.savefig(figure_path, dpi=150, bbox_inches='tight')
plt.show()
print("Figure saved.")

In [None]:
# Step 4: Generate HTML report
# The page is a single f-string: literal CSS braces are doubled ({{ }}) and
# the worker count, mode shares, and counterfactual shares are interpolated.
# The document declares UTF-8 and is written as UTF-8 below, so the output is
# the same on every platform instead of following the locale's encoding.
report = f"""<!DOCTYPE html>
<html>
<head><meta charset="utf-8"><title>Work Mode Analysis - Executive Summary</title>
<style>
body {{font-family: 'Segoe UI', Tahoma, sans-serif; margin: 40px; max-width: 800px;
       line-height: 1.6; color: #333;}}
table {{border-collapse: collapse; margin: 20px 0; width: 100%;}}
th, td {{border: 1px solid #ddd; padding: 10px; text-align: center;}}
th {{background-color: #2c3e50; color: white;}}
tr:nth-child(even) {{background-color: #f8f9fa;}}
h1 {{color: #2c3e50; border-bottom: 3px solid #3498db; padding-bottom: 10px;}}
h2 {{color: #3498db;}}
.key-finding {{background-color: #e8f5e9; padding: 15px; border-left: 4px solid #27ae60;
              margin: 15px 0; border-radius: 4px;}}
.warning {{background-color: #fff3e0; padding: 15px; border-left: 4px solid #f39c12;
          margin: 15px 0; border-radius: 4px;}}
.metric {{display: inline-block; text-align: center; padding: 15px 25px;
         margin: 5px; background: #f8f9fa; border-radius: 8px;}}
.metric-value {{font-size: 24px; font-weight: bold; color: #2c3e50;}}
.metric-label {{font-size: 12px; color: #7f8c8d;}}
</style></head>
<body>

<h1>Work Mode Choice: Executive Summary</h1>
<p><em>Analysis of {data['worker_id'].nunique():,} workers across {len(data['year'].unique())} years (2019-2023)</em></p>

<div style="text-align: center; margin: 30px 0;">
  <div class="metric">
    <div class="metric-value">{(data['mode']==2).mean():.0%}</div>
    <div class="metric-label">Workers Remote</div>
  </div>
  <div class="metric">
    <div class="metric-value">{(data['mode']==1).mean():.0%}</div>
    <div class="metric-label">Workers Hybrid</div>
  </div>
  <div class="metric">
    <div class="metric-value">{(data['mode']==0).mean():.0%}</div>
    <div class="metric-label">Workers On-site</div>
  </div>
</div>

<h2>What Drives Remote Work?</h2>
<table>
  <tr><th>Factor</th><th>Effect</th><th>Significance</th></tr>
  <tr><td>Remote Productivity</td><td>Strong positive</td><td>Very High</td></tr>
  <tr><td>Commute Time</td><td>Positive (longer = more remote)</td><td>High</td></tr>
  <tr><td>Tech Sector</td><td>Strong positive</td><td>Very High</td></tr>
  <tr><td>Education</td><td>Positive</td><td>Moderate</td></tr>
  <tr><td>Age</td><td>Slight negative</td><td>Low-Moderate</td></tr>
</table>

<div class="key-finding">
<h3>Key Finding: The Pandemic Created a Lasting Shift</h3>
<p>Our dynamic analysis shows that <strong>workers who experienced remote work
are significantly more likely to continue working remotely</strong>. This suggests
the pandemic didn't just temporarily change behavior &mdash; it changed preferences.
The experience of working remotely made workers value it more.</p>
</div>

<h2>What If Commutes Got Longer?</h2>
<p>If commute times increased by 20 minutes for all workers:</p>
<table>
  <tr><th>Mode</th><th>Current Share</th><th>New Share</th><th>Change</th></tr>
  <tr><td>On-site</td><td>{base_shares['On-site']:.1%}</td><td>{commute_shares['On-site']:.1%}</td>
      <td style="color: {'red' if changes['On-site'] < 0 else 'green'}">{changes['On-site']:+.1%}</td></tr>
  <tr><td>Hybrid</td><td>{base_shares['Hybrid']:.1%}</td><td>{commute_shares['Hybrid']:.1%}</td>
      <td style="color: {'red' if changes['Hybrid'] < 0 else 'green'}">{changes['Hybrid']:+.1%}</td></tr>
  <tr><td>Remote</td><td>{base_shares['Remote']:.1%}</td><td>{commute_shares['Remote']:.1%}</td>
      <td style="color: {'red' if changes['Remote'] < 0 else 'green'}">{changes['Remote']:+.1%}</td></tr>
</table>

<div class="warning">
<h3>Recommendations</h3>
<ul>
  <li><strong>Invest in remote tools</strong>: Remote productivity is the #1 factor</li>
  <li><strong>Allow trial periods</strong>: Experience creates lasting adoption</li>
  <li><strong>Plan for hybrid</strong>: Most workers prefer hybrid over fully remote</li>
  <li><strong>Sector matters</strong>: Tech and non-tech workers need different approaches</li>
</ul>
</div>

<p style="color: #999; font-size: 11px; margin-top: 40px;">
Analysis performed with PanelBox using 7 econometric models. 
Results are statistically robust across specifications.
</p>

</body></html>"""

# Explicit encoding: without it, open() uses the platform's locale encoding,
# which would not be guaranteed to match the charset declared in the page.
with open(REPORT_DIR / '09_executive_report.html', 'w', encoding='utf-8') as f:
    f.write(report)

print("Executive report saved to outputs/reports/09_executive_report.html")
print("\nThe report includes:")
print("  - Key metrics dashboard")
print("  - Factor importance table (simple language)")
print("  - Pandemic lasting shift finding")
print("  - Commute counterfactual scenario")
print("  - Actionable recommendations")

---

**End of Solutions**