# Texas Political Candidate Performance Analysis

This notebook analyzes how Texas political candidates perform relative to expectations based on:
- Top-of-ticket race performance (Presidential/Senate/Governor races by district)
- Year-over-year performance trends
- District demographics
- Incumbency advantage
- Turnout rates

The model benchmarks candidates against top-level races and tracks performance across multiple election cycles.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Notebook-wide plotting defaults: seaborn's white-grid theme and a 12x6 inch
# default figure size for every figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Data Setup

Create a sample dataset covering multiple election years, with a top-of-ticket race for each year, for Texas legislative districts.

In [None]:
# Synthetic stand-in for real election returns - replace with your actual data.
# The seed is fixed so every run reproduces the same sample; because of that,
# the ORDER of the np.random calls below determines every generated value.
np.random.seed(42)

# Election cycles to simulate and number of Texas House districts to analyze
years = [2018, 2020, 2022, 2024]
n_districts = 50

data_records = []


def _draw_district_profile():
    """Draw one district's partisan lean and demographic profile.

    The fields are drawn in the order they are listed, so the seeded random
    stream is consumed in a fixed sequence for every district.
    """
    return {
        'base_partisan_lean': np.random.uniform(0.25, 0.75),
        'pct_hispanic': np.random.uniform(0.10, 0.60),
        'pct_black': np.random.uniform(0.05, 0.40),
        'pct_white': np.random.uniform(0.30, 0.80),
        'pct_asian': np.random.uniform(0.02, 0.15),
        'median_income': np.random.randint(40000, 120000),
        'pct_college_degree': np.random.uniform(0.15, 0.55),
        'urban_rural_score': np.random.uniform(1, 10),
    }


# District-level baseline, drawn once and reused unchanged for every year
district_demographics = {
    f'HD-{number}': _draw_district_profile()
    for number in range(1, n_districts + 1)
}

# Top-of-ticket race used as the benchmark in each cycle
top_races = {
    2018: {'type': 'Senate', 'name': "Cruz vs O'Rourke", 'statewide_dem': 0.483},
    2020: {'type': 'President', 'name': 'Trump vs Biden', 'statewide_dem': 0.465},
    2022: {'type': 'Governor', 'name': "Abbott vs O'Rourke", 'statewide_dem': 0.436},
    2024: {'type': 'President', 'name': 'Trump vs Harris', 'statewide_dem': 0.427},
}

# Demographic fields copied from the district baseline into every record
demographic_fields = (
    'pct_hispanic', 'pct_black', 'pct_white', 'pct_asian',
    'median_income', 'pct_college_degree', 'urban_rural_score',
)

# One record per (year, district)
for year in years:
    top_race = top_races[year]

    # Presidential cycles get a higher baseline turnout than the other cycles
    base_turnout = 0.50 if year in (2020, 2024) else 0.42

    for dist_id, demographics in district_demographics.items():
        # District's top-of-ticket Democratic share: partisan lean plus
        # year-specific noise, kept inside [0.15, 0.85]
        district_top_race_dem = np.clip(
            demographics['base_partisan_lean'] + np.random.normal(0, 0.03),
            0.15, 0.85,
        )

        # Candidate info (some repeat across years)
        district_num = int(dist_id.partition('-')[2])
        is_incumbent = np.random.choice([True, False], p=[0.70, 0.30])

        # Candidate share tracks the top race with extra noise; incumbents
        # additionally receive a 2-6 point bonus. The bonus is only drawn for
        # incumbents, exactly as the random stream requires.
        candidate_noise = np.random.normal(0, 0.06)
        incumbency_bonus = np.random.uniform(0.02, 0.06) if is_incumbent else 0.0
        candidate_vote_pct = np.clip(
            district_top_race_dem + candidate_noise + incumbency_bonus,
            0.15, 0.85,
        )

        # Turnout and raw vote counts
        turnout_rate = base_turnout + np.random.uniform(-0.08, 0.08)
        registered_voters = np.random.randint(60000, 140000)
        total_votes = int(registered_voters * turnout_rate)
        candidate_votes = int(total_votes * candidate_vote_pct)

        record = {
            'year': year,
            'district_id': dist_id,
            'candidate_name': f'Candidate_{district_num}_{year}',
            'party': 'D',  # Focusing on Democratic performance
            'is_incumbent': is_incumbent,

            # Top-level race info
            'top_race_type': top_race['type'],
            'top_race_name': top_race['name'],
            'statewide_top_race_dem_pct': top_race['statewide_dem'],
            'district_top_race_dem_pct': district_top_race_dem,

            # Candidate performance
            'candidate_votes': candidate_votes,
            'total_votes': total_votes,
            'candidate_vote_pct': candidate_vote_pct,

            # Turnout
            'registered_voters': registered_voters,
            'turnout_rate': turnout_rate,
        }
        # Demographics (from baseline), appended in a fixed column order
        record.update({field: demographics[field] for field in demographic_fields})
        data_records.append(record)

df = pd.DataFrame(data_records)

print(f"Dataset created with {len(df)} observations")
print(f"Years: {sorted(df['year'].unique())}")
print(f"Districts: {n_districts}")
print(f"Observations per year: {df['year'].value_counts().to_dict()}")
print("\nTop-level races by year:")
for year in years:
    # Every row of a given year carries the same top-race fields; use the first
    race = df.loc[df['year'] == year].iloc[0]
    print(f"  {year}: {race['top_race_name']} (Statewide D: {race['statewide_top_race_dem_pct']:.1%})")

df.head(10)

## 2. Expected Performance Model

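Build a linear regression model that predicts each candidate's expected vote share from the top-of-ticket result and district characteristics. The model is fitted and evaluated on the same observations, so the reported R² and error are in-sample figures.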

In [None]:
# The regression needs numeric inputs, so the boolean incumbency flag is
# mirrored into an integer column that is used as the feature.
df['is_incumbent_int'] = df['is_incumbent'].astype(int)

# Predictors of the candidate's vote share
feature_cols = [
    'district_top_race_dem_pct',  # Primary predictor
    'statewide_top_race_dem_pct',
    'is_incumbent_int',
    'turnout_rate',
    'pct_hispanic',
    'pct_black',
    'pct_college_degree',
    'median_income',
    'urban_rural_score',
]

X = df[feature_cols]
y = df['candidate_vote_pct']

# Ordinary least squares fit on the full dataset
model = LinearRegression()
model.fit(X, y)

# Expected share and the derived performance measures:
#   performance_vs_expected     - actual minus model prediction (the residual)
#   performance_vs_top_race     - actual minus the district's top-of-ticket share
#   performance_vs_expected_pct - residual as a percentage of the prediction
df['expected_vote_pct'] = model.predict(X)
df['performance_vs_expected'] = y - df['expected_vote_pct']
df['performance_vs_top_race'] = y - df['district_top_race_dem_pct']
df['performance_vs_expected_pct'] = 100 * (df['performance_vs_expected'] / df['expected_vote_pct'])

# Fit quality, computed on the same rows the model was fitted on (in-sample)
r2 = r2_score(y, df['expected_vote_pct'])
mae = mean_absolute_error(y, df['expected_vote_pct'])

print(f"Model R²: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f} ({mae*100:.2f} percentage points)")
print("\nModel Coefficients:")
for feature_name, weight in zip(feature_cols, model.coef_):
    print(f"  {feature_name:35s}: {weight:7.4f}")
print(f"  {'Intercept':35s}: {model.intercept_:7.4f}")

## 3. Year-by-Year Analysis

Compare performance trends across election cycles.

In [None]:
# Per-year aggregate table (means / standard deviations and incumbent counts).
# It is kept for inspection; the printed report below computes its own values
# and does not read from this table.
yearly_stats = (
    df.groupby('year')
    .agg({
        'candidate_vote_pct': ['mean', 'std'],
        'district_top_race_dem_pct': 'mean',
        'performance_vs_top_race': ['mean', 'std'],
        'performance_vs_expected': ['mean', 'std'],
        'turnout_rate': 'mean',
        'is_incumbent': lambda x: x.sum()
    })
    .round(4)
)

divider = "=" * 100

print("YEAR-BY-YEAR PERFORMANCE SUMMARY")
print(divider)
# groupby yields the years in ascending order, one sub-frame per cycle
for year, year_data in df.groupby('year'):
    # Top-race fields are identical on every row of a year; read the first row
    top_race = year_data.iloc[0]

    avg_candidate = year_data['candidate_vote_pct'].mean()
    avg_top_race = year_data['district_top_race_dem_pct'].mean()
    avg_vs_top = year_data['performance_vs_top_race'].mean()
    avg_vs_expected = year_data['performance_vs_expected'].mean()
    turnout = year_data['turnout_rate'].mean()
    n_incumbent = year_data['is_incumbent'].sum()

    report_lines = [
        f"\n{year} - {top_race['top_race_name']}",
        f"  Statewide Top Race (D):     {top_race['statewide_top_race_dem_pct']:.1%}",
        f"  Avg District Top Race (D):  {avg_top_race:.1%}",
        f"  Avg Candidate Performance:  {avg_candidate:.1%}",
        f"  Avg vs Top Race:            {avg_vs_top:+.2%}",
        f"  Avg vs Model Expected:      {avg_vs_expected:+.2%}",
        f"  Turnout Rate:               {turnout:.1%}",
        f"  Incumbents:                 {n_incumbent}/{len(year_data)}",
    ]
    for line in report_lines:
        print(line)

print("\n" + divider)

## 4. Performance Classification by Year

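Classify each candidate as a strong or moderate overperformer, a strong or moderate underperformer, or "as expected". The classification uses the z-score of the candidate's performance versus the model's expectation, standardized within each election year: beyond ±1.5 standard deviations is "strong", beyond ±0.5 is "moderate", and anything in between is "as expected".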

In [None]:
def _standardize(values):
    """Return ``values`` centered on their mean and scaled by their standard deviation."""
    return (values - values.mean()) / values.std()


# Z-score of performance vs expected, standardized separately within each year
# so that candidates are compared only against others from the same cycle.
df['performance_zscore'] = (
    df.groupby('year')['performance_vs_expected'].transform(_standardize)
)

# Classify performance
def classify_performance(zscore):
    """Map a performance z-score to a descriptive category label.

    Scores within 0.5 standard deviations of zero (bounds included) are
    'As Expected'. Beyond that the label is 'Moderate' up to and including
    1.5 standard deviations and 'Strong' past it, suffixed with
    'Overperformer' for positive scores and 'Underperformer' for negative
    ones. A NaN score fails every comparison and is reported as
    'As Expected'.
    """
    magnitude = abs(zscore)
    # Written as "not greater than" so that NaN also lands in this branch
    if not magnitude > 0.5:
        return 'As Expected'

    strength = 'Strong' if magnitude > 1.5 else 'Moderate'
    direction = 'Overperformer' if zscore > 0 else 'Underperformer'
    return f'{strength} {direction}'

# Label every candidate from their within-year z-score
df['performance_category'] = df['performance_zscore'].map(classify_performance)

separator = "=" * 100

# Category counts plus mean / spread of the residual, one section per year
print("PERFORMANCE DISTRIBUTION BY YEAR")
print(separator)
for year, year_df in df.groupby('year'):
    residuals = year_df['performance_vs_expected']
    category_counts = year_df['performance_category'].value_counts().sort_index()

    print(f"\n{year}:")
    print(category_counts)
    print(f"  Mean performance vs expected: {residuals.mean():+.4f}")
    print(f"  Std dev: {residuals.std():.4f}")

# Same breakdown pooled over all years
print("\n" + separator)
print("\nOVERALL PERFORMANCE DISTRIBUTION:")
print(df['performance_category'].value_counts().sort_index())

## 5. Incumbency Advantage Analysis by Year

In [None]:
# Calculate incumbency advantage by year.
#
# The advantage is measured on `performance_vs_top_race` (candidate share minus
# the district's top-of-ticket share), NOT on the regression residual
# `performance_vs_expected`. The expected-performance model includes the
# incumbency indicator as a feature, and least-squares residuals are orthogonal
# to every regressor and to the intercept. Over the fitted sample the mean
# residual of incumbents and of challengers is therefore zero by construction,
# so comparing residuals would always report an advantage of ~0 pp.
incumbency_metric = 'performance_vs_top_race'

print("INCUMBENCY ADVANTAGE BY YEAR")
print("(candidate vote share minus district top-of-ticket share)")
print("=" * 100)

for year in sorted(df['year'].unique()):
    year_df = df[df['year'] == year]
    year_is_incumbent = year_df['is_incumbent'].astype(bool)
    incumbent_perf = year_df.loc[year_is_incumbent, incumbency_metric]
    challenger_perf = year_df.loc[~year_is_incumbent, incumbency_metric]

    incumbent_avg = incumbent_perf.mean()
    challenger_avg = challenger_perf.mean()
    advantage = incumbent_avg - challenger_avg

    print(f"\n{year}:")
    print(f"  Incumbents:  {incumbent_avg:+.4f} ({incumbent_avg*100:+.2f} pp)")
    print(f"  Challengers: {challenger_avg:+.4f} ({challenger_avg*100:+.2f} pp)")
    print(f"  Advantage:   {advantage:+.4f} ({advantage*100:+.2f} pp)")

    # Two-sample t-test; skipped when either group has fewer than 2 candidates
    if len(incumbent_perf) > 1 and len(challenger_perf) > 1:
        t_stat, p_value = stats.ttest_ind(incumbent_perf, challenger_perf)
        print(f"  T-test: t={t_stat:.3f}, p={p_value:.4f}")

# Overall incumbency advantage, pooled over all years
all_is_incumbent = df['is_incumbent'].astype(bool)
overall_inc_avg = df.loc[all_is_incumbent, incumbency_metric].mean()
overall_chal_avg = df.loc[~all_is_incumbent, incumbency_metric].mean()
overall_advantage = overall_inc_avg - overall_chal_avg

print("\n" + "=" * 100)
print(f"\nOVERALL INCUMBENCY ADVANTAGE: {overall_advantage:+.4f} ({overall_advantage*100:+.2f} pp)")

## 6. Visualizations

In [None]:
# Year-by-year comparison: four panels summarising performance and turnout
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Election years present in the data, reused for ticks and labels so the
# x-axes show the actual cycles instead of auto-generated fractional years
year_labels = sorted(df['year'].unique())

# Average performance by year
yearly_perf = df.groupby('year').agg({
    'performance_vs_top_race': 'mean',
    'performance_vs_expected': 'mean',
    'turnout_rate': 'mean'
})

# Panel 1: mean candidate share minus top-of-ticket share, in percentage points
axes[0, 0].bar(yearly_perf.index, yearly_perf['performance_vs_top_race'] * 100, color='steelblue')
axes[0, 0].axhline(0, color='red', linestyle='--', alpha=0.5)
axes[0, 0].set_xticks(year_labels)
axes[0, 0].set_xlabel('Year')
axes[0, 0].set_ylabel('Average Performance vs Top Race (pp)')
axes[0, 0].set_title('Average Candidate Performance vs Top-of-Ticket by Year')
axes[0, 0].grid(axis='y', alpha=0.3)

# Panel 2: turnout trend
axes[0, 1].plot(yearly_perf.index, yearly_perf['turnout_rate'] * 100, marker='o', linewidth=2, color='darkgreen')
axes[0, 1].set_xticks(year_labels)
axes[0, 1].set_xlabel('Year')
axes[0, 1].set_ylabel('Average Turnout Rate (%)')
axes[0, 1].set_title('Turnout Trends by Election Year')
axes[0, 1].grid(alpha=0.3)

# Panel 3: overlaid histograms of performance vs expected, one per year
for year in year_labels:
    year_data = df[df['year'] == year]['performance_vs_expected'] * 100
    axes[1, 0].hist(year_data, alpha=0.5, bins=20, label=str(year))

axes[1, 0].axvline(0, color='red', linestyle='--', linewidth=2, label='Expected')
axes[1, 0].set_xlabel('Performance vs Expected (pp)')
axes[1, 0].set_ylabel('Number of Candidates')
axes[1, 0].set_title('Distribution of Performance Deviations by Year')
axes[1, 0].legend()

# Panel 4: box plot by year. Tick labels are set after drawing rather than via
# boxplot's `labels=` keyword, which Matplotlib 3.9 deprecated in favour of
# `tick_labels=`; set_xticklabels works on both older and newer versions.
year_groups = [df[df['year'] == year]['performance_vs_expected'] * 100 for year in year_labels]
axes[1, 1].boxplot(year_groups)
axes[1, 1].set_xticklabels(year_labels)
axes[1, 1].axhline(0, color='red', linestyle='--', alpha=0.5)
axes[1, 1].set_xlabel('Year')
axes[1, 1].set_ylabel('Performance vs Expected (pp)')
axes[1, 1].set_title('Performance Variability by Year')
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Candidate vs Top Race Performance: one scatter panel per election year
years_list = sorted(df['year'].unique())

# Two panels per row, with as many rows as the number of years requires
# (a 2x2 grid for four years). squeeze=False keeps `axes` two-dimensional
# even when there is a single row.
n_cols = 2
n_rows = max(1, -(-len(years_list) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)

# Shared axis limits in percent: at least 20-80, widened to the nearest
# multiple of 5 when the data extends further, so no point is cut off.
share_cols = ['district_top_race_dem_pct', 'candidate_vote_pct']
axis_min = min(20, np.floor(df[share_cols].min().min() * 100 / 5) * 5)
axis_max = max(80, np.ceil(df[share_cols].max().max() * 100 / 5) * 5)

for ax, year in zip(axes.flat, years_list):
    year_df = df[df['year'] == year]
    incumbent_mask = year_df['is_incumbent'].astype(bool)

    # Incumbents and challengers are drawn as separate, labelled series so the
    # legend entries match the colours. (Passing a bare list of labels to
    # legend() assigns them to artists in draw order, which mislabels a single
    # multi-coloured scatter and the reference line.)
    for mask, color, label in (
        (~incumbent_mask, 'red', 'Challenger'),
        (incumbent_mask, 'blue', 'Incumbent'),
    ):
        group = year_df[mask]
        ax.scatter(group['district_top_race_dem_pct'] * 100,
                   group['candidate_vote_pct'] * 100,
                   alpha=0.6, c=color, label=label)

    # 45-degree line: candidate share equal to the top-of-ticket share
    ax.plot([axis_min, axis_max], [axis_min, axis_max], 'k--', lw=2, label='Equal performance')

    # Top-race fields are identical on every row of a year; read the first row
    top_race = year_df.iloc[0]
    ax.set_xlabel(f'{top_race["top_race_type"]} Race (D %)')
    ax.set_ylabel('Candidate Vote (%)')
    ax.set_title(f'{year} - {top_race["top_race_name"]}')
    ax.legend()
    ax.grid(alpha=0.3)
    ax.set_xlim(axis_min, axis_max)
    ax.set_ylim(axis_min, axis_max)

# Hide any unused panel when the number of years is odd
for unused_ax in axes.flat[len(years_list):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Incumbency advantage over time
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

# The gap is measured on candidate share minus top-of-ticket share. The
# regression residual (`performance_vs_expected`) is unsuitable here: the model
# already includes the incumbency indicator, so its residuals carry no
# systematic incumbent/challenger difference and the bars would only show noise.
plot_metric = 'performance_vs_top_race'
plot_years = sorted(df['year'].unique())

incumbency_by_year = []
for year in plot_years:
    year_df = df[df['year'] == year]
    is_inc = year_df['is_incumbent'].astype(bool)
    inc_avg = year_df.loc[is_inc, plot_metric].mean()
    chal_avg = year_df.loc[~is_inc, plot_metric].mean()
    # Incumbent-minus-challenger gap, converted to percentage points
    incumbency_by_year.append((inc_avg - chal_avg) * 100)

ax.bar(plot_years, incumbency_by_year, color='purple', alpha=0.7)
ax.axhline(0, color='red', linestyle='--', alpha=0.5)
# Tick only the actual election years (avoids fractional-year ticks)
ax.set_xticks(plot_years)
ax.set_xlabel('Year')
ax.set_ylabel('Incumbency Advantage (pp)')
ax.set_title('Incumbency Advantage Over Time (vs Top-of-Ticket)')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Top Overperformers and Underperformers by Year

In [None]:
# Columns shown in each ranking table, and the ones rendered as percentages
ranking_cols = [
    'district_id', 'candidate_name', 'candidate_vote_pct', 'district_top_race_dem_pct',
    'performance_vs_top_race', 'expected_vote_pct', 'performance_vs_expected', 'is_incumbent'
]
pct_display_cols = [
    'candidate_vote_pct', 'district_top_race_dem_pct', 'performance_vs_top_race',
    'expected_vote_pct', 'performance_vs_expected'
]


def _as_display_table(ranked_rows):
    """Return a copy of ``ranked_rows`` limited to the ranking columns, with
    the share columns formatted as one-decimal percentage strings."""
    table = ranked_rows[ranking_cols].copy()
    for col in pct_display_cols:
        table[col] = table[col].map("{:.1%}".format)
    return table


banner = '=' * 100

# Five largest and five smallest residuals (actual minus expected) per year
for year, year_df in df.groupby('year'):
    # Top-race fields are identical on every row of a year; read the first row
    top_race = year_df.iloc[0]

    print(f"\n{banner}")
    print(f"{year} - {top_race['top_race_name']}")
    print(banner)

    print("\nTOP 5 OVERPERFORMERS:")
    top_5 = _as_display_table(year_df.nlargest(5, 'performance_vs_expected'))
    print(top_5.to_string(index=False))

    print("\nTOP 5 UNDERPERFORMERS:")
    bottom_5 = _as_display_table(year_df.nsmallest(5, 'performance_vs_expected'))
    print(bottom_5.to_string(index=False))

## 8. Export Results

In [None]:
from pathlib import Path

# Select key columns for export
export_cols = [
    'year', 'district_id', 'candidate_name', 'party', 'is_incumbent',
    'top_race_type', 'top_race_name', 'statewide_top_race_dem_pct', 'district_top_race_dem_pct',
    'candidate_vote_pct', 'candidate_votes', 'total_votes', 'turnout_rate',
    'expected_vote_pct', 'performance_vs_top_race', 'performance_vs_expected',
    'performance_vs_expected_pct', 'performance_zscore', 'performance_category',
    'pct_hispanic', 'pct_black', 'pct_college_degree', 'median_income', 'urban_rural_score'
]

df_export = df[export_cols].copy()

# Sort by year (ascending), then best-to-worst performance within each year
df_export = df_export.sort_values(['year', 'performance_vs_expected'], ascending=[True, False])

# Save to CSV. to_csv does not create missing directories, so the output
# folder is created first; otherwise the write fails on a fresh environment.
output_path = Path('/mnt/user-data/outputs/texas_candidate_analysis_by_year.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_export.to_csv(output_path, index=False)

# Report the full destination so the file can actually be located
print(f"Results exported to {output_path}")
print(f"\nExported {len(df_export)} candidate records across {df_export['year'].nunique()} years")

## 9. Summary Report

In [None]:
# Final text report. It reads `df` plus the `r2`, `mae` and `overall_advantage`
# values computed in the earlier model and incumbency cells.
report_rule = "=" * 100
report_years = sorted(df['year'].unique())

print(report_rule)
print("TEXAS POLITICAL CANDIDATE PERFORMANCE ANALYSIS - SUMMARY")
print(report_rule)
print(f"\nAnalysis Period: {df['year'].min()} - {df['year'].max()}")
print(f"Total Observations: {len(df)}")
print(f"Districts Analyzed: {df['district_id'].nunique()}")
print(f"Years Analyzed: {len(report_years)}")

print(f"\n{'ELECTION CYCLES ANALYZED':^100}")
for year in report_years:
    # Top-race fields are identical on every row of a year; read the first row
    year_data = df[df['year'] == year].iloc[0]
    print(f"  {year}: {year_data['top_race_name']:40s} (Statewide D: {year_data['statewide_top_race_dem_pct']:.1%})")

print(f"\n{'MODEL PERFORMANCE':^100}")
print(f"  R² Score: {r2:.4f}")
print(f"  Mean Absolute Error: {mae*100:.2f} percentage points")

print(f"\n{'OVERALL INCUMBENCY ADVANTAGE':^100}")
print(f"  Average: {overall_advantage*100:+.2f} percentage points")

print(f"\n{'PERFORMANCE BY YEAR':^100}")
for year in report_years:
    year_data = df[df['year'] == year]
    avg_perf = year_data['performance_vs_expected'].mean()
    std_perf = year_data['performance_vs_expected'].std()
    print(f"  {year}: Mean {avg_perf:+.4f} ({avg_perf*100:+.2f} pp), StdDev {std_perf:.4f}")

# Presidential cycles and their turnout are read from the data instead of
# being hard-coded, so the finding stays correct when the dataset changes.
is_presidential = df['top_race_type'] == 'President'
presidential_years = sorted(df.loc[is_presidential, 'year'].unique())

print(f"\n{'KEY FINDINGS':^100}")
print("  - Top-of-ticket performance is the strongest predictor of down-ballot success")
# overall_advantage is the incumbent-minus-challenger gap, so it is described
# as a comparison between the two groups rather than as "better than expected".
print(f"  - Incumbents outperform challengers by {overall_advantage*100:+.2f} pp on average")
if is_presidential.any() and (~is_presidential).any():
    presidential_turnout = df.loc[is_presidential, 'turnout_rate'].mean()
    other_turnout = df.loc[~is_presidential, 'turnout_rate'].mean()
    presidential_label = ', '.join(str(y) for y in presidential_years)
    print(f"  - Presidential years ({presidential_label}) averaged {presidential_turnout:.1%} turnout "
          f"vs {other_turnout:.1%} in other years")
print(f"  - Candidate quality and local factors create {mae*100:.2f} pp average deviation from model")
print(report_rule)