usa state wise question 500_ 100

In [None]:
# Mount Google Drive
# The Drive is mounted at /content/drive; the results workbook written later
# in this cell lives under that mount point (/content/drive/MyDrive/...).
from google.colab import drive
drive.mount('/content/drive')

# Required Libraries
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Upload survey file
# `uploaded` is indexed by key below; the first key is used as the path of
# the workbook to read.
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
    # message instead of an IndexError on the first-key lookup.
    raise RuntimeError("No file was uploaded; cannot load survey data.")
file_path = next(iter(uploaded))

# Load survey data
survey_df = pd.read_excel(file_path, sheet_name='Sheet1')
# Normalise headers (trim + lowercase) so the lookups below do not depend on
# stray whitespace or capitalisation in the spreadsheet.
survey_df.columns = survey_df.columns.str.strip().str.lower()

# Define columns
# Survey questions that are modelled, one model per (state, question).
raw_target_columns = [
    'Which party or candidate do you currently intend to vote for in the upcoming election?',
    'What are the top issues that matter most to you in this election?',
    'How would you rate the performance of your current state government?',
    'How would you rate the performance of the federal government?',
    'How would you rate the performance of the main opposition party?',
    'Who do you think will win the upcoming election, regardless of your vote?',
    'Which party do you believe would make the best government?',
    'How much do you trust news and media sources in general?',
    'What is your main source for political news and information?',
    'What is your opinion on current immigration policies?',
    'How important is climate change action to your vote?',
    'Do you support stricter gun control laws?',
    'What is your opinion on abortion rights?'
]
# Respondent attributes used as categorical predictors.
raw_demographic_columns = [
    'Age', 'Gender', 'Ethnicity',
    'occupation',
    'total monthly household income before tax',
    'education'
]


# Apply the same trim + lowercase normalisation as the DataFrame headers.
target_columns = [col.strip().lower() for col in raw_target_columns]
demographic_columns = [col.strip().lower() for col in raw_demographic_columns]

# Ensure 'state' column is present
# Explicit raises are used instead of `assert`, which is stripped under -O.
if 'state' not in survey_df.columns:
    raise ValueError("Missing 'state' column.")

# Check the remaining required columns up front so a renamed or missing
# question/demographic header is reported by name rather than as a KeyError
# from dropna().
required_columns = target_columns + demographic_columns + ['state']
missing_columns = [col for col in required_columns if col not in survey_df.columns]
if missing_columns:
    raise ValueError(f"Missing required column(s): {missing_columns}")

# NOTE: a row is dropped if ANY required column is empty, so a respondent who
# skipped a single question is excluded from every question's model.
survey_df = survey_df.dropna(subset=required_columns)

# Softmax
def softmax(x, axis=None):
    """Return the softmax of *x*, computed in a numerically stable way.

    The maximum is subtracted before exponentiating so that large logits do
    not overflow; the shift cancels out in the ratio and leaves the result
    unchanged.

    Args:
        x: Array-like of logits.
        axis: Axis along which to normalise. ``None`` (the default)
            normalises over all elements, which is the behaviour of the
            original one-argument form.

    Returns:
        A NumPy array with the same shape as *x* whose entries sum to 1
        along *axis* (or over the whole array when *axis* is ``None``).
    """
    x = np.asarray(x)
    # keepdims=True keeps the reduced axis so the result broadcasts back
    # against x for any axis choice.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Load existing results if available
# Results are checkpointed to this workbook so an interrupted run can resume:
# every (State, Question) pair already present in it is skipped below.
save_path = "/content/drive/MyDrive/usa_survey_bayesian_partial.xlsx"
SAVE_EVERY = 10  # write a checkpoint after this many newly processed questions
try:
    existing_results = pd.read_excel(save_path)
    completed_questions = set(zip(existing_results['State'], existing_results['Question']))
    print(f"🔄 Resuming... Already processed {len(completed_questions)} state-question pairs.")
except FileNotFoundError:
    # No checkpoint yet: the normal first-run case.
    existing_results = pd.DataFrame()
    completed_questions = set()
    print("🆕 Starting fresh.")
except Exception as exc:
    # Best effort: an unreadable or malformed checkpoint must not block the
    # run, but say why it was ignored. KeyboardInterrupt/SystemExit are no
    # longer swallowed here.
    existing_results = pd.DataFrame()
    completed_questions = set()
    print(f"⚠️ Could not read previous results ({exc!r}).")
    print("🆕 Starting fresh.")

# Prepare result collector
# Seeded with the previously saved rows so every save rewrites the full table.
all_results = existing_results.to_dict(orient="records")
counter_since_last_save = 0

# Loop by state and question
for state_name, state_df in survey_df.groupby('state'):
    print(f"\n📍 State: {state_name}")

    # Demographic encodings depend only on this state's respondents, so they
    # are built once per state instead of once per question. The group frame
    # itself is left unmodified.
    demo_codes = {}
    demo_levels = {}
    for col in demographic_columns:
        cat = pd.Categorical(state_df[col].astype(str).str.strip())
        demo_codes[col] = cat.codes
        demo_levels[col] = len(cat.categories)

    for question in target_columns:
        if (state_name, question) in completed_questions:
            continue  # skip already processed

        print(f"  ➤ Processing: {question[:60]}...")
        if state_df[question].dropna().empty:
            print("    ⚠️ No data. Skipping.")
            continue
        try:
            # Encode
            # Answers are trimmed and upper-cased so spelling variants of the
            # same option collapse into one category.
            target = state_df[question].astype(str).str.strip().str.upper()
            target_cat = pd.Categorical(target)
            target_codes = target_cat.codes
            n_options = len(target_cat.categories)

            # Model
            # Categorical outcome with an intercept per answer option plus one
            # additive effect per demographic level; each demographic's
            # effects share a HalfNormal scale.
            with pm.Model() as model:
                a_demos = {}
                for col in demographic_columns:
                    sigma_col = pm.HalfNormal(f"sigma_{col}", sigma=1)
                    a_demos[col] = pm.Normal(
                        f"a_{col}",
                        mu=0,
                        sigma=sigma_col,
                        shape=(demo_levels[col], n_options),
                    )

                a = pm.Normal("a", mu=0, sigma=1, shape=n_options)
                demo_effects = sum(a_demos[col][demo_codes[col]] for col in demographic_columns)
                logits = a + demo_effects

                y_obs = pm.Categorical("y_obs", logit_p=logits, observed=target_codes)
                trace = pm.sample(500, tune=100, target_accept=0.95, random_seed=42, progressbar=False)

            # Posterior
            # Flatten (chain, draw) into one sample axis -> (n_samples, n_options).
            # NOTE(review): only the intercept `a` is converted to
            # probabilities here; the demographic effects a_<col> are not
            # added back, so these are the shares at zero demographic effect
            # rather than an average over respondents. Confirm this is intended.
            logits_samples = trace.posterior["a"].stack(sample=("chain", "draw")).values.T
            probabilities = np.apply_along_axis(softmax, 1, logits_samples)

            mean_probs = probabilities.mean(axis=0)
            std_probs = probabilities.std(axis=0)
            # Share of posterior draws in which each option has the highest
            # probability.
            winning_party_indices = np.argmax(probabilities, axis=1)
            winning_counts = np.bincount(winning_party_indices, minlength=n_options)
            winning_probabilities = winning_counts / probabilities.shape[0]

            # Append to result
            for i, option in enumerate(target_cat.categories):
                all_results.append({
                    'State': state_name,
                    'Question': question,
                    'Option': option,
                    'Support %': round(mean_probs[i] * 100, 2),
                    'Fluctuation': round(std_probs[i] * 100, 2),
                    'Support Range': f"{round((mean_probs[i] - std_probs[i]) * 100, 2)}% – {round((mean_probs[i] + std_probs[i]) * 100, 2)}%",
                    'Winning Probability': round(winning_probabilities[i] * 100, 2)
                })

            # Save check
            counter_since_last_save += 1
            if counter_since_last_save >= SAVE_EVERY:
                pd.DataFrame(all_results).to_excel(save_path, index=False)
                print(f"    💾 Auto-saved after {SAVE_EVERY} new questions!")
                counter_since_last_save = 0

        except Exception as e:
            # Per-question boundary: one failing model must not abort the
            # whole run. The pair is not recorded, so a later resume retries it.
            print(f"    ❌ Error: {e}")

# Final save
pd.DataFrame(all_results).to_excel(save_path, index=False)
print(f"\n✅ All done! Final results saved to: {save_path}")


Mounted at /content/drive


Saving usa_survey_data_for_simulater.xlsx to usa_survey_data_for_simulater.xlsx
🆕 Starting fresh.

📍 State: Alabama
  ➤ Processing: which party or candidate do you currently intend to vote for...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: what are the top issues that matter most to you in this elec...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: how would you rate the performance of your current state gov...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: how would you rate the performance of the federal government...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: how would you rate the performance of the main opposition pa...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: who do you think will win the upcoming election, regardless ...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: which party do you believe would make the best government?...


ERROR:pymc.stats.convergence:There were 2 divergences after tuning. Increase `target_accept` or reparameterize.
ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: how much do you trust news and media sources in general?...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: what is your main source for political news and information?...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: what is your opinion on current immigration policies?...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


    💾 Auto-saved after 10 new questions!
  ➤ Processing: how important is climate change action to your vote?...


ERROR:pymc.stats.convergence:The effective sample size per chain is smaller than 100 for some parameters.  A higher number is needed for reliable rhat and ess computation. See https://arxiv.org/abs/1903.08008 for details


  ➤ Processing: do you support stricter gun control laws?...
