## USA Dataset

In [None]:
# 📦 Install required libraries (Run this cell only once)
# Uncomment below if not already installed
# %pip install pymc arviz openpyxl

# 📁 Import necessary libraries
import pandas as pd
import numpy as np
import pymc as pm
import arviz as az
import warnings
from sklearn.preprocessing import LabelEncoder

# Silence Python warnings from every library for the rest of the session.
# NOTE(review): this is global, so it may also hide warnings that would
# help diagnose data or sampling problems.
warnings.filterwarnings("ignore")

# 📂 Load Excel file manually
file_name = 'usa_statewise_survey.xlsx'  # 🔁 Change this to match your file name
df = pd.read_excel(file_name)

# Normalise headers into identifier-like names: trim, lower-case, turn
# spaces into underscores, then drop anything that is not a word character
# or whitespace (punctuation such as "?", "(", ")").
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^\w\s]', '', regex=True)
)

# The analysis is run state by state, so fail early with a readable message
# instead of a bare KeyError further down.
if 'state' not in df.columns:
    raise KeyError(
        f"'state' column not found in {file_name!r}; "
        f"available columns: {list(df.columns)}"
    )

# 🧹 Clean NA
# Drops every row that has a missing value in ANY column, so a respondent
# who skipped a single question is excluded from all questions.
df.dropna(inplace=True)

# 👥 Define demographics (adjust based on your dataset)
demographics = [
    'state', 'age', 'gender', 'ethnicity',
    'what_is_the_highest_level_of_education_you_have_completed',
    'what_is_your_current_occupation_or_job_title',
    'what_is_your_total_monthly_household_income_before_tax_select_range'
]
# Keep only the demographic columns that actually exist in this file.
demographics = [col for col in demographics if col in df.columns]

# 🎯 Survey questions (exclude demographics and state)
# 'state' is guaranteed to be in `demographics` by the check above, so a
# single membership test covers both exclusions.
_demographic_set = set(demographics)
question_columns = [col for col in df.columns if col not in _demographic_set]

# 🔁 Softmax function
def softmax(x, axis=None):
    """Return the softmax of *x*, computed in a numerically stable way.

    The maximum is subtracted before exponentiating so that large logits
    do not overflow; this does not change the result.

    Args:
        x: Array-like of logits.
        axis: Axis along which to normalise. The default ``None``
            normalises over the whole array (the original behaviour, which
            is the usual softmax for a 1-D vector). Pass e.g. ``axis=1`` to
            normalise each row of a 2-D batch independently.

    Returns:
        ``numpy.ndarray`` with the same shape as *x* whose entries along
        *axis* are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # keepdims keeps the reduced axis so the subtraction/division broadcast.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

# 📊 Store all results
# One DataFrame per (state, question) pair that was fitted successfully.
all_results = []

# 🔁 Loop through US states
# A separate model is fitted for every state and every survey question.
for state in df['state'].unique():
    df_state = df[df['state'] == state].copy()
    print(f"\n📍 Processing: {state}")

    # Encode demographics
    # Each demographic column becomes integer category codes used to index
    # the per-level effect matrices in the model.
    demo_codes = {}
    demo_cats = {}
    for col in demographics:
        cat = pd.Categorical(df_state[col].astype(str).str.strip())
        df_state[f"{col}_code"] = cat.codes
        # A demographic with a single level in this subset (always true for
        # 'state' itself, since the data is already filtered by state) is
        # perfectly confounded with the intercept `a`. Because the summary
        # below reads only `a`, fitting such an effect would shrink the
        # reported shares toward uniform, so it is left out of the model.
        if len(cat.categories) < 2:
            continue
        demo_codes[col] = df_state[f"{col}_code"].values
        demo_cats[col] = cat
    model_demographics = list(demo_cats)
    n_obs = len(df_state)

    # 🔄 Loop through survey questions
    for question in question_columns:
        print(f"  ➤ Analyzing: {question}")
        try:
            # Encode target
            # Answers are compared case-insensitively after trimming.
            target = df_state[question].astype(str).str.strip().str.upper()
            target_cat = pd.Categorical(target)
            df_state['target_code'] = target_cat.codes
            n_options = len(target_cat.categories)

            with pm.Model() as model:
                # Priors for demographic effects
                # One (levels x options) matrix of offsets per demographic,
                # with a shared scale per demographic.
                sigmas = {}
                a_demos = {}
                for col in model_demographics:
                    sigmas[col] = pm.HalfNormal(f"sigma_{col}", sigma=1)
                    a_demos[col] = pm.Normal(
                        f"a_{col}", mu=0, sigma=sigmas[col],
                        shape=(len(demo_cats[col].categories), n_options)
                    )

                # Global intercept
                a = pm.Normal("a", mu=0, sigma=1, shape=n_options)
                demo_effects = sum(
                    a_demos[col][demo_codes[col]] for col in model_demographics
                )
                # The zero column forces logits to (n_obs, n_options) even
                # when no demographic varies and demo_effects is plain 0.
                logits = a + demo_effects + np.zeros((n_obs, 1))

                # Likelihood
                y_obs = pm.Categorical(
                    "y_obs", logit_p=logits, observed=target_cat.codes
                )

                # Sampling
                # NOTE(review): 100 tuning steps is a very short warm-up;
                # presumably chosen for speed — confirm convergence (e.g.
                # with az.summary) before relying on the numbers.
                trace = pm.sample(500, tune=100, target_accept=0.95, random_seed=42, progressbar=False)

            # Posterior processing
            # Only the intercept `a` is used here, so the shares below are
            # for a respondent whose demographic offsets are all zero.
            posterior_samples = trace.posterior
            # Flatten chains and draws into one axis: (samples, n_options).
            logits_samples = posterior_samples["a"].stack(sample=("chain", "draw")).values.T
            probabilities = np.apply_along_axis(softmax, 1, logits_samples)

            mean_probs = probabilities.mean(axis=0)
            std_probs = probabilities.std(axis=0)
            # Share of posterior draws in which each option has the highest
            # probability.
            winning_party_indices = np.argmax(probabilities, axis=1)
            winning_counts = np.bincount(winning_party_indices, minlength=n_options)
            winning_probabilities = winning_counts / probabilities.shape[0]

            # Save results
            df_result = pd.DataFrame({
                'state': [state] * n_options,
                'question': [question] * n_options,
                'option': target_cat.categories,
                'support_percentage': np.round(mean_probs * 100, 2),
                'fluctuation': np.round(std_probs * 100, 2),
                'support_range': [
                    f"{np.round((mean_probs[i] - std_probs[i]) * 100, 2)}% – {np.round((mean_probs[i] + std_probs[i]) * 100, 2)}%"
                    for i in range(n_options)
                ],
                'win_probability': np.round(winning_probabilities * 100, 2)
            })

            all_results.append(df_result)

        except Exception as e:
            # Deliberate best-effort: one failing question must not abort
            # the remaining questions or states.
            print(f"    ⚠️ Skipped '{question}' due to: {e}")

# ✅ Combine and export
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)
    final_df.to_excel("usa_survey_bayesian_results.xlsx", index=False)
    print("\n✅ Results saved to 'usa_survey_bayesian_results.xlsx'")
else:
    # pd.concat raises ValueError on an empty list; keep `final_df` defined
    # (with the usual columns) and report that nothing was written.
    final_df = pd.DataFrame(columns=[
        'state', 'question', 'option', 'support_percentage',
        'fluctuation', 'support_range', 'win_probability'
    ])
    print("\n⚠️ No results were produced; nothing was saved.")



📍 Processing: California
  ➤ Analyzing: which_party_or_candidate_do_you_currently_intend_to_vote_for_in_the_upcoming_election


Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma_state, a_state, sigma_age, a_age, sigma_gender, a_gender, sigma_ethnicity, a_ethnicity, sigma_what_is_the_highest_level_of_education_you_have_completed, a_what_is_the_highest_level_of_education_you_have_completed, sigma_what_is_your_current_occupation_or_job_title, a_what_is_your_current_occupation_or_job_title, sigma_what_is_your_total_monthly_household_income_before_tax_select_range, a_what_is_your_total_monthly_household_income_before_tax_select_range, a]
