In [103]:
# This file estimates the transition matrix M, which shows the probability of a vote for party-i upon elimination being distributed to party-j.

In [104]:
# imports
import pandas as pd
import numpy as np

In [147]:
existingparties_to_model = ['UAPP', 'ON', 'ALP', 'IND', 'GRN', 'LP',  'NP', 'CLP', "LNP", "XEN"]
pseudo_parties = ["TEAL", "OTH"]

ab_mapper = {
    "UAPP": "UAP",
    "ON": "ONP",
    "ALP": "ALP",
    "IND": "IND",
    "XEN": "IND", # Assume that Xenephon alliance can be treated as an independent party.
    "GRN": "GRN",
    "LP": "LIB",
    "NP": "NAT",
    "CLP":"CLP",
    "LNP": "LNP",
    "TEAL":"TEAL",
    "OTH": "OTH"
}

modeled_parties = list(set(ab_mapper.values()))

In [106]:
# load in the data
distribution_data = pd.read_csv("data/historical_election_data/HouseDopByDivisionDownload-27966.csv",header=1)
distribution_data.head()

Unnamed: 0,StateAb,DivisionID,DivisionNm,CountNumber,BallotPosition,CandidateID,Surname,GivenNm,PartyAb,PartyNm,Elected,HistoricElected,CalculationType,CalculationValue
0,ACT,318,Bean,0,1,36239,CONWAY,Sean,UAPP,United Australia Party,N,N,Preference Count,2831.0
1,ACT,318,Bean,0,1,36239,CONWAY,Sean,UAPP,United Australia Party,N,N,Preference Percent,2.88
2,ACT,318,Bean,0,1,36239,CONWAY,Sean,UAPP,United Australia Party,N,N,Transfer Count,0.0
3,ACT,318,Bean,0,1,36239,CONWAY,Sean,UAPP,United Australia Party,N,N,Transfer Percent,0.0
4,ACT,318,Bean,0,2,37455,AMBARD,Benjamin,ON,Pauline Hanson's One Nation,N,N,Preference Count,2680.0


In [107]:
# initial clean ups
# Remove rows contianing calculatype==Preference Count - keep Preference Percent
distribution_data_cleaned = distribution_data[distribution_data["CalculationType"] == "Transfer Percent"]
columns_to_keep = ["DivisionNm", "CountNumber", "Surname", "PartyAb", "CalculationValue"]
distribution_data_cleaned = distribution_data_cleaned[columns_to_keep]
distribution_data_cleaned.head()

Unnamed: 0,DivisionNm,CountNumber,Surname,PartyAb,CalculationValue
3,Bean,0,CONWAY,UAPP,0.0
7,Bean,0,AMBARD,ON,0.0
11,Bean,0,SMITH,ALP,0.0
15,Bean,0,CHRISTIE,IND,0.0
19,Bean,0,SAVERY,GRN,0.0


In [108]:
# Tidy up PartyAb
teal_indies = [("DYER","Boothby"),
               ("BOELE", "Bradfield"),
               ("HOOK", "Calare"),
               ("FERRES MILES","Casey"),
               ("HEISE", "Cowper"),
               ("CHANEY", "Curtin"),
               ("RUSSEL", "Flinders"),
               ("DANIEL", "Goldstein"),
               ("HOLT", "Groom"),
               ("SEYMOUR", "Hughes"),
               ("STEELE", "Hughes"),
               ("ACKERY","Hume"),
               ("HAINES", "Indi"),
               ("RYAN", "Kooyong"),
               ("SCAMPS", "Mackellar"),
               ("LEONARD", "Monash"),
               ("PRIESTLY", "Nicholls"),
               ("TINK", "North Sydney"),
               ("LUKE", "Page"),
               ("DYSON", "Wannon"),
               ("STEGGALL", "Warringah"),
               ("SPENDER", "Wentworth")]



        
# Loop over the rows and anywhere where the surname and the division name matches one of the teal indies, set the PartyAb to TEAL
for index, row in distribution_data_cleaned.iterrows():
    if (row["Surname"], row["DivisionNm"]) in teal_indies:
        distribution_data_cleaned.at[index, "PartyAb"] = "TEAL"
# Clean the names
def tidy_party_ab(party_ab):
    if party_ab in ab_mapper.keys():
        return ab_mapper[party_ab]
    else:
        return "OTH"

distribution_data_cleaned["PartyAb"] = distribution_data_cleaned["PartyAb"].apply(tidy_party_ab)
distribution_data_cleaned.head()

Unnamed: 0,DivisionNm,CountNumber,Surname,PartyAb,CalculationValue
3,Bean,0,CONWAY,UAP,0.0
7,Bean,0,AMBARD,ONP,0.0
11,Bean,0,SMITH,ALP,0.0
15,Bean,0,CHRISTIE,IND,0.0
19,Bean,0,SAVERY,GRN,0.0


In [109]:
# Instantiate M as a matrix of zeros of (len(ab_mapper.keys()) x len(ab_mapper.keys()))
M = np.zeros((len(ab_mapper.keys()), len(ab_mapper.keys())))
M = pd.DataFrame(M)
M.columns = ab_mapper.values()
M.index = ab_mapper.values()

In [110]:
# Group by DivisionNm and drop Surname

seats = list(distribution_data_cleaned.drop("Surname",axis=1).groupby("DivisionNm"))

In [169]:
Mseat_alphas = {party:pd.DataFrame(columns=modeled_parties) for party in modeled_parties}

for divsionnm,seat in seats:
    for count in sorted(seat['CountNumber'].unique(), reverse=True):
        if count == 0:
            continue

        distributed_count = seat[seat["CountNumber"] == count]

        Mseat_alpha = pd.Series(np.zeros(len(modeled_parties)), index=modeled_parties)
        alpha = distributed_count[distributed_count["CalculationValue"]==-100.00]["PartyAb"].iloc[0]
        for index, row in distributed_count.iterrows():
            if row["CalculationValue"] != -100.00:
                Mseat_alpha[row["PartyAb"]] += row["CalculationValue"]
        
        Mseat_alphas[alpha] = pd.concat([Mseat_alphas[alpha], Mseat_alpha.to_frame().T])

In [181]:
Mseat_alpha = Mseat_alphas['ONP']

non_zero_counts = (Mseat_alpha != 0).sum()

first_scaling_party = non_zero_counts.idxmax()

def scale(row):
    if row[first_scaling_party] == 0:
        return row
    else:
        return row/row[first_scaling_party]

Mseat_alpha_scaled = Mseat_alpha.apply(scale,axis=1)

In [183]:

def find_scaling_party(df):
    """Find the column with the most common nonzero value."""
    non_zero_counts = df.replace(0, np.nan).count()  # Count nonzero (non-NaN) values
    return non_zero_counts.idxmax()  # Get the column with the max count

def scale_row(row, scaling_party):
    """Scale a row by its value in the scaling_party column, if nonzero."""
    if row[scaling_party] == 0:
        return row  # Skip scaling if the scaling value is zero
    return row / row[scaling_party]  # Scale row

def iterative_scaling(df):
    """Iteratively scale the DataFrame until all rows are scaled."""
    df_scaled = df.copy()
    unscaled_mask = np.ones(len(df), dtype=bool)  # Track unscaled rows

    while np.any(unscaled_mask):  # Continue until all rows are scaled
        scaling_party = find_scaling_party(df_scaled[unscaled_mask])
        
        # Scale unscaled rows
        df_scaled.loc[unscaled_mask] = df_scaled.loc[unscaled_mask].apply(
            lambda row: scale_row(row, scaling_party), axis=1
        )

        # Find the average value in the scaling column for already scaled rows
        scaled_mask = ~unscaled_mask  # Opposite of unscaled rows
        target_avg = df_scaled.loc[scaled_mask, scaling_party].mean()
        
        # Adjust unscaled rows to match target average
        df_scaled.loc[unscaled_mask] *= target_avg / df_scaled.loc[unscaled_mask, scaling_party].mean()

        # Update unscaled mask
        unscaled_mask = df_scaled.eq(1).all(axis=1) == False  # If row is all 1s, it's scaled

    return df_scaled


# Run iterative scaling
df_final = iterative_scaling(Mseat_alpha)

print(df_final)


KeyboardInterrupt: 