In [1]:
import os,sys

# Election years, kept as strings because they are used to build
# "data/<year>/<kind>" directory paths.
elections = list(map(str, (2004, 2007, 2010, 2013, 2016, 2019, 2022)))

# Sub-directory names found under each election year.
files = [
    "pollingplaces",
    "primaries",
    "tpptcp",
]

In [2]:
import pandas as pd
import numpy as np

In [3]:
# List the files in the files[1] sub-directory of the first listed election year.
os.listdir(os.path.join("data", elections[0], files[1]))

['HouseStateFirstPrefsByPollingPlaceDownload-12246-ACT.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-NSW.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-NT.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-QLD.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-SA.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-TAS.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-VIC.csv',
 'HouseStateFirstPrefsByPollingPlaceDownload-12246-WA.csv']

In [4]:
def makeid(row):
    """Build the per-booth party/candidate key for one results row.

    The key is the polling place ID followed by the party abbreviation.
    When the party abbreviation is missing, or is "IND" in any letter case,
    the upper-cased surname is used in its place (presumably so that
    separate independents at one booth do not share a key).

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            "PollingPlaceID", "PartyAb" and "Surname".

    Returns:
        str: ``str(PollingPlaceID)`` concatenated with the party
        abbreviation or the upper-cased surname.
    """
    partyid = row["PartyAb"]

    if isinstance(partyid, str):
        use_surname = partyid.upper() == "IND"
    else:
        # pd.isna recognises None and pd.NA as well as float NaN, whereas
        # np.isnan raises TypeError for None.
        use_surname = pd.isna(partyid)

    if use_surname:
        partyid = row["Surname"].upper()

    return str(row["PollingPlaceID"]) + str(partyid)

# Read every file in data/<year>/<files[1]> for every election year and
# collect one DataFrame per results file in `dfs`.
dfs = []
for electionyear in elections:
    primaries_dir = f"data/{electionyear}/{files[1]}"
    # os.listdir returns entries in arbitrary order; sorting keeps the load
    # order (and so the row order after concatenation) reproducible.
    for filename in sorted(os.listdir(primaries_dir)):
        file = os.path.join(primaries_dir, filename)
        if 'prdelms' in file:
            # Read with the default header and kept aside rather than added
            # to dfs. Only the most recently read such file is retained.
            # NOTE(review): presumably a polling-place lookup table; confirm.
            ppid_locations = pd.read_csv(file)
            continue

        # header=1: column names are taken from the second line of the file.
        boothresults = pd.read_csv(file, header=1)
        try:
            boothresults['id'] = boothresults.apply(makeid, axis=1)
        except KeyError:
            # Show which file lacks an expected column, then re-raise with
            # the original traceback.
            print(file)
            raise

        # Store the year as an integer: later cells filter and index with
        # integer years (df['year'] == 2022, community[2004]), which a string
        # column would never match.
        boothresults['year'] = int(electionyear)
        # Same key as str(id) + str(year), built column-wise instead of with
        # a second row-wise apply.
        boothresults['uniqueid'] = boothresults['id'].astype(str) + electionyear

        boothresults = boothresults.set_index('uniqueid')
        dfs.append(boothresults)



In [None]:
# Stack every loaded results table, then keep only rows whose year equals 2022.
# NOTE(review): later cells pivot by year and read the 2004 column; with a
# single year kept they have only one year to work with. Confirm that this
# filter is intended, and that `year` holds integers so the comparison matches.
df = pd.concat(dfs).loc[lambda booths: booths['year'] == 2022]
# df = df[df['StateAb']=='NSW']

df.head()

In [None]:
# One row per id, one column per year, holding OrdinaryVotes.
pivot_df = df.pivot(index="id", columns="year", values="OrdinaryVotes")

# Pairwise correlation between ids across the year columns, restricted to ids
# that have a value in every year. Despite its name, covmat holds
# correlations (DataFrame.corr), not covariances.
covmat = pivot_df.dropna(axis="index").transpose().corr()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Heat-map of the id-by-id correlation matrix.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(covmat.values, cmap='viridis', aspect='auto')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Cluster the rows of the correlation matrix.
# random_state pins the centroid initialisation, so this fit and any later
# re-fit of the same estimator on the same data produce the same labels.
# NOTE(review): the cluster count of 46 is hard-coded; confirm the rationale.
kmeans = KMeans(n_clusters=46, random_state=0)
kmeans.fit_predict(covmat)

In [None]:
# Cluster label for each row of the correlation matrix.
cluster_labels = kmeans.fit_predict(covmat)

# Permutation that puts rows/columns with the same cluster label next to
# each other; applied to both axes so the matrix stays symmetric.
sorted_indices = np.argsort(cluster_labels)
sorted_correlation_matrix = covmat.iloc[sorted_indices, sorted_indices]

fig, ax = plt.subplots(figsize=(22, 22), dpi=200)
ax.imshow(sorted_correlation_matrix, cmap='viridis', aspect='auto')
ax.grid(False)
plt.show()

In [None]:
# Ids (those with a value in every year) that were assigned cluster label 1.
pivot_df.dropna(axis=0).index[cluster_labels == 1]

In [None]:
# Build, for each cluster, a PartyAb x year table of vote shares.
communities_df = []

# Ids in the same order as the rows that were clustered. This does not depend
# on the cluster, so it is computed once rather than on every iteration.
clustered_ids = pivot_df.dropna(axis=0).index

for label in np.unique(cluster_labels):
    # All result rows whose id belongs to this cluster.
    testdf = df[df['id'].isin(clustered_ids[cluster_labels == label])]

    # Total OrdinaryVotes per party and year. groupby drops rows whose
    # PartyAb is missing, so they count towards neither a party's total nor
    # the yearly denominator below.
    partyvotes = (
        testdf.groupby(['PartyAb', 'year'])['OrdinaryVotes']
        .sum()
        .reset_index()
        .sort_values(by='year')
    )

    # Each party's share of the cluster's votes within the same year.
    yearly_totals = partyvotes.groupby('year')['OrdinaryVotes'].transform('sum')
    partyvotes['normalized_score'] = partyvotes['OrdinaryVotes'] / yearly_totals

    communities_df.append(
        partyvotes.pivot_table(index='PartyAb', columns='year', values='normalized_score', fill_value=0)
    )
    

In [None]:
# Tally how many clusters each party tops, judged by its share in the
# 2004 column of the cluster's table.
winners = {}
for community in communities_df:
    winner = community[2004].idxmax()
    winners[winner] = winners.get(winner, 0) + 1

In [None]:
# Display the tally of cluster wins per party.
winners

In [None]:
# Experiment: how does the additive `shift` in inverse-distance weighting
# change the error of a smoothed estimate of a noisy sine curve?

# Seeded generator so the samples, and therefore the plot, are reproducible.
rng = np.random.default_rng(0)
xsamples = np.sort(rng.integers(0, 1000, 100))
ysamples = np.sin(xsamples*np.pi/1000 - np.pi/2) + rng.standard_normal(100)*0.5
xmeaned = np.linspace(0,1000,100)
# Noise-free curve on the evaluation grid; it is the same for every shift.
ytrue = np.sin(xmeaned*np.pi/1000 - np.pi/2)


def getvalue(x, shift):
    """Inverse-distance-weighted mean of ``ysamples`` at each point of ``x``.

    Every sample is weighted by ``1 / (|x - xsample| + shift)``.

    Args:
        x: Scalar or 1-D array of evaluation points.
        shift: Non-negative offset added to every distance.

    Returns:
        np.ndarray: One weighted mean per evaluation point.
    """
    x = np.atleast_1d(np.asarray(x, dtype=float))
    xdiff = np.abs(x[:, None] - xsamples[None, :]) + shift
    exact = xdiff == 0
    with np.errstate(divide='ignore'):
        weights = 1/xdiff
    # With shift == 0, a point that coincides with a sample would get an
    # infinite weight and the ratio would be inf/inf = NaN. For those points
    # use the plain mean of the coincident samples instead.
    has_exact_hit = exact.any(axis=1, keepdims=True)
    weights = np.where(has_exact_hit, exact.astype(float), weights)
    return (weights @ ysamples) / weights.sum(axis=1)


shifts = np.linspace(0,5,100)
errors = []
for shift in shifts:
    ymeaned = getvalue(xmeaned, shift)

    # plt.scatter(xsamples,ysamples)
    # plt.plot(xmeaned,ymeaned)
    # plt.plot(xmeaned,ytrue)

    # Mean squared error against the noise-free curve.
    error = np.mean((ymeaned-ytrue)**2)
    errors.append(error)
errors = np.asarray(errors)

plt.plot(shifts,errors)
plt.scatter(shifts,errors)
plt.xlabel("shift")
plt.ylabel("mean squared error")
plt.show()

In [None]:
import geopandas as gpd     

In [None]:
# Load the SA1 shapefile in full (no filtering yet) and preview it.
SA1_SHAPEFILE = "SA1_2021_AUST_SHP_GDA2020/SA1_2021_AUST_GDA2020.shp"
unfliteredsa1geodata = gpd.read_file(SA1_SHAPEFILE)
unfliteredsa1geodata.head()

In [None]:
# Rows whose SA2_NAME21 value contains "Sydney".
unfliteredsa1geodata.loc[lambda sa1: sa1['SA2_NAME21'].str.contains('Sydney')]

In [None]:
# Keep the New South Wales rows that have a geometry, and only the columns
# used afterwards.
has_geometry = unfliteredsa1geodata["geometry"].notna()
in_nsw = unfliteredsa1geodata['STE_NAME21'] == 'New South Wales'
# One .loc selection plus an explicit copy: sa1geodata becomes an independent
# frame, so adding columns to it below neither triggers pandas'
# SettingWithCopyWarning nor depends on view-versus-copy behaviour.
sa1geodata = unfliteredsa1geodata.loc[
    has_geometry & in_nsw, ["SA1_CODE21","AREASQKM21","geometry"]
].copy()

# Centroids are computed in the '+proj=cea' (cylindrical equal-area)
# projection and then converted back to the frame's own CRS.
sa1geodata['centroids'] = sa1geodata['geometry'].to_crs('+proj=cea').centroid.to_crs(sa1geodata.crs)
print(sa1geodata.shape)
sa1geodata.head()

In [None]:
def _touching_codes(row):
    """Return the set of SA1_CODE21 values whose geometry touches this row's geometry."""
    touching = sa1geodata['geometry'].touches(row['geometry'])
    return set(sa1geodata[touching]["SA1_CODE21"])


# Each row is tested against every geometry in the frame, so the cost grows
# with the square of the row count.
# NOTE(review): a spatial join could replace this if the cell proves too slow.
sa1geodata['neighbours'] = sa1geodata.apply(_touching_codes, axis=1)

In [None]:
# Display the frame, now including the 'centroids' and 'neighbours' columns.
sa1geodata