In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# Vehicle registration data cleaning

In [None]:
# The vehicle registration data was cleaned manually in Excel, so that step is not reproduced in this notebook.
# The cleaned file is called 'Cleaned_Postcode-Registrations-Data-1 (1).xlsx'


# Census Data cleaning

First clean - remove unnecessary columns, remove the extra data before the ':' in each column and rename the columns

In [None]:
# Location of the census csv file inside the Resources folder
ABS_data = Path('Resources/A1_copy.csv')
# Load the csv into a DataFrame and preview the first rows
obs_df = pd.read_csv(filepath_or_buffer=ABS_data)
obs_df.head()

In [None]:
# Keep only the columns needed for the analysis, selected by position in the raw file
columns_to_keep_indices = [2,3,4,6,8]
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the raw data (avoids pandas' SettingWithCopyWarning)
obs_df = obs_df.iloc[:, columns_to_keep_indices].copy()
# Only the first 4 kept columns are cleaned; the 5th (renamed 'Population' below) is left untouched
columns_to_clean = obs_df.columns[:4]
# Keep the text that follows the first ':' in each value. A value without a ':'
# is kept unchanged instead of raising IndexError.
obs_df[columns_to_clean] = obs_df[columns_to_clean].apply(
    lambda column: column.astype(str).str.split(':', n=1).str[-1]
)
# Apply new column names
Col_names = ['Total personal income (weekly)', 'Age', 'Postcode', 'State', 'Population']
obs_df.columns = Col_names

# Save the modified DataFrame to a new CSV file
obs_df.to_csv('Resources/Census_clean.csv', index=False)
obs_df.head()

Adding midpoints to the Census Age and Income ranges

In [None]:
#create a file that has Midpoint Age and Midpoint income

#Define the function to convert the Income ranges to midpoint
def convert_range_to_midpoint(value):
    """Convert a weekly income label into a single representative value.

    Dollar signs and thousands separators are removed first, then:

    * labels containing 'Total', 'Not stated' or 'Negative/Nil income' -> 0
    * labels containing 'or more' -> the string '3500+'
    * labels containing '-' -> the mean of the '-'-separated integers
    * anything else -> the label parsed as an int

    Note that the return type is mixed (int, float or str).
    """
    cleaned = value.replace('$', '').replace(',', '')

    # Labels that carry no usable amount all map to the default of 0
    non_numeric_labels = ('Total', 'Not stated', 'Negative/Nil income')
    if any(label in cleaned for label in non_numeric_labels):
        return 0

    # The open-ended top band gets a special marker instead of a number
    if 'or more' in cleaned:
        return '3500+'

    # A single amount is returned as-is
    if '-' not in cleaned:
        return int(cleaned)

    # A band such as '150-299' becomes its midpoint
    bounds = [int(part) for part in cleaned.split('-')]
    return sum(bounds) / 2
# Derive the 'Income_Midpoint' column from the weekly income labels
obs_df['Income_Midpoint'] = obs_df['Total personal income (weekly)'].map(convert_range_to_midpoint)

obs_df.head()
 

In [None]:
# Define the function to convert age to midpoint
def convert_age_to_midpoint(value):
    """Convert an age-group label into a single representative value.

    * labels containing 'Total' -> None (so the row can be dropped later)
    * labels without the word 'years' -> 0
    * ranges such as '55-64 years' -> the mean of the two bounds
    * any other 'years' label -> the string '85+'

    NOTE(review): every non-range 'years' label maps to '85+', not only
    '85 years and over' - confirm no other such labels occur in the data.
    """
    if 'Total' in value:
        return None

    # Fallback for labels that are not age groups at all
    if 'years' not in value:
        return 0

    trimmed = value.replace(' years', '')

    # No '-' left: treated as the open-ended top age group
    if '-' not in trimmed:
        return '85+'

    low_high = [int(bound) for bound in trimmed.split('-')]
    return sum(low_high) / 2

# Derive the 'Age_Midpoint' column from the 'Age' labels
obs_df['Age_Midpoint'] = obs_df['Age'].map(convert_age_to_midpoint)
# Discard rows with a missing Age_Midpoint (the 'Total' age rows map to None)
obs_df = obs_df[obs_df['Age_Midpoint'].notna()]

obs_df.head()


In [None]:
# Write the census data, now including both midpoint columns, to csv
obs_df.to_csv(path_or_buf='Resources/Census_clean_Midpoints_added.csv', index=False)

## Creating the Top 10 Fuel type by Registration

Grouping and finding the top 10 for each year

In [None]:
# Path of the registrations workbook
file_path = 'Resources/Cleaned_Postcode-Registrations-Data-1 (1).xlsx'
# Read the workbook into a DataFrame
data = pd.read_excel(io=file_path)
# Preview the first rows
data.head()

In [None]:
# Grouping and finding top 10 for each year

def top_registrations_by_fuel(registrations, year, n=10, exclude=('HFCEV',)):
    """Return the n rows with the most registrations for each fuel type in one year.

    Parameters
    ----------
    registrations : DataFrame with 'Fuel Type', 'Postcode', 'State' and a
        'Registrations as at 31 January <year>' column.
    year : year used to build the registrations column name.
    n : number of rows to keep per fuel type.
    exclude : fuel types removed from the result (HFCEV is all zeros).

    Returns
    -------
    A new DataFrame with columns 'Fuel Type', 'Postcode', 'State' and the
    registrations column, sorted by registrations in descending order.
    """
    column = f'Registrations as at 31 January {year}'
    # Total registrations for each fuel type / postcode / state combination
    totals = registrations.groupby(['Fuel Type', 'Postcode', 'State'])[column].sum().reset_index()
    # Sort first so that head(n) picks the largest n rows within each fuel type
    ranked = totals.sort_values(by=column, ascending=False).groupby('Fuel Type').head(n)
    # .copy() so later column assignments do not write into a filtered slice
    return ranked[~ranked['Fuel Type'].isin(exclude)].copy()

top_2021 = top_registrations_by_fuel(data, 2021)
top_2022 = top_registrations_by_fuel(data, 2022)
top_2023 = top_registrations_by_fuel(data, 2023)

top_2021, top_2022, top_2023


In [None]:
# Tag each yearly table with its year and give the registrations column a common name
top_2021 = top_2021.assign(Year=2021).rename(
    columns={'Registrations as at 31 January 2021': 'Registrations'}
)
top_2022 = top_2022.assign(Year=2022).rename(
    columns={'Registrations as at 31 January 2022': 'Registrations'}
)
top_2023 = top_2023.assign(Year=2023).rename(
    columns={'Registrations as at 31 January 2023': 'Registrations'}
)

# Stack the three yearly tables into one DataFrame
yearly_tables = [top_2021, top_2022, top_2023]
combined_df = pd.concat(yearly_tables)

In [None]:
# Write the combined top-10 table for all years to csv
combined_df.to_csv(path_or_buf='Resources/Top_10_for_all_years.csv', index=False)

# Creating separate files for each vehicle type

In [None]:
# Load the registrations file that holds the fuel type and postcode
# NOTE(review): the frame is named top10_for2021_df, but it holds every row of the
# file and is ranked on the 2022 registrations column below - confirm the intended year
top10_for2021_df = pd.read_csv(filepath_or_buffer='Resources/Cleaned_post_code_registration.csv')

# Keep four columns, selected by position
columns_to_keep_indices = [0,1,2,4]
top10_for2021_df = top10_for2021_df.iloc[:, columns_to_keep_indices]

# Order the rows from most to fewest registrations
sorted_df = top10_for2021_df.sort_values(
    by='Registrations as at 31 January 2022',
    ascending=False,
)

# One group per fuel type; rows inside each group keep the sorted order
grouped_df = sorted_df.groupby('Fuel Type')

# Collect the top 10 rows of every fuel type, skipping Hydrogen Cell Vehicle
# NOTE(review): the Excel-based cells filter on the label 'HFCEV' - confirm this file
# really uses 'Hydrogen Cell Vehicle'
fuel_type_ds = {}
for fuel_type, group in grouped_df:
    if fuel_type == 'Hydrogen Cell Vehicle':
        continue
    fuel_type_ds[fuel_type] = group.head(10).reset_index(drop=True)


In [None]:
resource_path = Path('Resources')
#read in the income data to merge with the per-fuel-type top 10 tables
income_df = pd.read_csv('Resources/Census_clean_Midpoints_added.csv')

#create an empty dictionary for the merge
merged_ds = {}

#for loop to merge the datasets
for fuel_type, fuel_type_df in fuel_type_ds.items():
    # Left join keeps every top-10 row, even when the postcode has no census rows
    merged_df = pd.merge(fuel_type_df, income_df, on='Postcode', how ='left')
    #filter rows to remove Population of 0
    merged_df = merged_df[merged_df['Population'] !=0]
    # Both inputs have a 'State' column, so the merge produces State_x / State_y:
    # drop the census copy and restore the original name
    merged_df = (
        merged_df
        .drop(columns='State_y')
        .rename(columns={'State_x': 'State'})
    )
    # Store the frame only after the clean-up, so the dictionary holds exactly
    # what is written to disk (it used to keep the State_x / State_y version)
    merged_ds[fuel_type] = merged_df
    #Save the output to a csv file for each fuel type
    filename = resource_path / f'{fuel_type}_top10.csv'
    merged_df.to_csv(filename, index=False)
    #add print statement to show named files created
    print(f'Saved {fuel_type} merge to {filename}')
    



In [None]:
#read in the postcode to suburb file
Suburb_df = pd.read_csv('Resources/australian_postcodes.csv')
Suburb_df = Suburb_df.rename(columns={'postcode': 'Postcode'})
# Keep one row (the first) per postcode. Assigning Series.drop_duplicates() back
# to the column removes no rows: index alignment turns every repeated postcode
# into NaN and the column becomes float.
Suburb_df = Suburb_df.drop_duplicates(subset='Postcode')

# Manipulate the data in the Battery Electric Vehicle file to produce age distribution and income distributions

In [None]:
# Load the Battery Electric Vehicle top 10 file
csv_file_path = 'Resources/Battery Electric Vehicle_top10.csv'
BEV_df = pd.read_csv(filepath_or_buffer=csv_file_path)


In [None]:
# Attach the suburb information to the BEV rows (inner join on Postcode)
merge_BEV = BEV_df.merge(Suburb_df, on='Postcode', how='inner')
# Columns carried forward from the merge
columns_to_keep = [
    'Postcode', 'State', 'Fuel Type',
    'Registrations as at 31 January 2022',
    'Total personal income (weekly)', 'Age', 'Population',
    'Income_Midpoint', 'Age_Midpoint', 'locality',
]
final_merge = merge_BEV.loc[:, columns_to_keep]
# Column order used for display and for the saved files
desired_order = [
    'Postcode', 'locality', 'State',
    'Registrations as at 31 January 2022', 'Population',
    'Total personal income (weekly)', 'Income_Midpoint',
    'Age', 'Age_Midpoint', 'Fuel Type',
]
# New DataFrame with the columns in the desired order
merge_df_reordered = final_merge.loc[:, desired_order]

merge_df_reordered

In [None]:
# Keep the rows whose income label contains 'Total' (case-insensitive, missing values excluded),
# then order by registrations (descending) and age group (ascending)
BEV_subset = (
    merge_df_reordered
    .loc[merge_df_reordered['Total personal income (weekly)'].str.contains('Total', case=False, na=False)]
    .sort_values(by=['Registrations as at 31 January 2022', 'Age'], ascending=[False, True])
)

# Write the subset to csv and display it
subset_to_save = 'Resources/BEV_top10_total.csv'
BEV_subset.to_csv(path_or_buf=subset_to_save, index=False)
BEV_subset

# Manipulate the data in the Hybrid Vehicle file to produce age distribution and income distributions

In [None]:
# Load the Hybrid top 10 file and display it
csv_file_path = 'Resources/Hybrid_top10.csv'
Hybrid_df = pd.read_csv(filepath_or_buffer=csv_file_path)
Hybrid_df


In [None]:
# Attach the suburb information to the Hybrid rows (inner join on Postcode)
merge_Hybrid = Hybrid_df.merge(Suburb_df, on='Postcode', how='inner')
# Columns carried forward from the merge
columns_to_keep = [
    'Postcode', 'State', 'Fuel Type',
    'Registrations as at 31 January 2022',
    'Total personal income (weekly)', 'Age', 'Population',
    'Income_Midpoint', 'Age_Midpoint', 'locality',
]
final_merge = merge_Hybrid.loc[:, columns_to_keep]
# Column order used for display and for the saved files
desired_order = [
    'Postcode', 'locality', 'State',
    'Registrations as at 31 January 2022', 'Population',
    'Total personal income (weekly)', 'Income_Midpoint',
    'Age', 'Age_Midpoint', 'Fuel Type',
]
# New DataFrame with the columns in the desired order
merge_df_reordered = final_merge.loc[:, desired_order]

merge_df_reordered


In [None]:
# Keep the rows whose income label contains 'Total' (case-insensitive, missing values excluded),
# then order by registrations (descending) and age group (ascending)
Hybrid_subset = (
    merge_df_reordered
    .loc[merge_df_reordered['Total personal income (weekly)'].str.contains('Total', case=False, na=False)]
    .sort_values(by=['Registrations as at 31 January 2022', 'Age'], ascending=[False, True])
)
# Write the subset to csv and display it
subset_to_save = 'Resources/Hybrid_top10_total.csv'
Hybrid_subset.to_csv(path_or_buf=subset_to_save, index=False)
Hybrid_subset

# Manipulate the data in the Internal Combustion Engine Vehicle file to produce age distribution and income distributions

In [None]:
# Load the Internal Combustion Engine top 10 file
csv_file_path = 'Resources/Internal Combustion Engine_top10.csv'
ICE_df = pd.read_csv(filepath_or_buffer=csv_file_path)


In [None]:
# merge Suburb_df with ICE data (inner join on Postcode)
# The result gets its own name, merge_ICE: the copy-pasted name merge_Hybrid
# would overwrite the Hybrid merge with ICE rows.
merge_ICE = pd.merge(ICE_df, Suburb_df, on='Postcode', how='inner')
columns_to_keep = ['Postcode', 'State', 'Fuel Type','Registrations as at 31 January 2022','Total personal income (weekly)', 'Age', 'Population', 'Income_Midpoint', 'Age_Midpoint', 'locality']
final_merge = merge_ICE[columns_to_keep]
#reorder the columns
desired_order = ['Postcode', 'locality','State', 'Registrations as at 31 January 2022', 'Population', 'Total personal income (weekly)','Income_Midpoint', 'Age', 'Age_Midpoint','Fuel Type']
# new df with reorder
merge_df_reordered = final_merge[desired_order]

merge_df_reordered


In [None]:
# Keep the rows whose income label contains 'Total' (case-insensitive, missing values excluded),
# then order by registrations (descending) and age group (ascending)
ICE_subset = (
    merge_df_reordered
    .loc[merge_df_reordered['Total personal income (weekly)'].str.contains('Total', case=False, na=False)]
    .sort_values(by=['Registrations as at 31 January 2022', 'Age'], ascending=[False, True])
)
# Write the subset to csv and display it
subset_to_save = 'Resources/ICE_top10_total.csv'
ICE_subset.to_csv(path_or_buf=subset_to_save, index=False)
ICE_subset

## Creating the combined datafiles for plotting

In [None]:
# Read the census file that includes the midpoint columns
income_data_path = 'Resources/Census_clean_Midpoints_added.csv'
income_data = pd.read_csv(filepath_or_buffer=income_data_path)

# Preview the first rows of the income data
income_data.head()

In [None]:
# Join the combined top-registrations table to the income data on Postcode
# (default inner join: only postcodes present in both tables are kept)
merged_data = combined_df.merge(income_data, on='Postcode')

# Preview the first rows of the merged DataFrame
merged_data.head()


In [None]:
# Drop the income and age label columns (their midpoint columns are kept)
# together with the duplicate state column created by the merge
cols_to_drop = ['Total personal income (weekly)', 'Age', 'State_y']

merged_data = merged_data.drop(cols_to_drop, axis=1)

merged_data.head()

In [None]:
# Give the remaining state and midpoint columns their final names
cols_to_rename = {'State_x': 'State', 'Income_Midpoint': 'Income', 'Age_Midpoint': 'Age'}
merged_data = merged_data.rename(cols_to_rename, axis='columns')
merged_data.head()

In [None]:
# Write the merged table to csv, ready for the plotting steps
merged_data.to_csv(path_or_buf='Resources/Top10_for_all_years_with_Census_cleaned.csv', index=False)

## Cleaning Median Age and Income Census Data

In [None]:
# Path to the census csv file that is read in the next cell
path1 = Path('Resources/2021Census_G02_AUST_POA.csv')

In [None]:
# Load the census csv into a DataFrame
census = pd.read_csv(filepath_or_buffer=path1)

In [None]:
#remove 'POA' in front of postcode to isolate the number
# Anchored prefix removal instead of slicing off the first 3 characters:
# re-running this cell no longer cuts digits off an already-cleaned postcode.
census['POA_CODE_2021'] = census['POA_CODE_2021'].str.replace(r'^POA', '', regex=True)

In [None]:
# Drop every row in which at least one column equals 0
# NOTE(review): the check covers all columns of the file, not just the age and
# income columns kept later - confirm that is intended
census = census[~census.eq(0).any(axis=1)]

In [None]:
# Add an integer 'Postcode' column copied from POA_CODE_2021
# (copied rather than renamed: renaming caused a key error when merging)
census = census.assign(Postcode=census['POA_CODE_2021'].astype(int))
census

In [None]:
# Keep only the postcode, median age and median weekly personal income columns
census = census.loc[:, ['Postcode', 'Median_age_persons', 'Median_tot_prsnl_inc_weekly']]

In [None]:
# Preview the first five rows of the cleaned census table
census.head(5)

In [None]:
# Write the cleaned median age / income table to csv
census.to_csv(path_or_buf='Resources/2021Census_median_income_and_age_cleaned.csv', index=False)