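This notebook forecasts the population of Victorian suburbs with an ARIMA time-series model. Forecasts are produced for 2023-2027, of which 2025-2027 are the years of interest. The input is the ABS births and fertility dataset for 2011-2022, which also contains the estimated resident population series that the model is fitted on.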

In [1]:
import pandas as pd

In [2]:
# Load sheet 'Table 2.2' of the ABS workbook into `df`.
# TODO (original author's note): change this path before pushing to git.
raw_workbook_path = '/Users/danielletran/Downloads/33010DO002_2022_ST_AUST (1).xlsx'
df = pd.read_excel(raw_workbook_path, sheet_name='Table 2.2')

In [3]:
# Positional row 7 of the raw sheet, and the positions of its missing cells.
row = df.iloc[7]
null_columns = [position for position, cell in enumerate(row) if pd.isnull(cell)]

# Years covered by the dataset: 2011 through 2022 inclusive.
years = list(range(2011, 2023))

In [4]:
# Build a descriptive header from positional row 5 of the raw sheet, promote it
# to the column index, then strip the leading rows and empty rows/columns.
df_first_row = df.iloc[5].copy()

# Step 1: Rename the placeholder entries in the header row.
# The positions returned by get_loc are written through .iloc: a bare integer
# key on this label-indexed Series relies on pandas' deprecated positional
# fallback (it emits a FutureWarning in recent pandas releases). get_loc still
# raises KeyError if an expected 'Unnamed: N' column is missing.
df_first_row.iloc[df.columns.get_loc('Unnamed: 1')] = 'Suburb Name'

for i, year in enumerate(years):
    base_index = 2 + (i * 4)  # Start at 2 and increment by 4 for each year

    # Three consecutive columns per year; the fourth column of each group of
    # four is left untouched.
    df_first_row.iloc[df.columns.get_loc(f"Unnamed: {base_index}")] = f"Estimated resident population_{year}"
    df_first_row.iloc[df.columns.get_loc(f"Unnamed: {base_index + 1}")] = f'Births_{year}'
    df_first_row.iloc[df.columns.get_loc(f'Unnamed: {base_index + 2}')] = f'Total_fertility_rate_{year}'


df.iloc[5] = df_first_row

# Step 2: Promote row 5 to the header and remove it from the data.
df.columns = df.iloc[5]  # Use row 5 as the new header
df = df.drop(5)  # Drop the row that has been used as the header

# NOTE(review): assuming the default RangeIndex from read_excel, the slice below
# runs after row 5 has already been dropped, so it discards the rows originally
# labelled 0-4 and 6 and the data starts at original row 7 - confirm intended.
df = df.iloc[6:]
df = df.reset_index(drop = True)
df = df.dropna(axis = 1, how = 'all')  # remove columns that are entirely empty
df = df.dropna()  # remove every row that still contains any missing value

# Display the resulting DataFrame
df.head(50)

  df_first_row[df.columns.get_loc('Unnamed: 1')] = 'Suburb Name'
  df_first_row[df.columns.get_loc(f"Unnamed: {base_index}")] = f"Estimated resident population_{year}"
  df_first_row[df.columns.get_loc(f"Unnamed: {base_index + 1}")] = f'Births_{year}'
  df_first_row[df.columns.get_loc(f'Unnamed: {base_index + 2}')] = f'Total_fertility_rate_{year}'


5,Place of Usual Residence,Suburb Name,Estimated resident population_2011,Births_2011,Total_fertility_rate_2011,Estimated resident population_2012,Births_2012,Total_fertility_rate_2012,Estimated resident population_2013,Births_2013,...,Total_fertility_rate_2019,Estimated resident population_2020,Births_2020,Total_fertility_rate_2020,Estimated resident population_2021,Births_2021,Total_fertility_rate_2021,Estimated resident population_2022,Births_2022,Total_fertility_rate_2022
0,206011106,Brunswick East,8966,102,np,9208,116,np,9870,105,...,0.87,13064,116,0.78,12964,143,0.77,13296,143,0.75
1,206011107,Brunswick West,13864,180,np,13963,199,np,14057,184,...,1.12,15010,169,1.07,14497,159,1.07,14684,134,0.98
2,206011109,Pascoe Vale South,9860,145,np,9954,135,np,10038,121,...,1.6,10836,101,1.42,10463,113,1.46,10413,104,1.41
3,206011495,Brunswick - North,11981,192,np,12254,173,np,12548,167,...,1.17,14124,142,1.07,13077,129,0.99,13254,129,0.92
4,206011496,Brunswick - South,12006,159,np,12402,153,np,12836,165,...,1.01,14235,155,0.97,13208,134,0.92,13364,117,0.84
5,206011497,Coburg - East,12236,181,np,12483,191,np,12655,163,...,1.3,13265,158,1.23,12675,144,1.18,12949,153,1.15
6,206011498,Coburg - West,13998,226,np,14057,243,np,14188,196,...,1.68,14523,172,1.55,14184,179,1.54,14102,146,1.41
7,20601,Brunswick - Coburg,82911,1185,np,84321,1210,np,86192,1101,...,1.18,95057,1013,1.09,91068,1001,1.07,92062,926,1
8,206021110,Alphington - Fairfield,8561,96,np,8706,118,np,8827,105,...,1.24,9538,90,1.14,9069,98,1.19,9156,94,1.14
9,206021112,Thornbury,18258,275,np,18484,316,np,18773,279,...,1.34,19791,236,1.25,19153,214,1.18,19177,224,1.12


In [5]:
# Overwrite the 2011 and 2012 fertility-rate columns with one constant each.
# Original author's note: values come from a simple Google search and are used
# to impute the medians for those years.
imputed_fertility_rates = {
    'Total_fertility_rate_2011': 1.748,
    'Total_fertility_rate_2012': 1.9,
}
for column_name, imputed_rate in imputed_fertility_rates.items():
    df[column_name] = imputed_rate

Now we are going to start our exploratory data analysis (EDA).

In [12]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
import warnings
warnings.filterwarnings('ignore')

# Assuming df is your DataFrame as shown in the image

# Function to forecast population for a single suburb
def forecast_suburb(suburb_data, n_periods=5):
    """Fit a non-seasonal ARIMA model to one series and forecast ahead.

    The model order is selected by ``auto_arima`` using a stepwise search with
    first-order differencing fixed (``d=1``), ``p`` and ``q`` each searched
    from 0 up to 5, and seasonality disabled.

    Parameters
    ----------
    suburb_data : array-like
        The observed series for a single suburb, in chronological order.
    n_periods : int, default 5
        Number of steps to forecast beyond the end of ``suburb_data``. The
        default keeps the original five-step horizon (2023 - 2027 when the
        series ends in 2022).

    Returns
    -------
    The value returned by ``model.predict``: the ``n_periods`` forecast values.
    """
    model = auto_arima(suburb_data, start_p=0, start_q=0, max_p=5, max_q=5, m=1,
                       d=1, seasonal=False, trace=False,
                       error_action='ignore', suppress_warnings=True, stepwise=True)

    forecast = model.predict(n_periods=n_periods)
    return forecast

# Columns holding the yearly population series (2011 through 2022).
population_columns = ['Estimated resident population_' + str(year) for year in range(2011, 2023)]

# Output column for each of the five forecast steps, in order.
forecast_labels = (
    '2023_Forecast',
    '2024_Forecast',
    '2025_Forecast',
    '2026_Forecast',
    '2027_Forecast',
)

# Fit and forecast one row at a time, collecting one record per row.
forecasts = []
for _, suburb_row in df.iterrows():
    record = {
        'Code': suburb_row['Place of Usual Residence'],
        'Suburb': suburb_row['Suburb Name'],
    }
    predicted = forecast_suburb(suburb_row[population_columns].values)
    for step, label in enumerate(forecast_labels):
        record[label] = round(predicted[step])
    forecasts.append(record)

# Assemble all records into a single results table and show it.
forecast_df = pd.DataFrame(forecasts)

print(forecast_df)



          Code                      Suburb  2023_Forecast  2024_Forecast  \
0    206011106              Brunswick East          13690          14083   
1    206011107              Brunswick West          14684          14684   
2    206011109           Pascoe Vale South          10388          10375   
3    206011495           Brunswick - North          13254          13254   
4    206011496           Brunswick - South          13364          13364   
..         ...                         ...            ...            ...   
603  217041480         Warrnambool - South          13331          13347   
604      21704                 Warrnambool          53358          53704   
605        217  Warrnambool and South West         128119         128579   
606      2RVIC                Rest of Vic.        1610387        1630549   
607          2              Total Victoria        6724886        6823809   

     2025_Forecast  2026_Forecast  2027_Forecast  
0            14477          14871   

In [15]:
# Write the forecast table to the project's landing data directory as CSV.
forecast_output_path = '/Users/danielletran/Desktop/ADS_A2/project-2-group-real-estate-industry-project-34/data/landing/forecasted_populations.csv'
forecast_df.to_csv(forecast_output_path)
