# Feature Prediction

Use the historical population (2016-2021), income (2016-2019), and crime rate (2016-2023) data to predict
future values: population and income are predicted for 2023-2026, and crime rate for 2024-2026.

1. Read the training data prepared earlier.
2. Fit a Linear Regression model, assuming the relationship is linear.
3. Perform hyperparameter tuning using Grid Search.
4. Save the predicted values as test data.

In [5]:
import pandas as pd
import geopandas as gpd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import pmdarima as pm
import os

In [6]:
def predict_future(data_directory, future_years, sa2_to_predict, feature, file_name):
    """
    Fit a linear regression on historical yearly data and predict a feature for future years.

    Every ``.csv`` file in ``data_directory`` is read with its ``SA2`` column as the index,
    tagged with the year parsed from its filename, and stacked into one training set.
    A ``LinearRegression`` on the predictors ``['Year', 'SA2']`` is tuned with a 5-fold
    grid search over ``fit_intercept``; the best estimator then predicts ``feature`` for
    every combination of ``future_years`` and ``sa2_to_predict``.

    Parameters:
    - data_directory: Directory holding the historical CSV files. The year is taken from
      the third ``_``-separated token of each filename (text before the first ``.``),
      e.g. ``<a>_<b>_2016.csv``. Each file must contain an ``SA2`` column and ``feature``.
    - future_years: A list of years to predict for.
    - sa2_to_predict: A list of SA2 codes to predict for.
    - feature: The column in the historical data to use as the target.
    - file_name: Name of the output sub-directory and filename prefix.

    Returns:
    - None

    Raises:
    - FileNotFoundError: if ``data_directory`` contains no CSV files.

    Output (written under ``../data/curated/predict/{file_name}/``, created if missing):
    - ``{file_name}_all.csv`` with columns ``Year``, ``SA2`` and ``feature``.
    - ``{file_name}_{year}.csv`` per future year with columns ``SA2`` and ``feature``.

    NOTE(review): the SA2 code enters the model as an ordinary numeric predictor;
    confirm that this is the intended modelling choice.
    """

    # Sorted so the training row order (and hence the unshuffled CV folds) does not
    # depend on the arbitrary ordering returned by os.listdir.
    data_filenames = sorted(
        os.path.join(data_directory, filename)
        for filename in os.listdir(data_directory)
        if filename.endswith('.csv')
    )
    if not data_filenames:
        raise FileNotFoundError(f"No CSV files found in '{data_directory}'")

    # Load each year's file, remembering which year it came from
    yearly_frames = []
    for filename in data_filenames:
        base_filename = os.path.basename(filename)

        # Third '_'-separated token carries the year, followed by the extension
        year_part = base_filename.split('_')[2]
        year = int(year_part.split('.')[0])

        data = pd.read_csv(filename, index_col='SA2')
        data['Year'] = year
        yearly_frames.append(data)

    # Concatenate once (no empty seed frame) and turn the SA2 index back into a column
    train_data = pd.concat(yearly_frames).reset_index()

    # Define features (X) and target variable (Y)
    train_X = train_data[['Year', 'SA2']]
    train_Y = train_data[feature]

    # Hyperparameter tuning: the only option searched is whether to fit an intercept
    param_grid = {
        'fit_intercept': [True, False],
    }
    grid_search = GridSearchCV(LinearRegression(), param_grid, cv=5)
    grid_search.fit(train_X, train_Y)
    best_lr_model = grid_search.best_estimator_

    # Build the (year, SA2) grid: all SA2 codes for the first year, then the next year, ...
    predictions = pd.DataFrame({
        'Year': [year for year in future_years for _ in sa2_to_predict],
        'SA2': [sa2 for _ in future_years for sa2 in sa2_to_predict],
    })
    predictions[feature] = best_lr_model.predict(predictions[['Year', 'SA2']])

    # to_csv does not create missing directories, so make sure the target exists
    output_directory = f'../data/curated/predict/{file_name}'
    os.makedirs(output_directory, exist_ok=True)
    predictions.to_csv(f'{output_directory}/{file_name}_all.csv', index=False)

    # One file per predicted year, without the (constant) Year column
    for year_name in future_years:
        year_pred = predictions.loc[predictions['Year'] == year_name, ['SA2', feature]]
        year_pred.to_csv(f'{output_directory}/{file_name}_{year_name}.csv', index=False)

    return

In [7]:
# Collect every SA2 code listed in the Victorian 2021 SA2 boundary shapefile
gdf = gpd.read_file('../data/raw/SA2_shapefile/VIC_SA2_2021_AUST_GDA2020.shp')
sa2_to_predict = gdf['SA2_CODE21'].tolist()

future_years = [2023, 2024, 2025, 2026]
FEATURES = ['income', 'population', 'crime']

# Run the same prediction routine for each feature's training directory
for feature in FEATURES:
    data_directory = f'../data/curated/train/{feature}_data/'

    # Crime already has observed 2023 data, so its first predicted year is dropped
    years_for_feature = future_years[1:] if feature == 'crime' else future_years

    predict_future(
        data_directory,
        years_for_feature,
        sa2_to_predict,
        f'avg_{feature}',
        f'{feature}_data',
    )