# Population Distribution Prediction
This notebook fits a separate linear regression of `PopTotal` on `Time` and `AgeGrp` for each `LocID` and combines the results at the end.

## Import Required Libraries
Import the necessary libraries, including pandas, sklearn, and os.

In [1]:
# Import required libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os

## Load the Dataset
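Load the training and testing datasets using pandas' read_csv function with low_memory=False, keep only the `Country/Area` rows, restrict the test set to the years 2022 to 2024, and convert `AgeGrp` to integers.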

In [2]:
# Load the training dataset
train_df = pd.read_csv("data/WPP2022_PopulationBySingleAgeSex_Medium_1950-2021.csv", low_memory=False)

# Load the testing dataset
test_df = pd.read_csv("data/WPP2022_PopulationBySingleAgeSex_Medium_2022-2100.csv", low_memory=False)

# Filter the datasets to include only rows where LocTypeName is 'Country/Area'.
# The training frame is copied explicitly: it is modified below via .loc, and writing
# into a filtered slice without a copy triggers pandas' SettingWithCopyWarning.
train_df = train_df[train_df['LocTypeName'] == 'Country/Area'].copy()
test_df = test_df[test_df['LocTypeName'] == 'Country/Area']

# Filter the test dataset for the years 2022 to 2024 (copied for the same reason as above)
test_df = test_df[(test_df['Time'] >= 2022) & (test_df['Time'] <= 2024)].copy()

# Change AgeGrp from '100+' to 100 in both datasets, then cast the column to int
# so it can be used as a numeric regression feature
train_df.loc[train_df['AgeGrp'] == '100+', 'AgeGrp'] = 100
test_df.loc[test_df['AgeGrp'] == '100+', 'AgeGrp'] = 100
train_df['AgeGrp'] = train_df['AgeGrp'].astype(int)
test_df['AgeGrp'] = test_df['AgeGrp'].astype(int)


## Define the Regression Function
Define a function to perform regression for each LocID and combine the results.

In [3]:
def perform_regression(train_df, test_df):
    """Fit one linear regression per LocID and evaluate it on that LocID's test rows.

    For every unique ``LocID`` in ``train_df`` a ``LinearRegression`` is fitted on the
    ``Time`` and ``AgeGrp`` columns to predict ``PopTotal``. The fitted model is applied
    to the rows of ``test_df`` with the same ``LocID``; MAE, MSE and R^2 for that
    LocID are printed and collected.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training rows; must contain ``LocID``, ``Time``, ``AgeGrp`` and ``PopTotal``.
    test_df : pandas.DataFrame
        Evaluation rows; must contain the same four columns.

    Returns
    -------
    combined_predictions : pandas.DataFrame
        Columns ``Time``, ``AgeGrp``, ``LocID`` and ``Predicted_PopTotal``. Rows are
        grouped by LocID in order of first appearance in ``train_df`` and keep the
        index labels of ``test_df``, so they can be aligned back to it.
    combined_metrics : pandas.DataFrame
        One row per evaluated LocID with columns ``LocID``, ``mae``, ``mse``, ``r2``.

    Notes
    -----
    LocIDs that have no rows in ``test_df`` are skipped (with a printed notice),
    because predicting and scoring on zero samples raises. LocIDs that occur only
    in ``test_df`` are never predicted, since no model is trained for them.
    """
    prediction_columns = ['Time', 'AgeGrp', 'LocID', 'Predicted_PopTotal']
    metric_columns = ['LocID', 'mae', 'mse', 'r2']

    # Collect per-LocID results in lists and combine once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame on every iteration.
    prediction_frames = []
    metric_records = []

    # Loop through each LocID seen in the training data
    for loc_id in train_df['LocID'].unique():
        # Filter data for the current LocID
        train_loc_df = train_df[train_df['LocID'] == loc_id]
        test_loc_df = test_df[test_df['LocID'] == loc_id]

        if test_loc_df.empty:
            # Nothing to evaluate against; skip instead of aborting the whole run
            print(f"LocID: {loc_id} - skipped (no test rows)")
            continue

        # Select features and target
        X_train = train_loc_df[['Time', 'AgeGrp']]
        Y_train = train_loc_df['PopTotal']
        X_test = test_loc_df[['Time', 'AgeGrp']]
        Y_test = test_loc_df['PopTotal']

        # Train the model and make predictions
        model = LinearRegression()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)

        # Evaluate the model
        mae = mean_absolute_error(Y_test, Y_pred)
        mse = mean_squared_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print(f"LocID: {loc_id} - Mean Absolute Error: {mae}")
        print(f"LocID: {loc_id} - Mean Squared Error: {mse}")
        print(f"LocID: {loc_id} - R^2 Score: {r2}")

        # Keep the identifying columns (and the test_df index) next to the predictions
        predictions_df = test_loc_df[['Time', 'AgeGrp', 'LocID']].copy()
        predictions_df['Predicted_PopTotal'] = Y_pred
        prediction_frames.append(predictions_df)

        metric_records.append({'LocID': loc_id, 'mae': mae, 'mse': mse, 'r2': r2})

    # pd.concat raises on an empty list, so fall back to an empty, correctly-shaped frame
    if prediction_frames:
        combined_predictions = pd.concat(prediction_frames)
    else:
        combined_predictions = pd.DataFrame(columns=prediction_columns)
    combined_metrics = pd.DataFrame(metric_records, columns=metric_columns)

    return combined_predictions, combined_metrics


## Perform Regression and Save Results
Call the regression function and save the combined results to files.

In [4]:
# Perform regression
combined_predictions, combined_metrics = perform_regression(train_df, test_df)

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs("data/test", exist_ok=True)

# Save the combined predictions to a file
combined_predictions.to_csv("data/test/combined_predictions_2022_2024.csv", index=False)

# Save the combined metrics to a file
combined_metrics.to_csv("data/test/combined_metrics_2022_2024.csv", index=False)

# Evaluate the model on the combined predictions.
# combined_predictions is regrouped per LocID, so its row order (and row count, if some
# LocIDs were not predicted) need not match test_df. It keeps test_df's index labels,
# so look up the ground truth by index instead of relying on positional order.
actual_pop_total = test_df.loc[combined_predictions.index, 'PopTotal']
predicted_pop_total = combined_predictions['Predicted_PopTotal']

mae_combined = mean_absolute_error(actual_pop_total, predicted_pop_total)
mse_combined = mean_squared_error(actual_pop_total, predicted_pop_total)
r2_combined = r2_score(actual_pop_total, predicted_pop_total)

print(f"Combined - Mean Absolute Error: {mae_combined}")
print(f"Combined - Mean Squared Error: {mse_combined}")
print(f"Combined - R^2 Score: {r2_combined}")

LocID: 108 - Mean Absolute Error: 65.27128061642478
LocID: 108 - Mean Squared Error: 8697.501819861433
LocID: 108 - R^2 Score: 0.5463398902799658
LocID: 174 - Mean Absolute Error: 3.079386204356907
LocID: 174 - Mean Squared Error: 15.031515692215873
LocID: 174 - R^2 Score: 0.7267258293324834
LocID: 262 - Mean Absolute Error: 4.208525481720999
LocID: 262 - Mean Squared Error: 20.69116188445457
LocID: 262 - R^2 Score: 0.7259526831544008
LocID: 232 - Mean Absolute Error: 14.622738586295773
LocID: 232 - Mean Squared Error: 328.6811666585859
LocID: 232 - R^2 Score: 0.729787693759118
LocID: 231 - Mean Absolute Error: 588.4772097268597
LocID: 231 - Mean Squared Error: 586602.4429022871
LocID: 231 - R^2 Score: 0.5924601173206661
LocID: 404 - Mean Absolute Error: 242.68991538752783
LocID: 404 - Mean Squared Error: 86277.55232313447
LocID: 404 - R^2 Score: 0.6603826295365403
LocID: 450 - Mean Absolute Error: 131.9229295459763
LocID: 450 - Mean Squared Error: 29997.79303769785
LocID: 450 - R^2 Sc