# Import Required Libraries
Import the necessary libraries, including pandas, sklearn, and os.

In [17]:
# Import required libraries
import os

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Dataset
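Load the training (1950-2021) and testing (2022-2100) datasets using pandas' read_csv function with low_memory=False, keep only the test years 2022-2024, convert the '100+' age group to the integer 100, and replace 'Location' with each location's mean 'PopTotal' computed from the training data.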

In [18]:
# Load the training dataset (1950-2021)
train_df = pd.read_csv("data/WPP2022_PopulationBySingleAgeSex_Medium_1950-2021.csv", low_memory=False)

# Load the testing dataset (2022-2100)
test_df = pd.read_csv("data/WPP2022_PopulationBySingleAgeSex_Medium_2022-2100.csv", low_memory=False)

# Keep only the test years 2022 to 2024 (both ends inclusive); .copy() gives an
# independent frame so the column assignments below do not act on a slice
test_df = test_df[test_df['Time'].between(2022, 2024)].copy()

# Convert AgeGrp to integers, mapping the open-ended '100+' group to 100.
# The label is replaced by the string '100' (instead of assigning the integer
# 100 into a text column) so the column stays a single type until the cast.
train_df['AgeGrp'] = train_df['AgeGrp'].replace('100+', '100').astype(int)
test_df['AgeGrp'] = test_df['AgeGrp'].replace('100+', '100').astype(int)

# Encode 'Location' as the mean PopTotal of that location, computed on the
# training data only and then applied to both frames
location_mean = train_df.groupby('Location')['PopTotal'].mean()
train_df['Location'] = train_df['Location'].map(location_mean)
# A test location that never occurs in the training data maps to NaN, which
# LinearRegression cannot accept; fall back to the overall training mean
test_df['Location'] = test_df['Location'].map(location_mean).fillna(train_df['PopTotal'].mean())


# Select Features and Target
Select the features 'Time', 'AgeGrp', and 'Location' (already mean-encoded to a numeric value in the previous step) and the target column 'PopTotal'.

In [19]:
# List the columns available in the training frame before choosing features
column_index = train_df.columns
print(column_index)

Index(['SortOrder', 'LocID', 'Notes', 'ISO3_code', 'ISO2_code', 'SDMX_code',
       'LocTypeID', 'LocTypeName', 'ParentID', 'Location', 'VarID', 'Variant',
       'Time', 'MidPeriod', 'AgeGrp', 'AgeGrpStart', 'AgeGrpSpan', 'PopMale',
       'PopFemale', 'PopTotal'],
      dtype='object')


In [20]:
# Build the regression inputs: three numeric predictors and the PopTotal target
feature_columns = ['Time', 'AgeGrp', 'Location']
data_X = train_df[feature_columns]
data_Y = train_df['PopTotal']

# Preview the first rows of the feature matrix
features_preview = data_X.head()
print(features_preview)


   Time  AgeGrp      Location
0  1950       0  49589.039509
1  1950       1  49589.039509
2  1950       2  49589.039509
3  1950       3  49589.039509
4  1950       4  49589.039509


In [21]:
# Preview the first rows of the target series
target_preview = data_Y.head()
print(target_preview)

0    81711.675
1    72672.416
2    66708.833
3    62569.586
4    58214.285
Name: PopTotal, dtype: float64


In [22]:
# Preview the test rows: the three model inputs next to the true PopTotal
preview_columns = ['Time', 'AgeGrp', 'Location', 'PopTotal']
test_preview = test_df[preview_columns].head()
print(test_preview)

   Time  AgeGrp      Location    PopTotal
0  2022       0  49589.039509  130904.542
1  2022       1  49589.039509  130519.393
2  2022       2  49589.039509  132047.198
3  2022       3  49589.039509  133853.549
4  2022       4  49589.039509  135781.617


In [23]:
# Initialize the Linear Regression model
model = LinearRegression()

# No random split is performed here: the training frame supplies the fitting
# data and the filtered test frame (2022-2024) is the held-out evaluation set
X_train = data_X
Y_train = data_Y
X_test = test_df[['Time', 'AgeGrp', 'Location']]
Y_test = test_df['PopTotal']

# Train the model
model.fit(X_train, Y_train)

# Make predictions for the held-out years
Y_pred = model.predict(X_test)

# Evaluate the model against the true PopTotal values of the test frame
mae = mean_absolute_error(Y_test, Y_pred)
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Save the predictions to a file. to_csv does not create missing parent
# directories, so make sure the output folder exists before writing.
predictions_path = "data/test/predictions_2022_2024.csv"
os.makedirs(os.path.dirname(predictions_path), exist_ok=True)

# NOTE: 'Location' in this frame is the mean-encoded numeric value produced
# when the data was loaded, not the original location name
predictions_df = X_test.copy()
predictions_df['Predicted_PopTotal'] = Y_pred
predictions_df.to_csv(predictions_path, index=False)
print(predictions_df.head())

Mean Absolute Error: 2411.1796252350146
Mean Squared Error: 51118141.070942596
R^2 Score: 0.6047579487083637
