Import the tools needed to conduct the regression analysis.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

Prepare and load dataset used for linear regression

In [None]:
# Read the yearly population table: 'Year' is the single predictor and
# 'Population' is the response.
file_path = 'linear.xlsx'
data = pd.read_excel(file_path)

# The estimators used below expect a 2-D feature matrix, so the year column
# is reshaped to (n_samples, 1); the target stays 1-D.
X = data['Year'].to_numpy().reshape(-1, 1)
y = data['Population'].to_numpy()

Build a linear regression model and validate it with leave-one-out cross-validation (LOOCV).

In [None]:
# Leave-one-out cross-validation of a simple linear regression on (X, y).
# Each observation is held out once; a fresh model is fitted on the remaining
# rows and used to predict the held-out one.
# LeaveOneOut lives in sklearn.model_selection and must be imported in the
# imports cell, otherwise this cell fails with NameError on a fresh kernel.
loo = LeaveOneOut()
intercepts = []   # intercept of the model fitted in each fold
slopes = []       # slope (single coefficient) of the model fitted in each fold
predictions = []  # prediction for the held-out observation of each fold
actuals = []      # true value of the held-out observation of each fold

for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model_loo = LinearRegression()
    model_loo.fit(X_train, y_train)
    intercepts.append(model_loo.intercept_)
    slopes.append(model_loo.coef_[0])

    # The test fold contains exactly one row, hence the [0] indexing.
    y_pred = model_loo.predict(X_test)
    predictions.append(y_pred[0])
    actuals.append(y_test[0])

# These are averages of the per-fold coefficients (a summary of the folds),
# not the coefficients of a single model fitted on the full dataset.
average_intercept = np.mean(intercepts)
average_slope = np.mean(slopes)

# Out-of-sample error: every prediction was made by a model that never saw
# the corresponding observation.
mse_loo = mean_squared_error(actuals, predictions)

Print the result and MSE

In [None]:
# Report the fold-averaged coefficients and the LOOCV error, one per line.
print(
    f"average_intercept: {average_intercept}",
    f"average_slope: {average_slope}",
    f"LOOCV MSE: {mse_loo}",
    sep="\n",
)


After building the linear regression model, let's move on to building the LASSO model.

Load and prepare the dataset used for LASSO regression, then split it into a training set and a test set.

In [None]:
# Reload the same workbook so this section can be run independently of the
# LOOCV cells above.
file_path = 'linear.xlsx'
data = pd.read_excel(file_path)

# Single predictor as an (n_samples, 1) matrix, target as a 1-D vector.
X = data['Year'].to_numpy().reshape(-1, 1)
y = data['Population'].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Start to build the LASSO regression model.

In [None]:
# Fit an L1-regularised linear model on the training split
# (alpha is the regularisation strength).
lasso_model = Lasso(alpha=0.1)
lasso_model = lasso_model.fit(X_train, y_train)  # fit() returns the estimator itself

# Score the model on the held-out test split.
y_pred = lasso_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)


Print the calculation result.

In [None]:
# Report the fitted LASSO parameters and the test-set metrics, one per line.
print(
    f"Intercept: {lasso_model.intercept_}",
    f"Coefficients: {lasso_model.coef_}",
    f"MSE: {mse}",
    f"R^2: {r2}",
    sep="\n",
)

After building the LASSO regression model, we build polynomial regression models broken down by District Council district (DCD), gender, and age group.

Prepare dataset differentiated by DCDs.

In [None]:
# Load the per-district population table and collect the distinct district
# names in order of first appearance.
data = pd.read_excel('dcd_population.xlsx')
districts = pd.unique(data['District Council district (DCD)'])

Try to build polynomial regression models with degree = 3. Print the result.

In [None]:
# Fit one polynomial trend (population as a function of year) per district.
degree = 3
models, mse_scores = {}, {}  # district -> fitted pipeline / in-sample MSE

for district in districts:
    # Keep only the rows that belong to the current district.
    in_district = data['District Council district (DCD)'] == district
    district_data = data[in_district]

    X = district_data['Year'].to_numpy().reshape(-1, 1)
    y = district_data['Total population'].to_numpy()

    # Polynomial feature expansion followed by ordinary least squares.
    polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    polyreg.fit(X, y)

    # The error is measured on the same rows the model was fitted on,
    # so this is a training (in-sample) MSE.
    y_pred = polyreg.predict(X)
    mse = mean_squared_error(y, y_pred)

    models[district], mse_scores[district] = polyreg, mse

    print(f'District: {district}')
    print(f'Intercept: {polyreg.named_steps["linearregression"].intercept_}')
    print(f'Coefficients: {polyreg.named_steps["linearregression"].coef_}')
    print(f'MSE: {mse}\n')

Use the built models to predict data in 2025, 2028, and 2031.

In [None]:
# Forecast the total population of every district for the target years.
future_years = [2025, 2028, 2031]

for district, district_model in models.items():
    print(f"Predictions for {district}:")

    for year in future_years:
        # The pipeline expects a 2-D input, so the year is wrapped as a 1x1 matrix.
        X_future = np.array([year]).reshape(-1, 1)
        predicted_population = district_model.predict(X_future)
        print(f"Year {year}: Predicted population is {predicted_population[0]:.0f}")

    print("\n")

Build polynomial models for each gender using the same method. Print the results after fitting.

In [None]:
# Fit one polynomial trend (population as a function of year) for every
# (district, gender) pair found in the workbook.
data = pd.read_excel('gender.xlsx')
districts = data['District Council district (DCD)'].unique()
genders = data['Sex'].unique()
degree = 3
models = {}      # (district, gender) -> fitted pipeline
mse_scores = {}  # (district, gender) -> in-sample MSE

for district in districts:
    for gender in genders:
        district_gender_data = data[(data['District Council district (DCD)'] == district) & (data['Sex'] == gender)]

        # Districts and genders are collected independently, so a given pair
        # may have no rows at all; fitting on an empty frame would raise.
        if district_gender_data.empty:
            print(f'District: {district}, Gender: {gender} - no data, skipped\n')
            continue

        X = district_gender_data['Year'].values.reshape(-1, 1)
        y = district_gender_data['Population'].values

        # Polynomial feature expansion followed by ordinary least squares.
        polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
        polyreg.fit(X, y)

        # Error on the rows used for fitting, i.e. a training (in-sample) MSE.
        y_pred = polyreg.predict(X)

        mse = mean_squared_error(y, y_pred)

        models[(district, gender)] = polyreg
        mse_scores[(district, gender)] = mse

        # Print the district as well as the gender: the same gender labels
        # repeat for every district, so the gender alone does not identify
        # which model the numbers belong to.
        print(f'District: {district}')
        print(f'Gender: {gender}')
        print(f'Intercept: {polyreg.named_steps["linearregression"].intercept_}')
        print(f'Coefficients: {polyreg.named_steps["linearregression"].coef_}')
        print(f'MSE: {mse}\n')

Use the built models to predict data in 2025, 2028, and 2031.

In [None]:
# Forecast the population of every (district, gender) pair for the target
# years, predicting all years in a single call per model.
prediction_years = np.array([[2025], [2028], [2031]])
predictions = {}  # (district, gender) -> array of forecasts, one per year

for (district, gender), model in models.items():
    predicted_population = model.predict(prediction_years)
    predictions[(district, gender)] = predicted_population

    print(f"Predictions for {district} - {gender}:")
    for year, population in zip(prediction_years.ravel(), predicted_population):
        print(f"Year {year}: {population:.0f}")
    print("\n")

Build polynomial models for each age group using the same method. Print the results after fitting.

In [None]:
# Fit one polynomial trend (population as a function of year) per age group.
# Each age group is a separate column of the workbook.
data = pd.read_excel('age.xlsx')
age_groups = ['0 - 14', '15 - 24', '25 - 34', '35 - 44', '45 - 54', '55 - 64', '65 and over']
degree = 3
models, mse_scores = {}, {}  # age group -> fitted pipeline / in-sample MSE

# Every age group shares the same year column, so the feature matrix is
# built once instead of on every iteration.
X = data['Year'].to_numpy().reshape(-1, 1)

for age_group in age_groups:
    y = data[age_group].to_numpy()

    # Polynomial feature expansion followed by ordinary least squares.
    polyreg = make_pipeline(PolynomialFeatures(degree), LinearRegression())
    polyreg.fit(X, y)

    # Error on the rows used for fitting, i.e. a training (in-sample) MSE.
    y_pred = polyreg.predict(X)
    mse = mean_squared_error(y, y_pred)

    models[age_group], mse_scores[age_group] = polyreg, mse

    print(f'Age Group: {age_group}')
    print(f'Intercept: {polyreg.named_steps["linearregression"].intercept_}')
    print(f'Coefficients: {polyreg.named_steps["linearregression"].coef_}')
    print(f'MSE: {mse}\n')

Use the built models to predict data in 2025, 2028, and 2031.

In [None]:
# Forecast the population of every age group for the target years,
# predicting all years in a single call per model.
future_years = np.array([[2025], [2028], [2031]])
future_predictions = {}  # age group -> array of forecasts, one per year

for age_group, model in models.items():
    predicted_population = model.predict(future_years)
    future_predictions[age_group] = predicted_population

    print(f"Predictions for Age Group: {age_group}")
    for year, population in zip(future_years.ravel(), predicted_population):
        print(f"Year {year}: {population:.2f}")
    print()