In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
 ~ ~ ~ ~ ~ ~ ~  Population forecasts with AI  ~ ~ ~ ~ ~ ~ ~
         Small area population forecasts with LSTM
 -----------------------------------------------------------
'''
import pandas as pd

# Load the Berlin population series
# Raw GitHub URL for the CSV
url = "https://raw.githubusercontent.com/rogon666/AI_workshop/refs/heads/main/02_databases/Berlinpopulation.csv"

# Read the CSV straight from the URL into a DataFrame
df = pd.read_csv(url)

# Fail early, with a readable message, if the columns the later cells rely on
# ('year' and 'population') are not present in the downloaded file.
missing_cols = {'year', 'population'} - set(df.columns)
if missing_cols:
    raise ValueError(f"Downloaded CSV is missing expected column(s): {sorted(missing_cols)}")

# Preview the first rows (last expression of the cell, so it is displayed)
df.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.utils import set_random_seed

# Train/test split
# The last `test_size` rows are held out for testing; everything before them is
# used for training. This assumes the rows of df are in chronological order
# -- TODO confirm against the CSV.
test_size = 12  # <----------------------------------  years for testing (e.g. 12)

# Guard against values that would leave the train or the test set empty.
if not 0 < test_size < len(df):
    raise ValueError(
        f"test_size must be between 1 and {len(df) - 1}, got {test_size}"
    )

# Use an explicit split position instead of negative slicing:
# df.iloc[:-0] would silently return an empty training set.
split_at = len(df) - test_size
train_df = df.iloc[:split_at]
test_df = df.iloc[split_at:]

# Prepare features and targets
# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
X_train = train_df['year'].values.reshape(-1, 1)
y_train = train_df['population'].values
X_test = test_df['year'].values.reshape(-1, 1)
y_test = test_df['population'].values

# 1. Linear Regression
# Straight-line trend: population as a linear function of the year.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
pred_lin = lin_reg.predict(X_test)

# 2. Polynomial Regression (deg=2 and deg=4)
# The year is expanded into polynomial terms and an ordinary linear regression
# is fitted on the expanded features. The expansion is fitted on the training
# years and then re-applied unchanged to the test years.
# NOTE(review): the powers are taken of the raw 'year' values; if these are
# calendar years the degree-4 terms become very large -- consider centring
# the year before expanding. Confirm before changing.
poly2 = PolynomialFeatures(degree=2) # <----------- degree of the first polynomial
X_train_poly2 = poly2.fit_transform(X_train)
X_test_poly2 = poly2.transform(X_test)
lin_reg2 = LinearRegression().fit(X_train_poly2, y_train)
pred_poly2 = lin_reg2.predict(X_test_poly2)

poly4 = PolynomialFeatures(degree=4) # <----------- degree of the second polynomial
X_train_poly4 = poly4.fit_transform(X_train)
X_test_poly4 = poly4.transform(X_test)
lin_reg4 = LinearRegression().fit(X_train_poly4, y_train)
pred_poly4 = lin_reg4.predict(X_test_poly4)

# 3. LSTM Model
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable between runs (some GPU kernels may still introduce
# small run-to-run differences).
SEED = 42
set_random_seed(SEED)

# Scale the 'population' series using train data only, so that nothing from
# the test period influences the scaling.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_df['population'].values.reshape(-1, 1))

# Create LSTM sequences
sequence_length = 30 # <-------------------------------- years in the look-back sliding window

# At least one full window plus one target value is needed to train.
if len(train_scaled) <= sequence_length:
    raise ValueError(
        f"sequence_length ({sequence_length}) must be smaller than the number "
        f"of training observations ({len(train_scaled)})"
    )

# Each sample is the `sequence_length` scaled values preceding position i;
# its target is the scaled value at position i.
x_train_seq, y_train_seq = [], []
for i in range(sequence_length, len(train_scaled)):
    x_train_seq.append(train_scaled[i-sequence_length:i, 0])
    y_train_seq.append(train_scaled[i, 0])
# Keras LSTM layers take input shaped (samples, timesteps, features).
x_train_seq = np.array(x_train_seq).reshape(-1, sequence_length, 1)
y_train_seq = np.array(y_train_seq)

# Build and train LSTM
# Two stacked LSTM layers; the first returns the full sequence so the second
# can consume it, and a single Dense unit outputs the next (scaled) value.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(sequence_length, 1)),
    LSTM(50),
    Dense(1)
])
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train_seq, y_train_seq, epochs=32, batch_size=1, verbose=1)

# Prepare test sequences for LSTM
# The whole series is scaled with the train-fitted scaler, then the last
# (test_size + sequence_length) values are kept so that every test year has a
# full look-back window in front of it.
# NOTE: the windows are cut from the observed series, so windows for later test
# years contain observed test values. These are one-step-ahead predictions,
# not a recursive multi-step forecast.
full_scaled = scaler.transform(df['population'].values.reshape(-1, 1))
inputs = full_scaled[-(test_size + sequence_length):, 0]
x_test_seq = np.array([inputs[i-sequence_length:i] for i in range(sequence_length, len(inputs))])
x_test_seq = x_test_seq.reshape(-1, sequence_length, 1)

# Forecast with LSTM
# Predictions come out in scaled units; invert the scaling to get population units.
pred_lstm_scaled = model.predict(x_test_seq, verbose=0)
pred_lstm = scaler.inverse_transform(pred_lstm_scaled).flatten()

print('----- Training completed ------')

In [None]:
# --- In-sample (training-period) predictions for every fitted model ---

# Regression models: each one predicts on the same design matrix it was
# fitted on. Order: linear, polynomial (deg=2), polynomial (deg=4).
pred_lin_train, pred_poly2_train, pred_poly4_train = (
    fitted_model.predict(design_matrix)
    for fitted_model, design_matrix in (
        (lin_reg, X_train),
        (lin_reg2, X_train_poly2),
        (lin_reg4, X_train_poly4),
    )
)

# LSTM: predict on the training windows (x_train_seq). The network output is
# in the scaler's units, so it is mapped back with the inverse transform,
# which needs a single-column 2-D array.
pred_s_train = model.predict(x_train_seq, verbose=0).flatten()
pred_lstm_train = scaler.inverse_transform(pred_s_train[:, None]).flatten()

def _plot_in_sample_fit(fit_years, fit_values, fit_label, fit_color, title):
    """Open a new figure showing the training series and one model's in-sample fit.

    Parameters
    ----------
    fit_years : array-like
        x-values (years) at which the fitted values are available.
    fit_values : array-like
        In-sample predictions of the model, aligned with ``fit_years``.
    fit_label : str
        Legend entry for the fitted curve.
    fit_color : str
        Matplotlib colour of the fitted curve.
    title : str
        Figure title.

    The training series is read from the notebook-level ``train_df`` and
    ``y_train``. The figure is left open; the caller is expected to call
    ``plt.show()``.
    """
    plt.figure(figsize=(10,5))
    plt.plot(train_df['year'], y_train,
             label='Train Data', color='black', alpha=0.6)
    plt.plot(fit_years, fit_values,
             label=fit_label, color=fit_color, linestyle='--')
    plt.title(title)
    plt.xlabel('year'); plt.ylabel('Population')
    plt.legend(); plt.grid(True)


# --- Plot 1: Linear fit on train ---
_plot_in_sample_fit(train_df['year'], pred_lin_train,
                    'Linear Fit', 'red',
                    'Linear Regression: In-Sample Fit')

# --- Plot 2: Polynomial (deg=2) fit on train ---
_plot_in_sample_fit(train_df['year'], pred_poly2_train,
                    'Poly deg=2 Fit', 'green',
                    'Polynomial (deg=2): In-Sample Fit')

# --- Plot 3: Polynomial (deg=4) fit on train ---
_plot_in_sample_fit(train_df['year'], pred_poly4_train,
                    'Poly deg=4 Fit', 'purple',
                    'Polynomial (deg=4): In-Sample Fit')

# --- Plot 4: LSTM fit on train (from year L onward) ---
# The first `sequence_length` training years have no full look-back window and
# therefore no LSTM prediction, so the fitted curve starts after them.
years_lstm_train = train_df['year'].values[sequence_length:]
_plot_in_sample_fit(years_lstm_train, pred_lstm_train,
                    'LSTM Fit', 'orange',
                    f'LSTM: In‐Sample Fit (first {sequence_length} years omitted)')

plt.show()


In [None]:
# Plot comparison: observed series, held-out test years and the four forecasts
plt.figure(figsize=(6, 4))

# This line covers the whole of df (training AND test years), so it is
# labelled as the observed series rather than as training data.
plt.plot(df['year'], df['population'], label='Observed data', color='black',linewidth=2)

# Hollow square markers highlight the held-out test years on top of that line.
plt.plot(test_df['year'], y_test, label='Test Data', color='black', marker ='s', markerfacecolor='none', markeredgecolor='black', markersize=4)

# All forecasts share the same dashed style; colours come from the default cycle.
forecast_style = dict(linestyle='--', linewidth=2)
plt.plot(test_df['year'], pred_lin, label='Linear Forecast', **forecast_style)
plt.plot(test_df['year'], pred_poly2, label='Polynomial (deg=2)', **forecast_style)
plt.plot(test_df['year'], pred_poly4, label='Polynomial (deg=4)', **forecast_style)
plt.plot(test_df['year'], pred_lstm, label='LSTM Forecast', **forecast_style)

plt.title('Berlin Population: Actual vs. Forecasts')
plt.xlabel('year')
plt.ylabel('Population')
plt.legend()
plt.grid(True)
plt.show()
