In [None]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
from matplotlib import rcParams
from keras.layers import Dense
from keras.layers import LSTM
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input
from sklearn.metrics import r2_score 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

%matplotlib inline
#%tensorflow_version 1.x
import tensorflow as tf
#print(tf.__version_)

In [None]:
# Read the semicolon-separated pollution/mortality table.
# Column 0 becomes the index and pandas attempts to parse it as dates.
csv_path = 'C:/Users/User/Documents/GitHub/Health-impacts-of-air-pollution/MortData/GertPollCardMort.csv'
df = pd.read_csv(csv_path, sep=';', header=0, index_col=0, parse_dates=True)

In [None]:
# Keep only complete observations: a row is discarded if any of its columns is missing
df_cleaned = df.dropna(axis=0, how='any')

In [None]:
# The mortality count is what the model predicts; every other column is a predictor
target = 'death_count'
features = df_cleaned.drop(labels=[target], axis='columns')

In [None]:
# Scale the features and the target separately.
#
# The scalers are fitted on the chronologically first 80% of the rows only and
# then applied to the whole series. Fitting on every row would let the min/max
# of the held-out test period leak into the training data. The later split in
# this notebook trains on the first 80% of the windows, so these rows belong to
# the training portion; keep the fraction here in step with that split.
# Scaled values from the test period may therefore fall slightly outside [0, 1].
n_fit_rows = int(len(df_cleaned) * 0.8)

scaler_features = MinMaxScaler(feature_range=(0, 1))
scaler_target = MinMaxScaler(feature_range=(0, 1))

scaler_features.fit(features.iloc[:n_fit_rows])
scaler_target.fit(df_cleaned[[target]].iloc[:n_fit_rows])

scaled_features = scaler_features.transform(features)
scaled_target = scaler_target.transform(df_cleaned[[target]])

In [None]:
# Step 2: Prepare the Data for LSTM
def create_sequences(data, target_data, n_steps):
    """Build sliding-window samples for one-step-ahead forecasting.

    Window ``i`` holds rows ``i .. i + n_steps - 1`` of ``data`` and is paired
    with row ``i + n_steps`` of ``target_data`` (the value right after the
    window).

    Parameters
    ----------
    data : array-like
        Feature rows, first axis is time.
    target_data : array-like
        Target rows aligned with ``data`` on the first axis.
    n_steps : int
        Look-back length of each window; must be at least 1.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(data) - n_steps, n_steps, *data.shape[1:])``.
    y : numpy.ndarray
        Shape ``(len(data) - n_steps, *target_data.shape[1:])``.
        When the series is too short for a single window, both arrays are
        returned with zero samples but the correct trailing dimensions, so
        downstream shape look-ups such as ``X.shape[2]`` still work.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1, or ``target_data`` has fewer rows
        than ``data``.
    """
    data = np.asarray(data)
    target_data = np.asarray(target_data)

    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer.")

    n_windows = len(data) - n_steps
    if n_windows <= 0:
        # Not enough rows for even one window plus its target.
        empty_X = np.empty((0, n_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target_data.shape[1:], dtype=target_data.dtype)
        return empty_X, empty_y

    if len(target_data) < len(data):
        raise ValueError("target_data must have at least as many rows as data.")

    X = np.stack([data[start:start + n_steps] for start in range(n_windows)])
    # Targets are the rows immediately following each window (copied, not a view).
    y = np.array(target_data[n_steps:n_steps + n_windows])
    return X, y

# Every sample looks back over the 30 preceding daily records
n_steps = 30
X, y = create_sequences(data=scaled_features, target_data=scaled_target, n_steps=n_steps)

In [None]:
# Spell out the (samples, time steps, features) layout that the LSTM consumes
X = np.reshape(X, (X.shape[0], X.shape[1], X.shape[2]))

In [None]:
# Chronological hold-out: the first 80% of the windows train the model, the rest test it
train_size = int(len(X) * 0.8)
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

In [None]:
# Step 3: Build the LSTM Model
# One 50-unit LSTM reads each window; two 50-unit ReLU layers and a single
# linear unit turn its final state into the (scaled) prediction.
model = Sequential([
    LSTM(50, input_shape=(n_steps, X.shape[2])),
    Dense(50, activation="relu"),
    Dense(50, activation="relu"),
    Dense(1),
])
model.compile(loss='mse', optimizer='adam')


In [None]:
# Step 4: Train the Model
# 30 epochs over the training windows in mini-batches of 32, with progress output
model.fit(x=X_train, y=y_train, batch_size=32, epochs=30, verbose=1)

In [None]:
# Step 5: Make Predictions for Daily Mortality
# One prediction per held-out window, still on the scaled target axis
daily_predictions = model.predict(x=X_test)

In [None]:
# Undo the target scaling so the predictions are expressed in the original mortality units
daily_predictions_rescaled = scaler_target.inverse_transform(X=daily_predictions)

In [None]:
# Step 6: Aggregate Daily Predictions to Monthly Predictions
# The test windows are the last ones in the series, so their dates are the
# trailing entries of the cleaned frame's index.
prediction_dates = df_cleaned.index[-len(daily_predictions_rescaled):].rename('date')
df_predictions = pd.DataFrame(
    {'predicted_mortality': daily_predictions_rescaled.flatten()},
    index=prediction_dates,
)

# Add up the daily predictions within each calendar month
monthly_predictions = df_predictions.resample('M').sum()

In [None]:
# Bring the held-out targets back to the original mortality units for comparison
y_test_rescaled = scaler_target.inverse_transform(X=y_test)


In [None]:
# The test targets are the final rows of the series; take the same number of trailing dates
n_test_days = len(y_test_rescaled)
test_dates = df_cleaned.index[-n_test_days:]

In [None]:
# Date-indexed frame holding the observed mortality of the test period
df_actual_test = pd.DataFrame(
    data=y_test_rescaled.flatten(),
    index=test_dates,
    columns=['actual_mortality'],
)


In [None]:
# Add up the observed daily values within each calendar month
df_actual_monthly = df_actual_test.resample(rule='M').sum()


In [None]:
# Global font defaults for this and all later figures
rcParams.update({'font.weight': 'bold', 'font.size': '15'})

fig, ax = plt.subplots(figsize=(12, 8))

# Observed and predicted monthly totals on a shared date axis
ax.plot(df_actual_monthly.index, df_actual_monthly.values, label='Actual Monthly Mortality')
ax.plot(monthly_predictions.index, monthly_predictions['predicted_mortality'].values, label='Predicted Monthly Mortality')

# Axis labels and title share the same typeface and weight
heading_font = {'fontname': "Times New Roman", 'fontweight': "bold"}
ax.set_ylabel('Mortality', size=30, **heading_font)
ax.set_xlabel('Date', size=30, **heading_font)
# NOTE(review): the title names Nkangala, while the loading cell above reads
# GertPollCardMort.csv - confirm which district this figure shows.
ax.set_title('Nkangala cardiovascular mortality LSTM', size=28, **heading_font)

# Bold legend entries
ax.legend(prop={'weight': 'bold'})

plt.show()

In [None]:
# Mean absolute error between observed and predicted monthly totals
mean_absolute_error(y_true=df_actual_monthly, y_pred=monthly_predictions)

In [None]:
# Root mean squared error between observed and predicted monthly totals
monthly_mse = mean_squared_error(y_true=df_actual_monthly, y_pred=monthly_predictions)
rmse = sqrt(monthly_mse)
print(rmse)

In [None]:
# Coefficient of determination of the monthly predictions
r2_score(y_true=df_actual_monthly, y_pred=monthly_predictions)

# LSTM FOR SHAP

In [None]:
import shap
import numpy as np

# SHAP's KernelExplainer is computationally expensive, so only a random subset
# of the test windows is explained. The subset size is capped at the number of
# available windows (sampling without replacement would otherwise raise), and
# the draw is seeded so the same windows are picked on every run.
shap_rng = np.random.default_rng(42)
n_explained = min(100, X_test.shape[0])
sample_idx = shap_rng.choice(X_test.shape[0], size=n_explained, replace=False)
X_test_sample = X_test[sample_idx]

# Reshape the 3D data into 2D for SHAP (flatten time steps and features)
X_train_flattened = X_train.reshape((X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
X_test_sample_flattened = X_test_sample.reshape((X_test_sample.shape[0], X_test_sample.shape[1] * X_test_sample.shape[2]))

# Feature names (flattened time steps and features). The reshape above lays
# each window out timestep by timestep, so the timestep loop is the outer one.
flattened_feature_names = [f"{feature}_timestep_{i}" for i in range(n_steps) for feature in features.columns]

# Check the names before the expensive SHAP computation rather than after it
assert len(flattened_feature_names) == X_test_sample_flattened.shape[1], "Feature names length mismatch."

# Create a SHAP KernelExplainer using the model's prediction function.
# The wrapper restores the 3D window layout the model expects and returns a
# 1D vector of predictions; the first 100 training windows act as background.
explainer = shap.KernelExplainer(
    lambda x: model.predict(x.reshape((x.shape[0], n_steps, X.shape[2]))).reshape(-1),  # Ensure the output is 1D
    X_train_flattened[:100]
)

# Calculate SHAP values for the test sample
shap_values = explainer.shap_values(X_test_sample_flattened)

# Plot summary plot of SHAP values
shap.summary_plot(shap_values, X_test_sample_flattened, feature_names=flattened_feature_names)

# LSTM WITH ATTENTION

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input, Flatten, Dot, Softmax
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Define the attention mechanism
def attention_layer(inputs):
    """Apply a single-head self-attention over the time axis.

    Parameters
    ----------
    inputs : Keras tensor
        Sequence tensor; the shape comments below assume
        ``(batch, n_steps, units)``, which is what ``build_model`` passes in
        from ``LSTM(..., return_sequences=True)``.

    Returns
    -------
    context_vector : Keras tensor
        Attention-weighted values, shape ``(batch, n_steps, 1)``.
    scores : Keras tensor
        Softmax-normalised attention weights, shape
        ``(batch, n_steps, n_steps)``; ``scores[b, i, j]`` is the weight that
        query step ``i`` puts on key step ``j``.
    """
    # Each projection maps every time step to one scalar: (batch, n_steps, 1)
    query = Dense(1, use_bias=False)(inputs)  # Query layer
    keys = Dense(1, use_bias=False)(inputs)   # Key layer
    values = Dense(1, use_bias=False)(inputs) # Value layer

    # Pairwise query-key products: (batch, n_steps, n_steps)
    scores = Dot(axes=[2, 2])([query, keys])
    # Softmax acts on the last (key) axis, so each query row sums to 1
    scores = Softmax()(scores)
    # Weighted sum of the values over the key axis - the axis the softmax
    # normalised. Contracting axis 1 of `scores` instead would sum over the
    # query axis, whose weights do not add up to 1.
    context_vector = Dot(axes=[2, 1])([scores, values])  # (batch, n_steps, 1)

    return context_vector, scores

# Define the model
def build_model(n_steps, n_features):
    """Assemble and compile the LSTM-with-attention regressor.

    The network is: input of shape ``(n_steps, n_features)`` -> 50-unit LSTM
    returning the full sequence -> ``attention_layer`` -> flatten -> 50-unit
    ReLU layer -> one linear output unit. It is compiled with the Adam
    optimizer and mean-squared-error loss.

    Returns
    -------
    (model, attention_scores)
        The compiled model and the symbolic attention-score tensor, which can
        be used as the output of a second model sharing the same input.
    """
    sequence_input = Input(shape=(n_steps, n_features))
    lstm_sequence = LSTM(50, return_sequences=True)(sequence_input)

    # Keep the score tensor so the attention weights can be inspected later
    context_vector, attention_scores = attention_layer(lstm_sequence)

    flat_context = Flatten()(context_vector)
    hidden = Dense(50, activation='relu')(flat_context)
    prediction = Dense(1)(hidden)

    model = tf.keras.models.Model(inputs=sequence_input, outputs=prediction)
    model.compile(loss='mse', optimizer='adam')
    return model, attention_scores

# Load and preprocess data
df = pd.read_csv('NkaPollCardMort.csv', sep=';', header=0, index_col=0, parse_dates=True)
df_cleaned = df.dropna()
target = 'death_count'
features = df_cleaned.drop(columns=[target])

# Fit the scalers on the chronologically first 80% of the rows only, then apply
# them to the whole series. Fitting on every row would let the min/max of the
# held-out test period leak into training. The split further down trains on the
# first 80% of the windows, so these rows belong to the training portion; keep
# the fraction here in step with that split. Scaled test-period values may
# therefore fall slightly outside [0, 1].
n_fit_rows = int(len(df_cleaned) * 0.8)

scaler_features = MinMaxScaler(feature_range=(0, 1))
scaler_target = MinMaxScaler(feature_range=(0, 1))
scaler_features.fit(features.iloc[:n_fit_rows])
scaler_target.fit(df_cleaned[[target]].iloc[:n_fit_rows])
scaled_features = scaler_features.transform(features)
scaled_target = scaler_target.transform(df_cleaned[[target]])
# Create sequences
def create_sequences(data, target_data, n_steps):
    """Build sliding-window samples for one-step-ahead forecasting.

    Window ``i`` holds rows ``i .. i + n_steps - 1`` of ``data`` and is paired
    with row ``i + n_steps`` of ``target_data`` (the value right after the
    window).

    Parameters
    ----------
    data : array-like
        Feature rows, first axis is time.
    target_data : array-like
        Target rows aligned with ``data`` on the first axis.
    n_steps : int
        Look-back length of each window; must be at least 1.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(len(data) - n_steps, n_steps, *data.shape[1:])``.
    y : numpy.ndarray
        Shape ``(len(data) - n_steps, *target_data.shape[1:])``.
        When the series is too short for a single window, both arrays are
        returned with zero samples but the correct trailing dimensions, so
        downstream shape look-ups such as ``X.shape[2]`` still work.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1, or ``target_data`` has fewer rows
        than ``data``.
    """
    data = np.asarray(data)
    target_data = np.asarray(target_data)

    if n_steps < 1:
        raise ValueError("n_steps must be a positive integer.")

    n_windows = len(data) - n_steps
    if n_windows <= 0:
        # Not enough rows for even one window plus its target.
        empty_X = np.empty((0, n_steps) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target_data.shape[1:], dtype=target_data.dtype)
        return empty_X, empty_y

    if len(target_data) < len(data):
        raise ValueError("target_data must have at least as many rows as data.")

    X = np.stack([data[start:start + n_steps] for start in range(n_windows)])
    # Targets are the rows immediately following each window (copied, not a view).
    y = np.array(target_data[n_steps:n_steps + n_windows])
    return X, y

# Window the scaled series: 30 look-back steps per sample
n_steps = 30
X, y = create_sequences(scaled_features, scaled_target, n_steps)
X = X.reshape((X.shape[0], X.shape[1], X.shape[2]))

# Chronological 80/20 split of the windows
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Build and train the model
model, attention_scores = build_model(n_steps, X.shape[2])
history = model.fit(X_train, y_train, epochs=30, batch_size=32, verbose=1)

# Make predictions
predictions = model.predict(X_test)

# Rescale the predictions
daily_predictions_rescaled = scaler_target.inverse_transform(predictions)

# Evaluate the model to get attention scores.
# A second model sharing the trained model's input exposes the attention
# tensor returned by build_model as its output.
example_input = X_test[:1]  # Use a sample from your test data
attention_model = tf.keras.models.Model(inputs=model.input, outputs=attention_scores)
attention_scores_sample = attention_model.predict(example_input)

# Plot the attention scores.
# The score matrix of one sample is (n_steps, n_steps): rows are query time
# steps, columns are key time steps, and the colour carries the score itself,
# so the axes are labelled as time steps rather than as scores.
plt.figure(figsize=(10, 6))
plt.imshow(attention_scores_sample[0], aspect='auto', cmap='viridis')
plt.colorbar(label='Attention Score')
plt.xlabel('Key timestep')
plt.ylabel('Query timestep')
plt.title('Attention Scores Visualization')
plt.show()


Interpreting the attention scores plot is a great way to understand how your model is focusing on different time steps. Here’s how you can make sense of it:

Heatmap Colors: The colors on the heatmap represent the attention scores, ranging from low to high values. Typically, a color gradient (like yellow to purple in viridis colormap) indicates this range. Brighter colors (closer to yellow) suggest higher attention scores, meaning the model is paying more attention to those time steps. Darker colors (closer to purple) indicate lower attention scores.

X-axis (Timesteps): This axis represents the sequence of time steps in your input data — here, the time steps being attended to. For example, if you’re using 30 time steps in each input sequence, the x-axis will range from 0 to 29, because the heatmap numbers its columns starting at zero.

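Y-axis: This axis is also a time-step axis, not a score axis; the attention scores themselves are shown by the cell colours. Each row corresponds to one of the 30 time steps acting as the query, and the cells along that row show how that step distributes its attention over the time steps on the x-axis (the softmax makes each row sum to 1). Since you used a sample input from your test data, the heatmap visualizes these weights for that specific sample only.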

Understanding Model Focus: By looking at the attention scores, you can determine which time steps had the most significant impact on the model’s prediction. High attention scores at certain timesteps suggest those periods are particularly informative or influential in making the prediction.

Practical Insight:
Clusters of High Attention: If you see clusters of bright colors, it implies that the model is focusing on specific periods in your time series data.

Consistent High Attention: If high attention is spread evenly across several time steps, it suggests that the model considers all these steps equally important for making predictions.

Patterns: Look for patterns in the attention scores to see if the model’s focus changes over time in a meaningful way that correlates with known events or trends in your data.