In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import data_analysis

## Settings

In [None]:
# --- Dataset selection --------------------------------------------------------
OSM_ID = 8269826
MAP_HEX_SIZE = 7
# Optional tag appended to input/output file names: a string, or None for no tag.
COMMENT = "artificial_min"
DATA_FILE = (
    f"../data/NO2_train_dataset_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.csv"
    if COMMENT
    else f"../data/NO2_train_dataset_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.csv"
)

# --- Columns ------------------------------------------------------------------
# Feature columns kept from the CSV (everything else except TARGET is discarded).
SELECTED_PARAMETERS = [
    "tree_cover", "grassland", "population_density",
    "low_vegetation", "medium_vegetation", "high_vegetation",
    "road",
    "residential_1", "residential_2", "residential_3", "residential_4",
    "non-residential_1", "non-residential_2", "non-residential_3", "non-residential_4",
    "temperature", "temperature_trend_3h", "temperature_trend_6h", "temperature_anomaly",
    "relative_humidity", "relative_humidity_trend_3h", "relative_humidity_trend_6h",
    "pressure", "pressure_trend_3h", "pressure_trend_6h",
    "precipitation", "precipitation_trend_3h", "precipitation_trend_6h",
    "wind_u", "wind_v",
    "traffic_daily_fraction", "traffic_yearly_mean_fraction",
    "no2_anomaly",
]
# Column the model is trained to predict.
TARGET = "no2_gios"

# --- Train / validation split -------------------------------------------------
# Integer seed, or None to draw a random one at run time.
SPLIT_RANDOM_STATE = None
# Fraction of the rows held out for validation.
VALIDATION_SPLIT = 0.3

# --- Model --------------------------------------------------------------------
# True runs the grid search below; False uses the fixed parameters.
TUNE_HYPERPARAMETERS = False

# Fixed model parameters (used when tuning is skipped).
N_ESTIMATORS = 600
MAX_DEPTH = 26
MIN_SAMPLES_SPLIT = 4
MIN_SAMPLES_LEAF = 1
MAX_FEATURES = "log2"
BOOTSTRAP = True

# Candidate values explored by the grid search.
HYPERPARAMETERS_GRID = dict(
    n_estimators=[600, 1000],        # integers
    max_depth=[50, 30, None],        # integers or None
    min_samples_split=[2, 5],        # integers
    min_samples_leaf=[1, 2],         # integers
    max_features=["sqrt", "log2"],   # "sqrt" or "log2"
    bootstrap=[True, False],         # booleans
)

# --- Run options --------------------------------------------------------------
# Number of rows to sample from the dataset, or None to use all rows.
LIMIT_DATASET = None
# True saves the fitted model to disk.
EXPORT_MODEL = True
# True runs the (slow) parameter-influence sweeps.
PLOT_MODEL_PARAMETERS_INFLUENCE = False

In [None]:
# Draw a seed only when none was configured. Comparing against None (rather
# than using truthiness) keeps an explicitly configured seed of 0 intact.
if SPLIT_RANDOM_STATE is None:
    from random import randint

    SPLIT_RANDOM_STATE = randint(1, 1000)
    # Print the drawn seed so the run can be reproduced later.
    print(f"Random split state: {SPLIT_RANDOM_STATE}")

## Set default font for graphs

In [None]:
# Use one font family for every figure produced by this notebook.
mpl.rcParams.update({"font.family": "Palatino Linotype"})

## Helper functions

In [None]:
import math
from typing import Tuple


def rounded_range(data: pd.Series, resolution: int = 10) -> Tuple[int, int]:
    """Return the smallest ``resolution``-aligned interval that contains ``data``.

    The lower bound is the data minimum rounded down to a multiple of
    ``resolution``; the upper bound is the data maximum rounded up to a
    multiple of ``resolution``. Values that already sit on a multiple are
    returned unchanged.

    Args:
        data: Any object exposing ``min()`` and ``max()`` (e.g. a pandas
            Series or a NumPy array).
        resolution: Positive step the bounds are aligned to.

    Returns:
        ``(bottom, top)`` as integers.

    Raises:
        ValueError: If ``resolution`` is not positive.
    """
    if resolution <= 0:
        raise ValueError("resolution must be a positive number")

    # floor/ceil instead of round(x -/+ 0.5): round() rounds halves to the
    # nearest even integer, which made the result depend on whether the
    # quotient was odd or even (e.g. max=20 -> 20 but max=30 -> 40).
    bottom = math.floor(data.min() / resolution) * resolution
    top = math.ceil(data.max() / resolution) * resolution

    return (bottom, top)

In [None]:
def normalize_to_range(values, new_min=0.01, new_max=1.0):
    """Linearly rescale ``values`` so that they span ``[new_min, new_max]``.

    Args:
        values: Numeric pandas Series or NumPy array (needs ``min()``,
            ``max()`` and element-wise arithmetic).
        new_min: Value the smallest input is mapped to.
        new_max: Value the largest input is mapped to.

    Returns:
        An object of the same shape as ``values``. When every input is
        identical there is no spread to rescale, so every element is set to
        ``new_max`` (instead of the NaN a 0/0 division would produce).
    """
    old_min = values.min()
    old_max = values.max()

    if old_max == old_min:
        # Degenerate input: (values - old_min) is all zeros, so this yields
        # new_max everywhere while preserving the shape/index of the input.
        return values - old_min + new_max

    return new_min + (values - old_min) * (new_max - new_min) / (old_max - old_min)

## Read data

In [None]:
# Load the dataset, keep only the target and the selected feature columns,
# and discard every row that has a missing value in any of them.
model_columns = [TARGET] + SELECTED_PARAMETERS
df = pd.read_csv(DATA_FILE)[model_columns].dropna()
df.head()

In [None]:
# Optionally work on a random subset of the rows (useful for quick runs).
if LIMIT_DATASET:
    # Seed the sampling with the split seed so the subset is reproducible, and
    # cap the request at the number of available rows so an over-sized
    # LIMIT_DATASET does not raise.
    df = df.sample(n=min(LIMIT_DATASET, len(df)), random_state=SPLIT_RANDOM_STATE)
    print(f"Dataset limited to {len(df)} rows")

## Exploratory data analysis

In [None]:
# Draw the correlation-matrix heatmap for the target and the selected features
# using the project helper. No output file is given and cell annotations are off.
# NOTE(review): presumably output_file=None means "display only, do not save" —
# confirm in data_analysis.
data_analysis.correlation_matrix_heatmap(df, output_file=None, annotate=False)

## Prepare data for training

In [None]:
# Drop any remaining incomplete rows and renumber the rows 0..n-1, so the
# positional indices produced by the splitter below match the index labels.
df = df.dropna().reset_index(drop=True)

In [None]:
# Assign every row to a fixed-width concentration bin; the bins are used to
# stratify the train/validation split and to derive the sample weights.
bins_resolution = 10
_, top_range = rounded_range(df[TARGET], resolution = bins_resolution)
# Edges 0, 10, ..., top_range (the extra step makes range() include top_range).
bins = list(range(0, top_range + bins_resolution, bins_resolution))
labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins) - 1)]
# include_lowest=True closes the first interval on the left; without it a
# target of exactly 0 falls outside (0, 10] and gets a NaN bin.
df["concentration_bin"] = pd.cut(df[TARGET], bins=bins, labels=labels, include_lowest=True)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified shuffle split: both parts keep the same proportion of rows
# per concentration bin.
split = StratifiedShuffleSplit(n_splits=1, test_size=VALIDATION_SPLIT, random_state=SPLIT_RANDOM_STATE)

# n_splits=1, so the generator yields exactly one (train, validation) pair.
train_index, validation_index = next(split.split(df, df["concentration_bin"]))

# The splitter returns positional indices, so select with .iloc (this does not
# depend on the index labels). .copy() makes both parts independent frames so
# the later column assignments/drops never touch `df`.
df_stratified_training = df.iloc[train_index].copy()
df_stratified_validation = df.iloc[validation_index].copy()

In [None]:
# Weight every training row inversely to how populated its concentration bin
# is, so sparsely populated bins are not drowned out by the common ones.
bin_counts = df_stratified_training["concentration_bin"].value_counts()

# value_counts() on a categorical column also reports bins with zero rows.
# Inverting those gives inf, which would become the maximum and squash every
# real weight down to the lower bound, so only populated bins are weighted.
populated_bin_counts = bin_counts[bin_counts > 0]
bin_weights = normalize_to_range(1 / populated_bin_counts)

# Map each row's bin to its weight; the explicit cast guarantees a plain float
# column regardless of the dtype returned by mapping a categorical.
df_stratified_training["sample_weight"] = (
    df_stratified_training["concentration_bin"].map(bin_weights).astype("float64")
)
weights = df_stratified_training["sample_weight"]

In [None]:
# Bar chart of the number of training rows per concentration bin.
bin_counts = bin_counts.sort_index()

plt.figure(figsize=(8, 6), dpi=300)
bars = plt.bar(bin_counts.index.astype(str), bin_counts.values)

# Write each bin's count vertically inside the plot, centred on its bar.
count_label_style = dict(ha="center", va="bottom", fontsize=16, rotation=90, color="black")
for bar, count in zip(bars, bin_counts.values):
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, 100, str(count), **count_label_style)

plt.title("Measurements counts in NO₂ concentration bins", fontsize=16)
plt.xlabel("NO₂ concentration bin (μg/m³)", fontsize=16)
plt.ylabel("Measurements count", fontsize=16)
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.grid(axis="y", linestyle='--', alpha=0.7)
plt.tight_layout()

# The optional COMMENT tag becomes a suffix of the image file name.
distribution_file = f"dataset_distribution_{COMMENT}.png" if COMMENT else "dataset_distribution.png"
plt.savefig(distribution_file)

plt.show()

In [None]:
# Remove the helper columns so they are not used as model features.
# errors="ignore" tolerates frames that lack one of them (the validation frame
# never received "sample_weight").
helper_columns = ["concentration_bin", "sample_weight"]
df_stratified_training.drop(columns=helper_columns, inplace=True, errors="ignore")
df_stratified_validation.drop(columns=helper_columns, inplace=True, errors="ignore")

In [None]:
# Separate model inputs (X) from the TARGET column (y) for both subsets.
# NOTE(review): presumably split_data returns (features, target) with the
# original row order preserved — `weights` relies on that order; confirm in
# data_analysis.
X_training, y_training = data_analysis.split_data(df_stratified_training, TARGET)
X_validation, y_validation = data_analysis.split_data(df_stratified_validation, TARGET)

## Train model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit the random forest, either with the fixed parameters from the settings
# cell or with the best combination found by an exhaustive grid search.
# The per-row sample weights are used in both cases.
if not TUNE_HYPERPARAMETERS:
    print("Skipped parameters tuning")

    fixed_parameters = {
        "n_estimators": N_ESTIMATORS,
        "max_depth": MAX_DEPTH,
        "min_samples_split": MIN_SAMPLES_SPLIT,
        "min_samples_leaf": MIN_SAMPLES_LEAF,
        "max_features": MAX_FEATURES,
        "bootstrap": BOOTSTRAP,
        "random_state": SPLIT_RANDOM_STATE,
        "n_jobs": -1,
    }
    rf_model = RandomForestRegressor(**fixed_parameters)

    rf_model.fit(X_training, y_training, sample_weight=weights)
else:
    from sklearn.model_selection import GridSearchCV

    rf = RandomForestRegressor(random_state=10)

    # 5-fold cross-validated search over HYPERPARAMETERS_GRID on all cores.
    search_options = {
        "estimator": rf,
        "param_grid": HYPERPARAMETERS_GRID,
        "cv": 5,
        "scoring": "r2",  # "r2" or "neg_mean_squared_error"
        "n_jobs": -1,
        "verbose": 2,
    }
    grid_search = GridSearchCV(**search_options)

    grid_search.fit(X_training, y_training, sample_weight=weights)

    print("Best parameters:", grid_search.best_params_)

    # The search refits the best combination on the whole training set.
    rf_model = grid_search.best_estimator_

In [None]:
# Persist the fitted model; the optional COMMENT tag becomes a file-name suffix.
if EXPORT_MODEL:
    import joblib

    model_file = (
        f"../data/random_forest_{TARGET}_{COMMENT}.pkl"
        if COMMENT
        else f"../data/random_forest_{TARGET}.pkl"
    )
    joblib.dump(rf_model, model_file)

## Analyze model performance

In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
# Evaluate the model on the held-out validation rows.
rf_pred = rf_model.predict(X_validation)

rmse = root_mean_squared_error(y_validation, rf_pred)
r_squared = rf_model.score(X_validation, y_validation)

print("Performance for unknown data:")
# The value is the *root* mean squared error, so it is labelled as such
# (the same label is used in the generated report file).
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared value: {r_squared:.2f}")

In [None]:
# Evaluate the model on the rows it was trained on (for comparison with the
# validation figures).
rf_pred_training = rf_model.predict(X_training)

rmse_training = root_mean_squared_error(y_training, rf_pred_training)
r_squared_training = rf_model.score(X_training, y_training)

print("Performance for known data:")
# The value is the *root* mean squared error, so it is labelled as such
# (the same label is used in the generated report file).
print(f"Root Mean Squared Error: {rmse_training:.2f}")
print(f"R-squared value: {r_squared_training:.2f}")

In [None]:
# Measured vs. predicted scatter plot for the training rows.
# The axis limit is derived from the data actually plotted here (training
# targets and training predictions), so no point falls outside the axes.
_, top_y = rounded_range(y_training, resolution=10)
_, top_rf_pred = rounded_range(rf_pred_training, resolution=10)

axis_min = 0
axis_max = max(top_y, top_rf_pred)

plt.figure(figsize=(10, 8), dpi=300)

# Reuse the training predictions computed earlier instead of predicting again.
sns.scatterplot(x=y_training, y=rf_pred_training, color="red", label="Predictions on train dataset")

# Diagonal on which a perfect model would place every point.
plt.plot([axis_min, axis_max], [axis_min, axis_max], "k--", label="Perfect prediction")

plt.xlabel("Measured NO₂ concentration (μg/m³)", fontsize=16)
plt.ylabel("Predicted NO₂ concentration (μg/m³)", fontsize=16)
plt.title("Predictions of NO₂ concentration on train dataset", fontsize=16, fontweight="bold")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.legend(fontsize=16)
plt.grid(True)
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
# Same scale on both axes so the diagonal is at 45 degrees.
plt.gca().set_aspect("equal", adjustable="box")
plt.tight_layout()

if COMMENT:
    plt.savefig(f"ML_rf_train_data_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png")
else:
    plt.savefig(f"ML_rf_train_data_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png")

plt.show()

In [None]:
# Import the submodule explicitly: on older SciPy versions a bare
# `import scipy` does not make `scipy.stats` available.
import scipy.stats

# Measured vs. predicted scatter plot for the validation rows, with the fitted
# regression line and its equation in the title.
_, top_y = rounded_range(y_validation, resolution=10)
_, top_rf_pred = rounded_range(rf_pred, resolution=10)

axis_min = 0
axis_max = max(top_y, top_rf_pred)

plt.figure(figsize=(10, 8), dpi=300)

sns.scatterplot(x=y_validation, y=rf_pred, color="blue", label="Predictions on validation dataset")
sns.regplot(x=y_validation, y=rf_pred, scatter=False, color="black", label="Regression line")

# Fit the line on the data itself rather than on the points of the drawn line;
# slope and intercept are the same least-squares values, and no plot object is
# overwritten by the regression outputs.
regression = scipy.stats.linregress(x=y_validation, y=rf_pred)
slope, intercept = regression.slope, regression.intercept

# Diagonal on which a perfect model would place every point.
plt.plot([axis_min, axis_max], [axis_min, axis_max], "k--", label="Perfect prediction")

plt.xlabel("Measured NO₂ concentration (μg/m³)", fontsize=16)
plt.ylabel("Predicted NO₂ concentration (μg/m³)", fontsize=16)
plt.title(f"Predictions of NO₂ concentration on validation dataset\nRMSE: {rmse:.2f}, R²: {r_squared:.2f}, y = {slope:.3f} x + {intercept:.3f}", fontsize=16, fontweight="bold")
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.legend(fontsize=16)
plt.grid(True)
plt.xlim(axis_min, axis_max)
plt.ylim(axis_min, axis_max)
# Same scale on both axes so the diagonal is at 45 degrees.
plt.gca().set_aspect("equal", adjustable="box")
plt.tight_layout()

if COMMENT:
    plt.savefig(f"ML_rf_validation_data_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png")
else:
    plt.savefig(f"ML_rf_validation_data_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png")

plt.show()

In [None]:
# Pair every importance reported by the fitted forest with its feature name
# and list them from most to least important.
coeffs = pd.Series(data=rf_model.feature_importances_, index=X_training.columns)
ranked_importances = coeffs.sort_values(ascending=False)
print(ranked_importances)

In [None]:
# Bar chart of the ten most important features.
top_importances = coeffs.sort_values(ascending=False).head(10)

plt.figure(figsize=(8, 6), dpi=300)
bars = plt.bar(range(len(top_importances)), top_importances.values)

# Write each feature name vertically, starting just above the x axis and
# centred on its bar (the x tick labels themselves are hidden below).
name_label_style = dict(ha="center", va="bottom", fontsize=16, rotation=90, color="black")
for bar, feature_name in zip(bars, top_importances.index):
    bar_centre = bar.get_x() + bar.get_width() / 2
    plt.text(bar_centre, 0.005, feature_name, **name_label_style)

plt.title("Feature importances", fontsize=16)
plt.xlabel("Feature", fontsize=16)
plt.ylabel("Relative importance", fontsize=16)
plt.xticks([])
plt.yticks(fontsize=16)
plt.grid(axis="y", linestyle='--', alpha=0.7)
plt.tight_layout()

# The optional COMMENT tag becomes a suffix of the image file name.
importances_file = (
    f"ML_rf_feature_importances_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.png"
    if COMMENT
    else f"ML_rf_feature_importances_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.png"
)
plt.savefig(importances_file)

plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the chosen configuration on the training set.
# The sample weights are forwarded to every fold's fit() (sliced per fold by
# scikit-learn), so the cross-validated models are trained the same way as the
# final model instead of being fitted unweighted.
scores = cross_val_score(
    rf_model,
    X_training,
    y_training,
    cv=5,
    scoring='neg_root_mean_squared_error',
    params={"sample_weight": weights},
)
# The scorer returns negated RMSE values, hence the sign flip on the mean.
print(f"Cross-validated RMSE: {-scores.mean():.2f} +/- {scores.std():.2f}")

In [None]:
# Optional sweep: train/validation RMSE as a function of n_estimators, with
# every other model parameter kept at its configured value.
if PLOT_MODEL_PARAMETERS_INFLUENCE:
    estimator_counts = range(100, 1300, 200)
    train_errors, test_errors = [], []

    for estimators in estimator_counts:
        model = RandomForestRegressor(
            n_estimators=estimators,
            max_depth=MAX_DEPTH,
            min_samples_split=MIN_SAMPLES_SPLIT,
            min_samples_leaf=MIN_SAMPLES_LEAF,
            max_features=MAX_FEATURES,
            bootstrap=BOOTSTRAP,
            random_state=SPLIT_RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_training, y_training, sample_weight=weights)

        # RMSE on the rows seen during training and on the held-out rows.
        train_errors.append(root_mean_squared_error(y_training, model.predict(X_training)))
        test_errors.append(root_mean_squared_error(y_validation, model.predict(X_validation)))

    plt.plot(estimator_counts, train_errors, label="Train data")
    plt.plot(estimator_counts, test_errors, label="Test data")
    plt.xlabel("n_estimators")
    plt.ylabel("RMSE")
    plt.title("RMSE n_estimators")
    plt.legend()
    plt.show()

In [None]:
# Optional sweep: train/validation RMSE as a function of max_depth, with
# every other model parameter kept at its configured value.
if PLOT_MODEL_PARAMETERS_INFLUENCE:
    depth_values = range(2, 29, 2)
    train_errors, test_errors = [], []

    for depth in depth_values:
        model = RandomForestRegressor(
            n_estimators=N_ESTIMATORS,
            max_depth=depth,
            min_samples_split=MIN_SAMPLES_SPLIT,
            min_samples_leaf=MIN_SAMPLES_LEAF,
            max_features=MAX_FEATURES,
            bootstrap=BOOTSTRAP,
            random_state=SPLIT_RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_training, y_training, sample_weight=weights)

        # RMSE on the rows seen during training and on the held-out rows.
        train_errors.append(root_mean_squared_error(y_training, model.predict(X_training)))
        test_errors.append(root_mean_squared_error(y_validation, model.predict(X_validation)))

    plt.plot(depth_values, train_errors, label="Train data")
    plt.plot(depth_values, test_errors, label="Test data")
    plt.xlabel("max_depth")
    plt.ylabel("RMSE")
    plt.title("RMSE max_depth")
    plt.legend()
    plt.show()

In [None]:
# Optional sweep: train/validation RMSE as a function of min_samples_split,
# with every other model parameter kept at its configured value.
if PLOT_MODEL_PARAMETERS_INFLUENCE:
    min_split_values = range(2, 21, 2)
    train_errors = []
    test_errors = []

    # The loop variable is deliberately not called `split`: that name holds the
    # StratifiedShuffleSplit object, and overwriting it with an integer breaks
    # any later re-run of the train/validation split cell.
    for min_split in min_split_values:
        model = RandomForestRegressor(
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            min_samples_split=min_split,
            min_samples_leaf=MIN_SAMPLES_LEAF,
            max_features=MAX_FEATURES,
            bootstrap=BOOTSTRAP,
            random_state=SPLIT_RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_training, y_training, sample_weight=weights)

        train_pred = model.predict(X_training)
        test_pred = model.predict(X_validation)

        train_rmse = root_mean_squared_error(y_training, train_pred)
        test_rmse = root_mean_squared_error(y_validation, test_pred)

        train_errors.append(train_rmse)
        test_errors.append(test_rmse)

    plt.plot(min_split_values, train_errors, label="Train data")
    plt.plot(min_split_values, test_errors, label="Test data")
    plt.xlabel("min_samples_split")
    plt.ylabel("RMSE")
    plt.title("RMSE min_samples_split")
    plt.legend()
    plt.show()

In [None]:
# Optional sweep: train/validation RMSE as a function of min_samples_leaf,
# with every other model parameter kept at its configured value.
if PLOT_MODEL_PARAMETERS_INFLUENCE:
    leaf_values = range(1, 20, 2)
    train_errors, test_errors = [], []

    for leaf in leaf_values:
        model = RandomForestRegressor(
            n_estimators=N_ESTIMATORS,
            max_depth=MAX_DEPTH,
            min_samples_split=MIN_SAMPLES_SPLIT,
            min_samples_leaf=leaf,
            max_features=MAX_FEATURES,
            bootstrap=BOOTSTRAP,
            random_state=SPLIT_RANDOM_STATE,
            n_jobs=-1,
        )
        model.fit(X_training, y_training, sample_weight=weights)

        # RMSE on the rows seen during training and on the held-out rows.
        train_errors.append(root_mean_squared_error(y_training, model.predict(X_training)))
        test_errors.append(root_mean_squared_error(y_validation, model.predict(X_validation)))

    plt.plot(leaf_values, train_errors, label="Train data")
    plt.plot(leaf_values, test_errors, label="Test data")
    plt.xlabel("min_samples_leaf")
    plt.ylabel("RMSE")
    plt.title("RMSE min_samples_leaf")
    plt.legend()
    plt.show()

## Generate output file

In [None]:
# Write a plain-text summary of the run: the model parameters, the validation
# and training metrics, the feature importances and the cross-validated RMSE.
if COMMENT:
    text_file_name = f"ML_rf_model_parameters_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}_{COMMENT}.txt"
else:
    text_file_name = f"ML_rf_model_parameters_{TARGET}_osm_{OSM_ID}_hex_{MAP_HEX_SIZE}.txt"

# Explicit encoding so the report is written identically on every platform.
with open(text_file_name, "w", encoding="utf-8") as f:
    if TUNE_HYPERPARAMETERS:
        f.write("Best model parameters:\n")
        # One parameter per line followed by a single blank line, matching the
        # layout of the fixed-parameter branch below.
        for parameter, value in grid_search.best_params_.items():
            f.write(f"{parameter}: {value}\n")
        f.write("\n")
    else:
        f.write("Model parameters:\n")
        f.write(f"N_ESTIMATORS: {N_ESTIMATORS}")
        f.write(f"\nMAX_DEPTH: {MAX_DEPTH}")
        f.write(f"\nMIN_SAMPLES_SPLIT: {MIN_SAMPLES_SPLIT}")
        f.write(f"\nMIN_SAMPLES_LEAF: {MIN_SAMPLES_LEAF}")
        f.write(f"\nMAX_FEATURES: {MAX_FEATURES}")
        f.write(f"\nBOOTSTRAP: {BOOTSTRAP}")
        f.write("\n\n")
    f.write("Performance for validation data:\n")
    f.write(f"Root Mean Squared Error: {rmse:.2f}\n")
    f.write(f"R-squared value: {r_squared:.2f}\n")
    f.write("\nPerformance for train data:\n")
    f.write(f"Root Mean Squared Error: {rmse_training:.2f}\n")
    f.write(f"R-squared value: {r_squared_training:.2f}\n")
    f.write("\nFeature importances:\n")
    f.write(coeffs.sort_values(ascending=False).to_string())
    # The scorer returns negated RMSE values, hence the sign flip on the mean.
    f.write(f"\n\nCross-validated RMSE: {-scores.mean():.2f} +/- {scores.std():.2f}")