<a href="https://colab.research.google.com/github/Sally-Ama-Sampson/Estimating-Tair-from-CS-generated-Data-/blob/main/RF_Prediction%20of%20Tair.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import geopandas as gpd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import sklearn.ensemble
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import math
from tqdm import tqdm
import zipfile

In [None]:
# Location of the 50 m grid GeoPackage (a Google Drive path as seen from Colab).
ALL_DATA_PATH = '/content/drive/My Drive/PHD/Onekana/Modelling/Objective 3(Modelling)/50m_grids.gpkg'
# Load the layer: the following cells index ALL_DATA by column and call
# DataFrame methods on it, so it has to be a GeoDataFrame, not the path string.
ALL_DATA = gpd.read_file(ALL_DATA_PATH)

In [None]:
## Adjusting for elevation influence
# Temp_Elevation = temp_adjust + gradient * (Elevation - reference).
# NOTE(review): 6.5 / 1000 looks like a lapse rate per metre and 1683 like a
# reference elevation in metres -- confirm units against the data source.
elevation_gradient = 6.5 / 1000
reference_elevation = 1683
elevation_offset = ALL_DATA['Elevation'] - reference_elevation
ALL_DATA['Temp_Elevation'] = ALL_DATA['temp_adjust'] + (elevation_gradient * elevation_offset)

## Feature Selection and Engineering

Feature Selection based on Correlation Analysis:

This process identifies strongly correlated feature pairs (|correlation| > 0.80) to detect multicollinearity and redundancy among predictors.
Features that appear frequently in these high-correlation pairs are selected as key representatives of correlated groups.
Additionally, important domain-specific columns are retained regardless of correlation.
The goal is to simplify the dataset by reducing redundant features, improving model stability, interpretability, and computational efficiency.



Feature Engineering using Log Transformation:

Reduce skewness in features by applying a log transformation: log1p, i.e. log(1 + x).
This helps make data distributions more balanced, improving model performance and stability.
Key columns are excluded from transformation to preserve their original meaning.

In [None]:
# Pairwise correlations between the candidate predictors
# (the two temperature columns and the geometry are left out).
X_features = ALL_DATA.drop(columns=['temp_adjust','temp_crt', 'geometry'])
corr_matrix = X_features.corr()

# Cells with |r| below 0.80 are blanked (NaN) so only strong pairs are drawn.
mask = np.abs(corr_matrix) < 0.80
filtered_corr = corr_matrix.mask(mask)

plt.figure(figsize=(16, 12))
sns.set(font_scale=1)

heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    linecolor='gray',
    vmin=-1,
    vmax=1,
)
ax = sns.heatmap(filtered_corr, **heatmap_style)

plt.title("Correlation Heatmap (|r| > 0.80)", fontsize=16)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Long-format table of strongly correlated feature pairs (|r| > 0.80).
#
# The correlation matrix is symmetric, so every pair appears twice and the
# diagonal pairs each feature with itself. Restricting to the strict upper
# triangle keeps each unordered pair exactly once. De-duplicating on the
# correlation value instead would also discard *different* pairs that happen
# to share a value (e.g. several perfectly correlated pairs with r == 1.0).
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
strong_correlation = np.abs(corr_matrix) > 0.80

high_corr_pairs = (
    corr_matrix.where(strong_correlation & upper_triangle)
    .stack()          # drops the NaN (filtered-out) cells
    .reset_index()
)
high_corr_pairs.columns = ['Feature_1', 'Feature_2', 'Correlation']
high_corr_pairs = high_corr_pairs.sort_values(by='Correlation', ascending=False)
print(high_corr_pairs.head(20))

In [None]:
# Count how often each feature occurs in the high-correlation pairs and split
# the features into those occurring more than `threshold` times and the rest.
threshold = 4
pair_members = pd.concat([high_corr_pairs["Feature_1"], high_corr_pairs["Feature_2"]])
feature_counts = pair_members.value_counts()

occurs_often = feature_counts > threshold
selected_features = feature_counts[occurs_often].index.tolist()
unselected_features = feature_counts[~occurs_often].index.tolist()

print(f"{len(unselected_features)} unselected_features")
print(unselected_features)

In [None]:
# Columns retained regardless of the correlation-based selection.
# 'Temp_Elevation' is the modelling target read by the feature/target split
# cells and 'temp_crt' is dropped there explicitly, so both have to survive
# this subsetting step; otherwise those cells raise KeyError.
columns_to_keep = ['NDVI','NDWI', 'NDBI','LST_Night','LST_DAY','Elevation', 'geometry', 'bOrient_me','temp_adjust',
                   'temp_crt', 'Temp_Elevation']
selected_features = list(selected_features)
# dict.fromkeys removes duplicates while preserving order. A set would give a
# column order that changes between interpreter runs, and therefore a
# different feature order for the model.
final_columns = list(dict.fromkeys(columns_to_keep + selected_features))
ALL_DATA = ALL_DATA[final_columns]

In [None]:
# Display the current column names of ALL_DATA as a plain Python list.
ALL_DATA.columns.to_list()

['Join_Cou_1',
 'LST_Night',
 'LST_DAY',
 'Elevation',
 'NDVI',
 'uID',
 'tArea',
 'tCirCom',
 'tERI',
 'tcOrient',
 'tARatio',
 'tWNeigh',
 'tFArRatio',
 'Height',
 'bArea',
 'bCirCom',
 'bCorner',
 'bSquare',
 'bERI',
 'bFormFact',
 'bCCD',
 'bShdwall',
 'bCellAll',
 'bAli',
 'bNeiDis',
 'bAdj',
 'bMIBD',
 'tCirCom_me',
 'tERI_meanI',
 'tCArea_mea',
 'bArea_mean',
 'bVol_meanI',
 'bCirCom_me',
 'bCorner_me',
 'bSquare_me',
 'bFormFact_',
 'bOrient_me',
 'bShdwall_m',
 'bCellAll_m',
 'bAli_meanI',
 'tCirCom_ra',
 'tERI_range',
 'tcOrient_r',
 'tARatio_ra',
 'tWNeigh_ra',
 'tFArRati_1',
 'Height_ran',
 'bArea_rang',
 'bCirCom_ra',
 'bElong_ran',
 'bFormFac_1',
 'bOrient_ra',
 'bAli_range',
 'bAdj_range',
 'bMIBD_rang',
 'tArea_thei',
 'tcOrient_t',
 'tCArea_the',
 'tFArRati_2',
 'Height_the',
 'bArea_thei',
 'bCorner_th',
 'bSquare_th',
 'bCCD_theil',
 'bOrient_th',
 'bCellAll_t',
 'bAli_theil',
 'bNeiDis_th',
 'bMIBD_thei',
 'temp_crt',
 'NDBI',
 'NDWI',
 'Shape_Leng',
 'temp_adjust',

In [None]:

# Thresholded correlation heatmap for the reduced column set.
# The frame is ALL_DATA: no `ALL_DATA_newtemp` is defined anywhere in this
# notebook, so referencing it raised NameError.
X_features = ALL_DATA.drop(columns=['temp_adjust','geometry'])
corr_matrix = X_features.corr()
# Blank out (NaN) every cell whose absolute correlation is below 0.80.
mask = np.abs(corr_matrix) < 0.80
filtered_corr = corr_matrix.copy()
filtered_corr[mask] = np.nan

plt.figure(figsize=(16, 12))
sns.set(font_scale=1)

ax = sns.heatmap(
    filtered_corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    linecolor='gray',
    vmin=-1, vmax=1
)

plt.title("Correlation Heatmap (|r| > 0.80)", fontsize=16)
plt.xticks(rotation=90)
plt.yticks(rotation=0)

plt.show()

In [None]:
# Skewness of every numeric column, most right-skewed first.
# numeric_only=True keeps the geometry column (and any other non-numeric
# column) out of the reduction, which would otherwise fail on it.
skewness = ALL_DATA.skew(numeric_only=True).sort_values(ascending=False)
# The label now matches the number of rows actually printed.
print("Top 40 Most Skewed Features:")
print(skewness.head(40))


In [None]:
# Histograms of the (at most 40) features whose |skewness| exceeds 1.
# The 10 x 4 subplot grid has exactly 40 slots, hence the [:40] cap.
skewed_features = skewness[abs(skewness) > 1].index[:40]
plt.figure(figsize=(25, 25))
for i, col in enumerate(skewed_features):
    plt.subplot(10, 4, i+1)
    # Plot from ALL_DATA, the frame `skewness` was computed from; the
    # previously referenced `Train_datafiltered` is not defined in this notebook.
    sns.histplot(ALL_DATA[col], kde=True)
    plt.title(f"Skewness of {col}: {skewness[col]:.2f}")
plt.tight_layout()
plt.show()


In [None]:
# Log-transform (log1p) the numeric predictors on a copy of ALL_DATA.
ALL_DATA_log = ALL_DATA.copy()
# Columns left untouched: the temperature columns, the geometry and 'uID'.
# NOTE(review): 'uID' is treated as a row identifier (it is read as an id list
# in the spatial-split cell), so it is excluded here -- confirm.
exclude_cols = ['temp_adjust','temp_crt' ,'geometry', 'Temp_Elevation', 'uID']
candidate_cols = [col for col in ALL_DATA_log.columns if col not in exclude_cols and np.issubdtype(ALL_DATA_log[col].dtype, np.number)]
# log1p(x) is only finite for x > -1. Transforming a column that contains
# smaller values would silently introduce NaN / -inf, so such columns are
# reported and left on their original scale.
column_minima = ALL_DATA_log[candidate_cols].min()
numeric_cols = [col for col in candidate_cols if column_minima[col] > -1]
skipped_cols = [col for col in candidate_cols if col not in numeric_cols]
if skipped_cols:
    print("Columns not log-transformed (values <= -1 or no valid values):", skipped_cols)
ALL_DATA_log[numeric_cols] = np.log1p(ALL_DATA_log[numeric_cols])
new_skewness = ALL_DATA_log[numeric_cols].skew().sort_values(ascending=False)
print("\n📉 Skewness after Log Transformation:")
print(new_skewness.head(10))
print("\n🛠️ Columns in ALL_DATA_log:", ALL_DATA_log.columns.tolist())


In [None]:
# Histograms after the log transform, for the 40 columns that were most
# skewed *before* it. The ranking uses ALL_DATA (the untransformed frame);
# the previously referenced `Train_datafiltered` is not defined in this notebook.
skewed_features = ALL_DATA.skew(numeric_only=True).sort_values(ascending=False).index[:40]

plt.figure(figsize=(25, 25))
for i, col in enumerate(skewed_features):

    plt.subplot(10, 4, i+1)
    sns.histplot(ALL_DATA_log[col], kde=True)
    plt.title(f"After Log: {col}")
plt.tight_layout()
plt.show()

## Exploring Different Splitting (Train, Test) Strategies

#### STRATIFIED RANDOM SPLITS

In [None]:
# Stratified random split: overlay a 2 x 5 grid of rectangles on the data
# extent, then take an 80/20 train/test split inside every rectangle.
gdf = ALL_DATA_log.copy()
minx, miny, maxx, maxy = gdf.total_bounds
rows, cols = 2, 5
grid_width = (maxx - minx) / cols
grid_height = (maxy - miny) / rows

# Build the rectangles column by column.
# NOTE(review): `box` is not imported in the import cell of this notebook
# (presumably shapely.geometry.box) -- confirm and import it.
grids = []
for i in range(cols):
    for j in range(rows):
        grid = box(
            minx + i * grid_width,
            miny + j * grid_height,
            minx + (i + 1) * grid_width,
            miny + (j + 1) * grid_height
        )
        grids.append(grid)

grid_gdf = gpd.GeoDataFrame(geometry=grids, crs=gdf.crs)
# Each geometry gets the position of the first rectangle that contains it,
# or None when no rectangle contains it.
gdf['grid_id'] = gdf.geometry.apply(
    lambda point: next((idx for idx, grid in enumerate(grids) if grid.contains(point)), None)
)
train_list = []
test_list = []

# Per rectangle: shuffle with a fixed seed, first 80 % -> train, rest -> test.
for grid_id in gdf['grid_id'].unique():
    subset = gdf[gdf['grid_id'] == grid_id]
    shuffled_indices = subset.sample(frac=1, random_state=229).index
    train_size = int(0.8 * len(shuffled_indices))
    train_indices = shuffled_indices[:train_size]
    test_indices = shuffled_indices[train_size:]
    train_list.append(gdf.loc[train_indices])
    test_list.append(gdf.loc[test_indices])
# Concatenate the per-rectangle parts, shuffle once more and renumber the rows.
train_gdf = gpd.GeoDataFrame(pd.concat(train_list).sample(frac=1, random_state=229), crs=gdf.crs).reset_index(drop=True)
test_gdf = gpd.GeoDataFrame(pd.concat(test_list).sample(frac=1, random_state=229), crs=gdf.crs).reset_index(drop=True)
print(f"Training Data: {len(train_gdf)} points")
print(f"Testing Data: {len(test_gdf)} points")


#### SPLITTING TRAINING AND VALIDATION SETS (SPATIALLY DISJOINT)




In [None]:
# Spatially disjoint split: overlay a 20 x 20 grid on the data extent, pick the
# occupied grid cells that touch at most 5 occupied cells (themselves
# included), use the features inside those cells as the validation set and
# everything else as the training set.
grid_crs = ALL_DATA_log.crs
minx, miny, maxx, maxy = ALL_DATA_log.total_bounds
n_divisions = 20
x_divisions = np.linspace(minx, maxx, n_divisions + 1)
y_divisions = np.linspace(miny, maxy, n_divisions + 1)
# NOTE(review): `shapely` is not imported in the import cell of this notebook.
grid_cells = []
for i in range(len(x_divisions) - 1):
    for j in range(len(y_divisions) - 1):
        grid_cells.append(
            shapely.geometry.box(x_divisions[i], y_divisions[j], x_divisions[i + 1], y_divisions[j + 1])
        )
grid = gpd.GeoDataFrame(grid_cells, columns=['geometry'], crs=grid_crs)

# Attach to every feature the id of the cell it lies within; features that are
# within no cell are dropped from the lookup table.
Train_data_with_grid_id = gpd.sjoin(ALL_DATA_log, grid, how='left', predicate='within')
Train_data_with_grid_id = Train_data_with_grid_id.dropna(subset=['index_right'])
intersecting_grid_ids = Train_data_with_grid_id['index_right'].unique()
filtered_grid = grid.loc[intersecting_grid_ids]
# NOTE(review): this adds 'grid_id' to ALL_DATA_log itself, so it is still
# present when features are selected later -- confirm that is intended.
ALL_DATA_log['grid_id'] = Train_data_with_grid_id['index_right']

# For each occupied cell, count the occupied cells it intersects.
intersection_counts = filtered_grid.apply(lambda x: filtered_grid.intersects(x.geometry).sum(), axis=1)
selected_grid_ids = intersection_counts[intersection_counts <= 5].index
selected_grid = filtered_grid.loc[selected_grid_ids]

selected_points = gpd.sjoin(ALL_DATA_log, selected_grid, how='inner', predicate='within')
selected_points = selected_points.drop(columns=['index_right'])
selected_points_gdf = gpd.GeoDataFrame(selected_points, crs=ALL_DATA_log.crs)
test_gdf = selected_points_gdf

sampled_uids = test_gdf['uID']
# The spatial join keeps the row labels of ALL_DATA_log, so the validation
# rows are excluded by index label. Matching the index against the 'uID'
# *values* (as before) compares two unrelated quantities and leaves
# validation rows inside the training set.
train_gdf = ALL_DATA_log.loc[~ALL_DATA_log.index.isin(test_gdf.index)]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
test_gdf.plot(ax=ax, color='green', edgecolors='None', label='VALIDATION')
train_gdf.plot(ax=ax, color='red',edgecolors= 'None' ,label='TRAINTEST')

#### RANDOMISED SPLITTING

In [None]:
# Plain random 80/20 split of the log-transformed data with a fixed seed.
# This overwrites any train_gdf / test_gdf produced by an earlier split cell.
train_gdf, test_gdf = train_test_split(ALL_DATA_log, test_size=0.2, random_state=200)

In [None]:
# (rows, columns) of the training split.
train_gdf.shape

In [None]:
# (rows, columns) of the test split.
test_gdf.shape

In [None]:
# Columns that must not be used as predictors for the training matrix.
# Several of these names are never created in this notebook, so a strict
# drop raises KeyError when they are absent; errors='ignore' removes the
# ones that exist and skips the rest.
train_non_feature_cols = ['temp_adjust','temp_crt','Elevation','geometry','Temp_Elevation', 'ISI_rf_pred', 'Temp_Elevation1','pointid','LST_20210228_12h44','LST_20210312_19h41', 'NDWI']
X_train_gdf = train_gdf.drop(columns=train_non_feature_cols, errors='ignore')
# Target: the elevation-adjusted temperature column.
y_train_gdf = train_gdf['Temp_Elevation']

In [None]:
# Columns that must not be used as predictors for the test matrix.
# Several of these names are never created in this notebook, so a strict
# drop raises KeyError when they are absent; errors='ignore' removes the
# ones that exist and skips the rest.
test_non_feature_cols = ['temp_adjust','temp_crt','geometry','Elevation','Temp_Elevation', 'ISI_rf_pred', 'Temp_Elevation1','pointid','LST_20210228_12h44','LST_20210312_19h41','NDWI']
X_test_gdf = test_gdf.drop(columns=test_non_feature_cols, errors='ignore')
# Target: the elevation-adjusted temperature column.
y_test_gdf = test_gdf['Temp_Elevation']

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler learns mean/variance from the training
# set only; the test set is transformed with those same statistics.
# Re-fitting on the test set would scale the two sets differently and leak
# test-set information into the evaluation.
scaler = StandardScaler()
X_train_gdf_scaled = scaler.fit_transform(X_train_gdf)
X_test_gdf_scaled = scaler.transform(X_test_gdf)

## RF Hyperparameter Tuning and Training

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV


# Randomised hyper-parameter search for a random forest regressor:
# 50 sampled configurations, 5-fold cross-validation, scored by (negated) MAE.
rf = RandomForestRegressor(oob_score=True, random_state=229)
param_dist = {
    "n_estimators": [200, 500, 800, 1000],
    "max_depth": [20, 30, 40, None],
    "max_features": ["sqrt", "log2", 0.5, 0.7, None],
    "min_samples_split": [5, 10, 20, 30],
    "min_samples_leaf": [1, 4, 10, 20],
    "bootstrap": [True],
}

n_iter_search = 50
random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        cv=5,
        scoring="neg_mean_absolute_error",
        n_jobs=-1,
        verbose=0,
        random_state=42,
    )

random_search.fit(X_train_gdf, y_train_gdf)
best_params = random_search.best_params_
# The scorer is "neg_mean_absolute_error", so best_score_ holds the negated
# MAE; flip the sign to report a positive error. (`best_score` was previously
# printed without ever being assigned.)
best_score = -random_search.best_score_

# Refit a forest with the winning parameters on the full training set.
best_rf = RandomForestRegressor(**best_params, oob_score=True, random_state=12)
best_rf.fit(X_train_gdf, y_train_gdf)

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Cross-Validation MAE: {best_score:.4f}")
print(f"OOB Score: {best_rf.oob_score_:.4f}")



In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# In-sample predictions of the tuned forest.
y_pred_train = best_rf.predict(X_train_gdf)

# Cross-validated MAE from the search (stored negated, hence abs) and the
# in-sample R² / MAE of the refitted model.
best_mae = abs(random_search.best_score_)
final_mae = mean_absolute_error(y_train_gdf, y_pred_train)
best_r2 = r2_score(y_train_gdf, y_pred_train)

report_lines = [
    f"\nBest Hyperparameters: {best_params}",
    f"Best Cross-Validation MAE: {best_mae:.4f}",
    f"OOB Score: {best_rf.oob_score_:.4f}",
    f"Best R² Score on Training Data: {best_r2:.4f}",
    f"Final MAE on Training Data: {final_mae:.4f}",
]
for report_line in report_lines:
    print(report_line)


In [5]:
# Final random forest configuration (not fitted yet), spelled out one
# hyper-parameter per line.
rf_1_settings = dict(
    n_estimators=800,
    criterion="absolute_error",
    max_depth=25,
    max_features=0.7,
    min_samples_split=8,
    min_samples_leaf=8,
    bootstrap=True,
    oob_score=True,   # keep the out-of-bag score available after fitting
    n_jobs=-1,        # use all available cores
    random_state=129,
)
RF_1 = RandomForestRegressor(**rf_1_settings)

In [None]:
# Fit the final forest on the training features as they are in X_train_gdf
# (i.e. not the StandardScaler output) and the Temp_Elevation target.
RF_1.fit(X_train_gdf, y_train_gdf)

In [None]:
import pickle

# Persist the fitted model to disk.
# NOTE(review): `file_path` is not assigned anywhere in this notebook --
# define the destination path before running this cell.
with open(file_path, 'wb') as file:
    pickle.dump(RF_1, file)

In [None]:
# In-sample metrics of RF_1.
# The predictions are computed here, on the same unscaled features the model
# was fitted on, so this cell no longer depends on a later cell for
# `y_train_pred`.
y_train_pred = RF_1.predict(X_train_gdf)
training_R2 = r2_score(y_train_gdf, y_train_pred)
print(f"training_R2: {training_R2}")
rmseTrain = math.sqrt(mean_squared_error(y_train_gdf, y_train_pred))
print(f"training_RMSE: {rmseTrain}")
# oob_score_ of a RandomForestRegressor is the out-of-bag R² (higher is
# better), not an error rate; the variable name is kept for compatibility but
# the printed label says what the number is. The model read here is RF_1 --
# `RF_FINALBOSS4_scale` is not defined in this notebook.
oob_error_rate = RF_1.oob_score_
print(f"oob_score (R²): {oob_error_rate}")

In [None]:
from sklearn.metrics import mean_absolute_error
# RF_1 is fitted on X_train_gdf (unscaled), so it must be asked to predict on
# the same representation; feeding it the standardised array would give
# meaningless predictions.
y_train_pred = RF_1.predict(X_train_gdf)
training_mae = mean_absolute_error(y_train_gdf, y_train_pred)
print(f"Training Mean Absolute Error: {training_mae}")

In [None]:
# Test-set predictions. RF_1 is fitted on unscaled features, so the unscaled
# test matrix is used here rather than the StandardScaler output.
VALFIN = RF_1.predict(X_test_gdf)
VALFIN

In [None]:
# Test-set metrics. `score` of a regressor returns R², so Testing_accuracy
# and r2VAL report the same quantity when VALFIN comes from the same inputs.
# The unscaled X_test_gdf is used because RF_1 is fitted on unscaled features.
Testing_accuracy = RF_1.score(X_test_gdf, y_test_gdf)
print(f'Testing_accuracy: {Testing_accuracy}')
r2VAL = r2_score(y_test_gdf, VALFIN)
print(f'Testing R-squared (R²) Score: {r2VAL}')
rmseVAL = math.sqrt(mean_squared_error(y_test_gdf, VALFIN))
print(f'Testing Regression RMSE: {rmseVAL}')

In [None]:
# Mean absolute error of the test-set predictions (VALFIN).
# Relies on mean_absolute_error having been imported in an earlier cell.
testing_mae = mean_absolute_error(y_test_gdf, VALFIN)
print(f"Testing Mean Absolute Error: {testing_mae}")

#### MODE PREDICTIONS OF THE TREES


In [None]:
import numpy as np
from scipy.stats import mode
from sklearn.metrics import r2_score

# Per-tree predictions on the test set, one row per tree.
# The forest is RF_1 (`RF_FINALBOSS4_scale` is not defined in this notebook)
# and it is fitted on unscaled features, so the unscaled test matrix is used.
# The individual trees are given a plain array to avoid per-tree
# feature-name warnings.
X_test_array = X_test_gdf.to_numpy()
tree_predictions = np.array([tree.predict(X_test_array) for tree in RF_1.estimators_])
# Round to 3 decimals so that a most-frequent value exists, then take the
# mode across trees for every test row.
rounded_predictions = np.round(tree_predictions, 3)
final_predictions = mode(rounded_predictions, axis=0)[0].flatten()
print("Final Predictions (Mode of Rounded Predictions):", final_predictions)
r2 = r2_score(y_test_gdf, final_predictions)
print("R² Score:", r2)

In [None]:

# Spread of the individual tree predictions for every test row.
# The forest is RF_1 (`RF_FINALBOSS4_scale` is not defined in this notebook)
# and it is fitted on unscaled features, so the unscaled test matrix is used.
X_test_array = X_test_gdf.to_numpy()
tree_predictions = np.array([tree.predict(X_test_array) for tree in RF_1.estimators_])
mean_predictions = np.mean(tree_predictions, axis=0)
p5_predictions   = np.percentile(tree_predictions, 5, axis=0)
p95_predictions  = np.percentile(tree_predictions, 95, axis=0)
median_predictions = np.median(tree_predictions, axis=0)

# One row per test sample; the interval width (95th - 5th percentile across
# trees) is used as the uncertainty measure.
results_df = pd.DataFrame({
    'Mean Predictions': mean_predictions,
    '5th Percentile': p5_predictions,
    '95th Percentile': p95_predictions,
    'Median Predictions': median_predictions,
    'Uncertainty (PI Width)': p95_predictions - p5_predictions
})



In [None]:
# Attach the per-row prediction summaries to a fresh copy of the test frame
# (assign returns a new object, leaving the split result untouched).
test_gdf = test_gdf.assign(
    mean_pred=mean_predictions,
    p5_pred=p5_predictions,
    p95_pred=p95_predictions,
    pi_width=p95_predictions - p5_predictions,
)


In [None]:
# Map the mean prediction and the prediction-interval width of the test rows.
test_gdf.plot(column="mean_pred", cmap="coolwarm", legend=True)
test_gdf.plot(column="pi_width", cmap="viridis", legend=True)


In [None]:
# HOTSPOT : Moran’s I
# Local Moran's I of "mean_pred", computed separately for every settlement.
# Settlements with fewer than `min_cells` rows get NaN statistics.
# NOTE(review): `ps` and `Moran_Local` are not imported in this notebook, and
# grid_gdf must carry "settlement_id" and "mean_pred" columns -- confirm where
# these come from.
# NOTE(review): `threshold` is also the name of the feature-count cutoff set
# in the feature-selection cell; make sure it holds the intended distance here.
results = []
min_cells = 10

for s_id, subset in grid_gdf.groupby("settlement_id"):
    # Work on an independent copy: the groups are slices of grid_gdf, and
    # adding columns to a slice triggers pandas' chained-assignment warning.
    subset = subset.copy()

    if len(subset) < min_cells:
        print(f"Skipping settlement {s_id} (too few cells: {len(subset)})")
        subset["Is"] = np.nan
        subset["q"] = np.nan
        subset["p_sim"] = np.nan
        results.append(subset)
        continue

    print(f"Running Local Moran's I for settlement {s_id} ...")
    w = ps.weights.DistanceBand.from_dataframe(subset, threshold=threshold, silence_warnings=True)

    y = subset["mean_pred"].values
    # Default arguments: the quadrant codes in `q` therefore follow the
    # library's default coding.
    moran_loc = Moran_Local(y, w)
    subset["Is"] = moran_loc.Is
    subset["q"] = moran_loc.q
    subset["p_sim"] = moran_loc.p_sim

    results.append(subset)
grid_gdf_moran = gpd.GeoDataFrame(pd.concat(results), crs=grid_gdf.crs)
def classify_hotcold(row):
    """Return the hot/cold-spot label for one Local Moran's I result row.

    ``row`` must provide ``row["q"]`` (quadrant code) and ``row["p_sim"]``
    (pseudo p-value). The quadrant codes follow the default coding of
    ``esda.Moran_Local`` (1 = High-High, 2 = Low-High, 3 = Low-Low,
    4 = High-Low), so hotspots are ``q == 1`` and coldspots are ``q == 3``.
    Testing ``q == 2`` would label Low-High outliers as coldspots.

    Returns "Hotspot (99%|95%|90%)" or "Coldspot (99%|95%|90%)" for
    p_sim < 0.01 / 0.05 / 0.10, and "Insignificant" otherwise (other
    quadrants, larger p-values, or NaN inputs).
    """
    if row["q"] == 1:  # High-High
        if row["p_sim"] < 0.01: return "Hotspot (99%)"
        elif row["p_sim"] < 0.05: return "Hotspot (95%)"
        elif row["p_sim"] < 0.10: return "Hotspot (90%)"
    elif row["q"] == 3:  # Low-Low
        if row["p_sim"] < 0.01: return "Coldspot (99%)"
        elif row["p_sim"] < 0.05: return "Coldspot (95%)"
        elif row["p_sim"] < 0.10: return "Coldspot (90%)"
    return "Insignificant"

# Label every row of the Moran result table (row-wise apply).
grid_gdf_moran["hotcold_label"] = grid_gdf_moran.apply(classify_hotcold, axis=1)

In [None]:
def classify_hotcold_numeric(row):
    """Return an integer hot/cold-spot code for one Local Moran's I result row.

    ``row`` must provide ``row["q"]`` (quadrant code) and ``row["p_sim"]``
    (pseudo p-value). The quadrant codes follow the default coding of
    ``esda.Moran_Local`` (1 = High-High, 2 = Low-High, 3 = Low-Low,
    4 = High-Low), so the Low-Low (coldspot) branch tests ``q == 3``;
    ``q == 2`` is the Low-High outlier quadrant.

    Codes: 1/2/3 = hotspot at 99/95/90 %, 4/5/6 = coldspot at 99/95/90 %,
    7 = insignificant (other quadrants, larger p-values, or NaN inputs).
    """
    if row["q"] == 1:  # High-High (Hotspot)
        if row["p_sim"] < 0.01: return 1   # Hotspot 99%
        elif row["p_sim"] < 0.05: return 2  # Hotspot 95%
        elif row["p_sim"] < 0.10: return 3  # Hotspot 90%
    elif row["q"] == 3:  # Low-Low (Coldspot)
        if row["p_sim"] < 0.01: return 4   # Coldspot 99%
        elif row["p_sim"] < 0.05: return 5  # Coldspot 95%
        elif row["p_sim"] < 0.10: return 6  # Coldspot 90%
    return 7  # Insignificant

# Integer-coded version of the hot/cold-spot classification (row-wise apply).
grid_gdf_moran["hotcold_code"] = grid_gdf_moran.apply(classify_hotcold_numeric, axis=1)


In [None]:
# Impurity-based feature importances of the fitted forest.
# The model object in this notebook is RF_1; `RF1` only exists after the
# pickle is reloaded in the prediction section further down.
importances = RF_1.feature_importances_
# feature_importances_ follows the column order the model was fitted with,
# i.e. the columns of X_train_gdf.
for feature_name, importance in zip(X_train_gdf.columns, importances):
    print(f"{feature_name}: {importance}")

In [None]:
# Two-column table pairing each feature name with its importance value.
importance_columns = {
    'Feature': X_test_gdf.columns,
    'Importance': importances,
}
feature_importances_df = pd.DataFrame(importance_columns)

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the 8 most important features.
# nlargest picks the rows with the highest importance (sorted descending);
# a plain head(8) would just take the first 8 rows of the table, which are
# in feature-column order unless the table was sorted beforehand.
data = feature_importances_df.nlargest(8, 'Importance').copy()

plt.figure(figsize=(12, 8))
# Shade the bars by importance (darker green = more important).
norm = plt.Normalize(data['Importance'].min(), data['Importance'].max())
colors = plt.cm.Greens(norm(data['Importance']))

bars = plt.barh(data['Feature'], data['Importance'],
                color=colors, edgecolor='black')
# Write the numeric importance at the end of each bar.
for bar in bars:
    plt.text(bar.get_width(),
             bar.get_y() + bar.get_height()/2,
             f'{bar.get_width():.3f}',
             va='center', fontsize=12, color='black')

plt.xlabel('Importance', fontsize=20, fontweight='bold')
plt.ylabel('Feature', fontsize=20, fontweight='bold')
plt.xticks(fontsize=15, fontweight='bold')
plt.yticks(fontsize=15, fontweight='bold')
plt.title('Gini Feature Importance: Nairobi', fontsize=24, fontweight='bold', loc='center')

plt.grid(axis='x', linestyle='--', alpha=0.7)
# Largest bar on top.
plt.gca().invert_yaxis()

plt.tight_layout()
plt.show()


In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance on the test set (30 shuffles per feature).
# The model is RF_1 (`RF1` only exists after the pickle is reloaded in the
# prediction section) and it is fitted on unscaled features, so the unscaled
# test matrix is passed in.
results = permutation_importance(RF_1, X_test_gdf, y_test_gdf, n_repeats=30, random_state=42)

feature_importances_df = pd.DataFrame({
    'Feature': X_test_gdf.columns,
    'Importance': results.importances_mean
})

# Most important first; keep the top 20 in a separate table.
feature_importances_df = feature_importances_df.sort_values('Importance', ascending=False)
feature_importances_df20 = feature_importances_df.head(20)



## PREDICTION

In [None]:
# Path of the GeoPackage holding the grid cells to predict for, and the
# GeoDataFrame read from it.
FINAL_PRED = '/content/drive/My Drive/PHD/Onekana/Modelling/Objective 3(Modelling)/Input/Predict_Grids.gpkg'
PRED_AOI = gpd.read_file(FINAL_PRED)

In [None]:
# final_gdf is another name for PRED_AOI (no copy is made here).
final_gdf = PRED_AOI
# Keep the geometry column aside as a one-column frame so it can be
# re-attached after prediction.
concatenated_gdf_GEOM= final_gdf['geometry'].to_frame()

In [None]:
# Remove the geometry column so only attribute columns are passed to the model.
final_gdf = final_gdf.drop(columns=['geometry'])

In [None]:
import pickle
# Reload the persisted model as RF1.
# NOTE(review): `file_path` is not assigned anywhere in this notebook.
# SECURITY: unpickling executes arbitrary code -- only load files you created
# yourself or otherwise trust.
with open(file_path, 'rb') as file:
    RF1 = pickle.load(file)

In [None]:
# Predict for every row of the prediction grid.
# NOTE(review): final_gdf is passed as loaded (minus geometry). The training
# features were log1p-transformed and had several columns dropped -- confirm
# the prediction file has the same columns and transformation.
predictions = RF1.predict(final_gdf)

# Kernel density estimate of the predicted values, saved as a PNG in the
# current working directory.
sns.set(style="whitegrid")
sns.kdeplot(predictions, fill=True, color="darkblue");

plt.title('PDF of Predicted Values')
plt.xlabel('Predicted Value')
plt.ylabel('Density')
plt.savefig('PDF Model1.png' , format ='png', dpi=300)
plt.show()

In [None]:
# Re-attach the geometry that was set aside before prediction and rebuild a
# GeoDataFrame. Assigning the column onto the geometry-less frame would leave
# a plain DataFrame that cannot be mapped or written as a spatial layer.
# The CRS, if any, travels with the geometry column.
final_gdf = gpd.GeoDataFrame(
    final_gdf,
    geometry=concatenated_gdf_GEOM['geometry'],
)
# Model output, one value per grid cell (same row order as the input).
final_gdf['predicted_value'] = predictions