In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
data = pd.read_csv('data/GLDAS_training_data.csv')
data

Unnamed: 0,system:index,GWS,VH,VV,elevation,slope,geo
0,+444-12_0,967388,0.405442,0.394813,0.038251,0.000265,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
1,+444-7_0,1445203,0.426488,0.429302,0.098361,0.006801,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
2,+444-8_0,1351189,0.414319,0.425277,0.085330,0.001630,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
3,+445-10_0,896596,0.412824,0.447019,0.047079,0.000468,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
4,+445-11_0,823435,0.424786,0.419317,0.044557,0.000239,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
...,...,...,...,...,...,...,...
184,+461-4_0,1955011,0.425445,0.436948,0.073981,0.001262,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
185,+461-5_0,1900232,0.421689,0.427566,0.074821,0.001084,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
186,+461-6_0,1863659,0.414085,0.419250,0.083228,0.007087,"{""geodesic"":false,""type"":""Point"",""coordinates""..."
187,+461-7_0,1859649,0.422525,0.427543,0.116015,0.003079,"{""geodesic"":false,""type"":""Point"",""coordinates""..."


In [4]:
# Extract the features and target variable, excluding the .geo column
X = data[['VH', 'VV', 'elevation', 'slope']]
y = data['GWS']

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [6]:
# Train a Random Forest model
model = RandomForestRegressor(
    n_estimators=500,  # Number of trees
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_features='sqrt',  # Number of features to consider when looking for the best split
    random_state=42  # Seed for reproducibility
)

In [7]:
model.fit(X_train, y_train)

In [8]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Absolute Percentage Error (MAPE)
from sklearn.metrics import mean_absolute_percentage_error
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'MAPE: {mape}')

# Optionally, calculate other metrics
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'MSE: {mse}')
print(f'R2 Score: {r2}')

MAPE: 0.24940422927173858
MSE: 170354289160.87518
R2 Score: 0.4379435048235367


In [9]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 300, 500, 700],
    'max_features': ['auto', 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [10]:
# Create a Grid Search object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'bootstrap': True, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
182 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Asus Vivobook\anaconda3\envs\gws-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Asus Vivobook\anaconda3\envs\gws-env\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Users\Asus Vivobook\anaconda3\envs\gws-env\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Asus Vivobook\anaconda3\envs\gws-env\Lib\site-packages\sklearn\utils\_par

In [11]:
# Train the model with the best parameters
best_model = RandomForestRegressor(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Predict on the test set
y_pred_best = best_model.predict(X_test)

# Calculate performance metrics
mape_best = mean_absolute_percentage_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best MAPE: {mape_best}")
print(f"Best MSE: {mse_best}")
print(f"Best R2 Score: {r2_best}")

Best MAPE: 0.24759576932646454
Best MSE: 167725082612.00824
Best R2 Score: 0.4466181476824296


In [22]:
# Combine predictions with test data
test_data_with_predictions = X_test.copy()
test_data_with_predictions['GWS_true'] = y_test
test_data_with_predictions['GWS_pred'] = y_pred
test_data_with_predictions['geo'] = data.loc[test_data_with_predictions.index, 'geo']

# Display the first few rows
test_data_with_predictions.head()

# Save the test data with predictions to a CSV file
test_data_with_predictions.to_csv('data/GLDAS_test_data_with_predictions.csv', index=False)

In [14]:
import json

# Extract coordinates from the .geo column
def extract_coordinates(geo_str):
    geo_dict = json.loads(geo_str)
    return geo_dict['coordinates'][::-1]  # Reverse to (lat, lon)

data['coordinates'] = data['geo'].apply(extract_coordinates)

# Calculate the center of the coordinates
lats = [coord[0] for coord in data['coordinates']]
lons = [coord[1] for coord in data['coordinates']]
center_lat = sum(lats) / len(lats)
center_lon = sum(lons) / len(lons)

print(f'Center coordinates: ({center_lat}, {center_lon})')

Center coordinates: (-1.6053016406293648, 113.4370499302251)


In [21]:
import json
import folium
from folium.plugins import MarkerCluster

# Initialize a folium map with the calculated center coordinates
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Function to convert GeoJSON string to a dictionary
def geojson_to_dict(geojson_str):
    return json.loads(geojson_str)

# Create a marker cluster
marker_cluster = MarkerCluster(name='GWS Comparison').add_to(m)

# Add GWS values to the map
for idx, row in test_data_with_predictions.iterrows():
    geo_dict = geojson_to_dict(row['geo'])
    coords = geo_dict['coordinates'][::-1]  # Reverse coordinates to (lat, lon)
    popup_text = f"True GWS: {row['GWS_true']:.2f}<br>Predicted GWS: {row['GWS_pred']:.2f}"
    folium.CircleMarker(
        location=coords,
        radius=5,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=0.6,
        popup=popup_text
    ).add_to(marker_cluster)

# Add layer control to the map
folium.LayerControl().add_to(m)

# Save the map to an HTML file
m.save('gws_comparison_map.html')
print("Map saved successfully.")

Map saved successfully.
