<a href="https://colab.research.google.com/github/Sizwe100/Python-Random-Forest/blob/main/RS_Index_Teleconnection_ML_and_RF_Model_2019_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# MSc Project Analysis
# Authors: Nkanyiso and Sizwe
# Institution: University of Zululand
# Date: 15/12/2024

# Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib.dates import DateFormatter, YearLocator

In [None]:
# Reading the Data
# Default location of the semicolon-separated CSV on the authors' local machine.
data_path = "C:/R/Sizwe/MonthlyRemoteSensingData3_Mtuba.csv"

try:
    # google.colab can only be imported inside a Colab runtime; anywhere else
    # the notebook falls back to reading data_path directly.
    from google.colab import files
except ImportError:
    files = None
    uploaded = {}
else:
    # Upload BEFORE reading: the local Windows path above does not exist on a
    # Colab VM, so the CSV has to be brought into the working directory first.
    uploaded = files.upload()
    expected_name = data_path.rsplit("/", 1)[-1]
    if expected_name in uploaded:
        data_path = expected_name
    elif len(uploaded) == 1:
        # A single uploaded file under another name (e.g. renamed on upload)
        # is taken to be the dataset.
        data_path = next(iter(uploaded))

mydata = pd.read_csv(data_path, sep=";")

Saving EVI.csv to EVI.csv


In [None]:
# Parse the 'date' column (year/month/day strings) into pandas datetimes
date_format = "%Y/%m/%d"
mydata['date'] = pd.to_datetime(mydata['date'], format=date_format)

# Preview the first three rows to confirm the file loaded as expected
print(mydata.head(3))

In [None]:
# Example Time Series Plot
# Explicit figure/axes objects keep every call bound to this figure instead of
# whatever pyplot currently considers the "current" axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(mydata['date'], mydata['ET'], label='Evapotranspiration (ET)')
ax.set_xlabel('Date')
# NOTE(review): the unit says "8day" but the data file name says "Monthly" --
# confirm which aggregation the ET column actually uses.
ax.set_ylabel('ET (kg/m^2/8day)')
# The title names Mtuba so it agrees with the data file being read
# (..._Mtuba.csv) and with the VCI trend plot title further down.
ax.set_title('Mtuba ET Time Series')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Data Interpolation (SPEI3 and SPI3)
# Gaps are filled with a cubic interpolant fitted through the observed values.
# Everything is done on row POSITIONS (0..n-1) rather than index labels, so the
# result stays correct even if mydata no longer has a default 0..n-1 index
# (e.g. after filtering or sorting rows).
for col in ['SPEI3', 'SPI3']:
    values = mydata[col].to_numpy(dtype=float, copy=True)
    missing = np.isnan(values)
    if not missing.any():
        continue  # column is already complete; leave it untouched

    known_pos = np.flatnonzero(~missing)
    if known_pos.size < 4:
        # A cubic interpolant needs at least four data points.
        raise ValueError(
            f"Column {col!r} has only {known_pos.size} non-missing values; "
            "at least 4 are required for cubic interpolation."
        )

    # fill_value="extrapolate" also fills leading/trailing gaps; those values
    # are extrapolated by the cubic and should be checked for plausibility.
    cubic = interp1d(known_pos, values[known_pos], kind='cubic', fill_value="extrapolate")

    # Only the missing entries are replaced, so observed values are kept exactly.
    values[missing] = cubic(np.flatnonzero(missing))
    mydata[col] = values

# Trend Analysis Example (Visualization Only)
def plot_trend(data, variable, ylabel, title):
    """Show a scatter of ``data[variable]`` against the row index with a fitted line.

    Parameters
    ----------
    data : DataFrame whose index supplies the x values.
    variable : name of the column plotted on the y axis.
    ylabel : text for the y-axis label.
    title : text for the plot title.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    marker_style = {'s': 10}
    fit_line_style = {'color': 'red'}
    # regplot draws on the current axes and hands them back, so the labels
    # below are applied to the same axes it drew on.
    ax = sns.regplot(
        x=data.index,
        y=data[variable],
        ci=None,
        scatter_kws=marker_style,
        line_kws=fit_line_style,
    )
    ax.set_xlabel('Index')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    plt.show()

plot_trend(mydata, 'VCI', 'VCI Index', 'Trend in VCI (Mtuba)')

In [None]:
# Correlation Matrix
# Select numeric columns by dtype rather than by position: this leaves out the
# date column wherever it sits in the file, and any other non-numeric column
# that would otherwise make .corr() fail.
corr_matrix = mydata.select_dtypes(include="number").corr()

# A larger canvas keeps the per-cell annotations readable.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title('Correlation Matrix Heatmap')
plt.show()

# Random Forest Model
# Assemble the predictor matrix and the target (VCI) for the regressor
predictor_columns = [
    'Nino3.4', 'DMI', 'ET', 'SPI3', 'SPEI3',
    'SAVI', 'MSAVI', 'EVI', 'RF', 'NDVI',
]
features = mydata[predictor_columns]
labels = mydata['VCI']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Fit a 300-tree random forest on the training split (fit() returns the estimator)
rf_model = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)

# Variable importance: one row per predictor, most important first
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame(
    {"Feature": features.columns, "Importance": importances}
)
feature_importance_df = feature_importance_df.sort_values("Importance", ascending=False)
print(feature_importance_df)

# Horizontal bar chart of the ranked importances
sns.barplot(data=feature_importance_df, x="Importance", y="Feature")
plt.title('Feature Importance in Random Forest')
plt.show()

In [None]:
# Predict on both splits with the fitted forest
train_preds = rf_model.predict(X_train)
test_preds = rf_model.predict(X_test)

# Goodness of fit on each split, plus root-mean-squared error on the test split
train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

for metric_label, metric_value in (
    ("Train R^2:", train_r2),
    ("Test R^2:", test_r2),
    ("Test RMSE:", test_rmse),
):
    print(metric_label, metric_value)

# Mann-Kendall trend test (placeholder for a sequential variant).
# Relies on the external pymannkendall package, imported only when called.
def seq_mk_test(data, variable):
    """Run pymannkendall's ``original_test`` on ``data[variable]``.

    Despite the name, this calls ``original_test`` rather than a sequential
    Mann-Kendall procedure.

    Parameters
    ----------
    data : mapping or DataFrame indexable by ``variable``.
    variable : key / column name of the series to test.

    Returns
    -------
    Whatever ``pymannkendall.original_test`` returns for that series.
    """
    # Imported here so the notebook still runs without pymannkendall installed
    # as long as this function is never called.
    from pymannkendall import original_test as mann_kendall

    series = data[variable]
    return mann_kendall(series)

# Example
# mk_result = seq_mk_test(mydata, 'VCI')
# print(mk_result)


         date  EVI_LaChagra   EVI_UVS   Nino3.4
0  2019/03/31      0.596020  0.456100  0.867725
1  2019/04/30      0.533050  0.475743  0.740107
2  2019/05/31      0.511700  0.418975  0.685350
3  2019/06/30      0.442189  0.389600  0.595610
4  2019/07/31      0.386030  0.368830  0.477663
