<a href="https://colab.research.google.com/github/StratagemGIS/notebooks/blob/main/projects/39_performing_rf_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Regression Analysis: Performing Random Forest Regression Using ArcGIS Pro**

Vaasudevan Srinivasan 🧑🏻‍💻  
StratagemGIS Solutions

Reference: <https://www.esri.com/training/catalog/6410be434d750615175b2b53/>

In [None]:
import fiona
import geopandas as gpd
import pandas as pd
import pooch
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Directory the archive is unpacked into (Colab's default working directory).
EXTRACT_DIR = '/content'

# Download the course archive, verify it against the pinned hash, and unzip it.
# NOTE: because a processor is supplied, pooch.retrieve returns the processor's
# result -- for pooch.Unzip that is the list of extracted file paths, not the
# path of the downloaded zip.
zip_file = pooch.retrieve(
    'https://github.com/StratagemGIS/datasets/raw/main/esri/zip/RegressForestRandom.zip',
    known_hash='84b4c8fafdb3043ed2aaaf49f3ff5da2d5a65f35dc168ddcfec2ac46b7e688e2',
    processor=pooch.Unzip(extract_dir=EXTRACT_DIR)
)

# Anchor the geodatabase path to the extraction directory so the cells below
# do not silently depend on the kernel's current working directory.
gdb_file = f'{EXTRACT_DIR}/RegressForestRandom/RegressionModelMap.gdb'

Downloading data from 'https://github.com/StratagemGIS/datasets/raw/main/esri/zip/RegressForestRandom.zip' to file '/root/.cache/pooch/3a470b9bf3417bd8785dd38a30593563-RegressForestRandom.zip'.
Unzipping contents of '/root/.cache/pooch/3a470b9bf3417bd8785dd38a30593563-RegressForestRandom.zip' to '/content'


In [None]:
# Show the names of all layers stored in the geodatabase.
print(fiona.listlayers(gdb_file))

['Large_Water_Body', 'result_station_data_no_missing', 'station_data_no_missing', 'tl_2018_us_county', 'US_polygon']


In [None]:
# Load the station layer by name instead of by positional index: an index
# would silently select a different layer if the geodatabase's layer order
# changed. 'station_data_no_missing' is the layer at index 2 in the layer
# listing printed by the previous cell.
stations = gpd.read_file(gdb_file, layer='station_data_no_missing')

In [None]:
# Every column that is not dropped here is used as a predictor;
# T_MONTHLY is the regression target.
variables = (
    stations.drop(
        columns=['WBANNO', 'LST_YRMO', 'LST_YRMO_Converted',
                 'T_MONTHLY', 'geometry']
    ).columns
)

x, y = stations[variables].values, stations['T_MONTHLY'].values

# Hold out 10% of the rows for evaluation. The split is seeded (same seed as
# the forest) so the train/test partition -- and therefore the reported
# importances and score -- are reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

In [None]:
# Build and train the random forest in one step: 100 trees, depth capped at 31,
# seeded for repeatability, trained on all available cores (n_jobs=-1).
# oob_score=True additionally computes an out-of-bag estimate during fitting.
rf = RandomForestRegressor(
    n_estimators=100, max_depth=31,
    oob_score=True, random_state=42, n_jobs=-1,
).fit(x_train, y_train)  # fit() returns the estimator itself

# Display the fitted estimator as the cell's output.
rf

In [None]:
# Pair each predictor name with its importance from the fitted forest and
# show the five highest-ranked predictors.
(
    pd.DataFrame({
        'Variable': variables,
        'Importance': rf.feature_importances_,
    })
    .sort_values(by='Importance', ascending=False)
    .head(5)
)

Unnamed: 0,Variable,Importance
27,tasmax,0.816891
19,rlutcs,0.042256
14,ps,0.023899
22,rsdt,0.009474
31,tauv,0.008182


In [None]:
# Coefficient of determination (R^2) of the forest on the held-out test rows.
rf.score(X=x_test, y=y_test)

0.9204130811248918