In [19]:
import os
import glob

import datetime as dt

import numpy as np
import pandas as pd

from imblearn.over_sampling import BorderlineSMOTE, SMOTE

from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

# ABOUT 
__Author__: Pat McCornack

__Date__: 12/18/24

__About__: This notebook contains various configurations of models used to infer fog presence usin meteorological observations from the Pozo SMO2 site. Information on the best configuration from this notebook is then applied in ml-evaluation.ipynb to assess performance at different sites. 

# Functions

In [20]:
def label_condition(df):
    df['condition'] = 'clear'
    for index, row in df.iterrows():
        condition='clear'
        if row['fog tips'] > 0:
            df.loc[index, 'condition'] = 'fog'

        if row['rain (mm)'] > 0:
            df.loc[index, 'condition'] = 'rain'
        
        if (row['rain (mm)'] > 0) & (row['fog tips'] > 0):
            df.loc[index, 'condition'] = 'both'
    
    return df

def prep_data(src_fpath, var_subset=None):
    df = pd.read_csv(src_fpath, index_col=0)
    if var_subset:
        df = df[var_subset]
    df = df.drop('fog', axis=1)  # Use fog tips
    df = label_condition(df)
    df['time (PST)'] = pd.to_datetime(df['time (PST)'])
    df.set_index('time (PST)', inplace=True)

    return df

def run_cv(rf, X_train, y_train): 
    cv_result = cross_val_score(rf, X_train, y_train, cv=5, scoring='f1_macro')
    return cv_result

In [None]:
def ml_wrapper(src_fpath, model):
    df = prep_data(src_fpath)

    # Drop rain/both observations for now
    df= df.loc[~df['condition'].isin(['both', 'rain'])]

    # Prepare train/test sets
    X = df.drop(['fog tips', 'condition'], axis=1)
    y = df['condition']
    #X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.3, stratify=y, shuffle=True)
    X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

    # Run CV 
    cv_result = run_cv(model, X_resampled, y_resampled)
    print(cv_result)

# Main 

## Parameters

In [21]:
#### Specify parameters #### 
seed = 1234

src_datadir = '/Users/patmccornack/Documents/ucsb_fog_project/SCI_Fog_Project_Repo/data/02_clean'

var_subset = ['fog tips', 'air temperature (C)', 'relative humidity (%)', 'rain (mm)', 'wind speed (m/s)', 'wind direction (deg)', 'leaf wetness (mv)']

rf = RandomForestClassifier(
        random_state=seed
    )

In [None]:
# Read in data
src_fname = 'sci-pozo-smo2-clean-2021-2023.csv'
src_fpath = os.path.join(src_datadir, src_fname)
df = prep_data(src_fpath)

# Drop rain/both observations for now
df= df.loc[~df['condition'].isin(['both', 'rain'])]

df.head(3)

## Feature Importance
Get an idea of which features are contributing to the model. Note that there is correlation between predictors, so these results should not be taken completely at face value. 

In [None]:
#### Check Feature Importances ####
seed = 1234

rfc = RandomForestClassifier(
    random_state=seed,
    class_weight='balanced'
)

hgbc = HistGradientBoostingClassifier(
    random_state=seed,
    class_weight='balanced',
    learning_rate=0.01,
    max_iter=1000
)

X = df.drop(['fog tips', 'rain (mm)', 'condition'], axis=1).copy()
y = df['condition'].copy()

X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)
rfc.fit(X_resampled, y_resampled)
importances = rfc.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances:\n", feature_importance_df)

## Oversampling Cross Validation Results 
Check whether a simple random forest classifier or histogram-based gradient boosting classifier performs better on this dataset. Data is oversampled using BorderlineSMOTE to address class imbalance. 

In [None]:
#### Get CV results for each model type using oversampling ####
rfc = RandomForestClassifier(
    random_state=seed
)

hgbc = HistGradientBoostingClassifier(
    random_state=seed,
    learning_rate=0.01,
    max_iter=1000
)

X = df.drop(['fog tips', 'rain (mm)', 'condition'], axis=1).copy()
y = df['condition'].copy()

X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

for model in [rfc, hgbc]:
    cv_result = cross_val_score(model, X_resampled, y_resampled, cv=5, scoring='f1_macro')
    print(cv_result)


## Hyperparameter Tuning
Perform some basic tuning to attempt to improve model performance. Optimal parameters here will be applied in ml-evaluation.ipynb.

In [34]:
#### Basic RF Hyperparameter Tuning ####
rf = RandomForestClassifier(
    random_state=1234
)

param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None]
}

X = df.drop(['fog tips', 'rain (mm)', 'condition'], axis=1).copy()
y = df['condition'].copy()

X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_resampled, y_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
Best Score: 0.6802886789014736


In [35]:
#### Basic HGBC Hyperparamater Tuning ####
seed = 1234

hgb = HistGradientBoostingClassifier(
    random_state=seed
)

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [10, 20],
    'max_bins': [128, 256],
    'subsample': [0.8, 1.0]
}

X = df.drop(['fog tips', 'rain (mm)', 'condition'], axis=1).copy()
y = df['condition'].copy()

X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y)

grid_search = GridSearchCV(estimator=hgbc, param_grid=param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_resampled, y_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)



ValueError: Invalid parameter 'learning_rate' for estimator RandomForestClassifier(random_state=1234). Valid parameters are: ['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'monotonic_cst', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].