In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna, joblib

In [2]:
sc = StandardScaler()

In [3]:
### create an objective function 
def objective(
    trial #this is the ONLY arguement is default and a must
    ):
    
    #get data
    df_train = pd.read_csv('train_data.csv')
    df_test = pd.read_csv('test_data.csv')
    df_train.columns = ['Aerosol', 
              'Blue', 
              'Green', 
              'Red', 
              'NIR', 
              'SWIR-1', 
              'SWIR-2',
              'Chl-a']
              
    df_test.columns = ['Aerosol', 
              'Blue', 
              'Green', 
              'Red', 
              'NIR', 
              'SWIR-1', 
              'SWIR-2',
              'Chl-a']
    
    #separate X and y
    # and compute NDVI
    X_train, y_train = df_train.iloc[:,:-1], df_train.iloc[:,-1]
    X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
    
    #add NDVI as features
    X_train['NDVI'] = (X_train['NIR'] - X_train['Red']) / (X_train['NIR'] + X_train['Red']) 
    X_test['NDVI'] = (X_test['NIR'] - X_test['Red']) / (X_test['NIR'] + X_test['Red']) 
    
    X_train_sc = sc.fit_transform(X_train)
    X_test_sc = sc.transform(X_test)
    
    dtrain = lgb.Dataset(X_train_sc, label = y_train)
    dvalid = lgb.Dataset(X_test_sc, label = y_test)
    
    # in optuna, setting of parameters is as follows:
    '''
    dictionary = {
    hyperparameter_alias: trial.suggest_distribution(
                           'hyperparameter_alias',
                           ['distribution values]'
                           )
    }    
    
    distribution to choose from:
    
    uniform — float values
    loguniform — float values
    discrete_uniform — float values with intervals
    int — integer values
    categorical — categorical values from a list
    '''
    
    #example:
    params = {
        'task': 'train',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'rf']),
        'metric': 'rmse',
        'objective': trial.suggest_categorical('objective', ['regression_l1', 'regression_l2']),
        'verbosity': -1,
        "seed": 42,
        "learning_rate": trial.suggest_loguniform('learning_rate', 0.05, 1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.1, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.1, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }        

    model = lgb.train(
            params, 
            dtrain, 
            valid_names = ['eval', 'train'], 
            valid_sets = [dvalid, dtrain],
            early_stopping_rounds = 500,
            num_boost_round  = 10000,
            )

    joblib.dump(model, 'OPTIMIZED_MODEL.sav') #save model for future use

    prediction = model.predict(X_test_sc) #apply to test
    
    rmse = np.sqrt(mean_squared_error(y_test, prediction))
    return rmse

In [4]:
#optimize hyper param
study = optuna.create_study(direction = 'minimize') # minimize for regression, maximize for classification
study.optimize(objective, n_trials = 5)

[32m[I 2021-03-03 15:36:49,516][0m A new study created in memory with name: no-name-c7dab3e8-d28d-4543-a095-90cf02667754[0m


[1]	train's rmse: 2.99506	eval's rmse: 3.00353
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 2.93015	eval's rmse: 2.93234
[3]	train's rmse: 2.61272	eval's rmse: 2.69925
[4]	train's rmse: 2.64666	eval's rmse: 2.69467
[5]	train's rmse: 2.68301	eval's rmse: 2.7087
[6]	train's rmse: 2.63863	eval's rmse: 2.67335
[7]	train's rmse: 2.66574	eval's rmse: 2.68692
[8]	train's rmse: 2.59698	eval's rmse: 2.63718
[9]	train's rmse: 2.56849	eval's rmse: 2.61073
[10]	train's rmse: 2.57669	eval's rmse: 2.62344
[11]	train's rmse: 2.57865	eval's rmse: 2.62074
[12]	train's rmse: 2.58407	eval's rmse: 2.63854
[13]	train's rmse: 2.58818	eval's rmse: 2.64217
[14]	train's rmse: 2.5963	eval's rmse: 2.64969
[15]	train's rmse: 2.59481	eval's rmse: 2.65083
[16]	train's rmse: 2.56124	eval's rmse: 2.6343
[17]	train's rmse: 2.56211	eval's rmse: 2.62984
[18]	train's rmse: 2.56913	eval's rmse: 2.63306
[19]	train's rmse: 2.5744	eval's rmse: 2.64042
[20]	train's rmse: 2.57635	eval's rmse:

[32m[I 2021-03-03 15:36:50,270][0m Trial 0 finished with value: 2.6099139499408865 and parameters: {'boosting_type': 'rf', 'objective': 'regression_l1', 'learning_rate': 0.14151953592301206, 'lambda_l1': 1.0686240494610734e-07, 'lambda_l2': 4.229676225427452e-06, 'num_leaves': 152, 'feature_fraction': 0.5365645293810755, 'bagging_fraction': 0.9957567956761502, 'bagging_freq': 8, 'min_child_samples': 53}. Best is trial 0 with value: 2.6099139499408865.[0m



[435]	train's rmse: 2.61671	eval's rmse: 2.67808
[436]	train's rmse: 2.61683	eval's rmse: 2.67834
[437]	train's rmse: 2.6169	eval's rmse: 2.67828
[438]	train's rmse: 2.61709	eval's rmse: 2.67828
[439]	train's rmse: 2.61751	eval's rmse: 2.67868
[440]	train's rmse: 2.61769	eval's rmse: 2.67869
[441]	train's rmse: 2.6166	eval's rmse: 2.67691
[442]	train's rmse: 2.61684	eval's rmse: 2.67704
[443]	train's rmse: 2.61713	eval's rmse: 2.67731
[444]	train's rmse: 2.61742	eval's rmse: 2.67758
[445]	train's rmse: 2.61767	eval's rmse: 2.67772
[446]	train's rmse: 2.61764	eval's rmse: 2.67799
[447]	train's rmse: 2.61751	eval's rmse: 2.67807
[448]	train's rmse: 2.61773	eval's rmse: 2.67814
[449]	train's rmse: 2.61792	eval's rmse: 2.67806
[450]	train's rmse: 2.61801	eval's rmse: 2.67823
[451]	train's rmse: 2.61832	eval's rmse: 2.67856
[452]	train's rmse: 2.61879	eval's rmse: 2.67896
[453]	train's rmse: 2.61891	eval's rmse: 2.67899
[454]	train's rmse: 2.61902	eval's rmse: 2.67902
[455]	train's rmse: 2

[32m[I 2021-03-03 15:36:58,988][0m Trial 1 finished with value: 2.1639710216717334 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l1', 'learning_rate': 0.2578575675088548, 'lambda_l1': 0.003711179824750167, 'lambda_l2': 1.7350168837523894e-08, 'num_leaves': 153, 'feature_fraction': 0.3624277916271902, 'bagging_fraction': 0.5453585923122038, 'bagging_freq': 5, 'min_child_samples': 64}. Best is trial 1 with value: 2.1639710216717334.[0m


[1]	train's rmse: 3.58203	eval's rmse: 3.50544
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.2044	eval's rmse: 3.18722
[3]	train's rmse: 2.97531	eval's rmse: 2.99572
[4]	train's rmse: 2.88704	eval's rmse: 2.93322
[5]	train's rmse: 2.8143	eval's rmse: 2.86723
[6]	train's rmse: 2.77763	eval's rmse: 2.84595
[7]	train's rmse: 2.69616	eval's rmse: 2.75291
[8]	train's rmse: 2.6386	eval's rmse: 2.71748
[9]	train's rmse: 2.5524	eval's rmse: 2.64721
[10]	train's rmse: 2.51426	eval's rmse: 2.60158
[11]	train's rmse: 2.47495	eval's rmse: 2.57001
[12]	train's rmse: 2.45898	eval's rmse: 2.55332
[13]	train's rmse: 2.43192	eval's rmse: 2.52514
[14]	train's rmse: 2.41912	eval's rmse: 2.52235
[15]	train's rmse: 2.39499	eval's rmse: 2.49995
[16]	train's rmse: 2.38404	eval's rmse: 2.49579
[17]	train's rmse: 2.36952	eval's rmse: 2.48341
[18]	train's rmse: 2.35913	eval's rmse: 2.47723
[19]	train's rmse: 2.34757	eval's rmse: 2.47617
[20]	train's rmse: 2.33414	eval's rmse:

[32m[I 2021-03-03 15:37:13,354][0m Trial 2 finished with value: 2.189458839882471 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l1', 'learning_rate': 0.4228855757012985, 'lambda_l1': 6.648905948816577, 'lambda_l2': 2.2825630895907307e-08, 'num_leaves': 40, 'feature_fraction': 0.6928480649261152, 'bagging_fraction': 0.9926194371857651, 'bagging_freq': 15, 'min_child_samples': 92}. Best is trial 1 with value: 2.1639710216717334.[0m


[1]	train's rmse: 4.5421	eval's rmse: 4.43202
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 4.36503	eval's rmse: 4.26119
[3]	train's rmse: 4.16768	eval's rmse: 4.08058
[4]	train's rmse: 4.03707	eval's rmse: 3.95767
[5]	train's rmse: 3.94107	eval's rmse: 3.89539
[6]	train's rmse: 3.85933	eval's rmse: 3.8411
[7]	train's rmse: 3.78582	eval's rmse: 3.7914
[8]	train's rmse: 3.72408	eval's rmse: 3.76036
[9]	train's rmse: 3.66671	eval's rmse: 3.72632
[10]	train's rmse: 3.54834	eval's rmse: 3.61622
[11]	train's rmse: 3.5058	eval's rmse: 3.59854
[12]	train's rmse: 3.47071	eval's rmse: 3.57892
[13]	train's rmse: 3.43534	eval's rmse: 3.5705
[14]	train's rmse: 3.40941	eval's rmse: 3.56072
[15]	train's rmse: 3.38526	eval's rmse: 3.55819
[16]	train's rmse: 3.32556	eval's rmse: 3.49835
[17]	train's rmse: 3.31106	eval's rmse: 3.4937
[18]	train's rmse: 3.26341	eval's rmse: 3.44839
[19]	train's rmse: 3.24262	eval's rmse: 3.44182
[20]	train's rmse: 3.18852	eval's rmse: 3

[32m[I 2021-03-03 15:37:24,233][0m Trial 3 finished with value: 3.1715428719476653 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l1', 'learning_rate': 0.08114413225108405, 'lambda_l1': 0.06304816274492346, 'lambda_l2': 0.17001261942495663, 'num_leaves': 235, 'feature_fraction': 0.1659365312208716, 'bagging_fraction': 0.9433092146274807, 'bagging_freq': 8, 'min_child_samples': 25}. Best is trial 1 with value: 2.1639710216717334.[0m


[1]	train's rmse: 4.23751	eval's rmse: 4.10901
Training until validation scores don't improve for 500 rounds
[2]	train's rmse: 3.99279	eval's rmse: 3.8819
[3]	train's rmse: 3.66994	eval's rmse: 3.57949
[4]	train's rmse: 3.42076	eval's rmse: 3.35817
[5]	train's rmse: 3.21388	eval's rmse: 3.16978
[6]	train's rmse: 3.05906	eval's rmse: 3.038
[7]	train's rmse: 2.93422	eval's rmse: 2.93618
[8]	train's rmse: 2.81831	eval's rmse: 2.84615
[9]	train's rmse: 2.73682	eval's rmse: 2.78817
[10]	train's rmse: 2.67388	eval's rmse: 2.72904
[11]	train's rmse: 2.59944	eval's rmse: 2.669
[12]	train's rmse: 2.55626	eval's rmse: 2.62706
[13]	train's rmse: 2.5133	eval's rmse: 2.59813
[14]	train's rmse: 2.47646	eval's rmse: 2.57545
[15]	train's rmse: 2.44845	eval's rmse: 2.5481
[16]	train's rmse: 2.40079	eval's rmse: 2.51178
[17]	train's rmse: 2.38105	eval's rmse: 2.49984
[18]	train's rmse: 2.35068	eval's rmse: 2.47579
[19]	train's rmse: 2.32987	eval's rmse: 2.46225
[20]	train's rmse: 2.30495	eval's rmse: 2.

[32m[I 2021-03-03 15:37:25,197][0m Trial 4 finished with value: 2.0314945784852907 and parameters: {'boosting_type': 'gbdt', 'objective': 'regression_l2', 'learning_rate': 0.16558232876243925, 'lambda_l1': 0.03209018871646033, 'lambda_l2': 0.00016375740065050355, 'num_leaves': 7, 'feature_fraction': 0.726948224763009, 'bagging_fraction': 0.8718560974989947, 'bagging_freq': 13, 'min_child_samples': 17}. Best is trial 4 with value: 2.0314945784852907.[0m


In [5]:
study.best_trial #contains the hyperparameter settings

FrozenTrial(number=4, values=[2.0314945784852907], datetime_start=datetime.datetime(2021, 3, 3, 15, 37, 24, 234323), datetime_complete=datetime.datetime(2021, 3, 3, 15, 37, 25, 197005), params={'boosting_type': 'gbdt', 'objective': 'regression_l2', 'learning_rate': 0.16558232876243925, 'lambda_l1': 0.03209018871646033, 'lambda_l2': 0.00016375740065050355, 'num_leaves': 7, 'feature_fraction': 0.726948224763009, 'bagging_fraction': 0.8718560974989947, 'bagging_freq': 13, 'min_child_samples': 17}, distributions={'boosting_type': CategoricalDistribution(choices=('gbdt', 'rf')), 'objective': CategoricalDistribution(choices=('regression_l1', 'regression_l2')), 'learning_rate': LogUniformDistribution(high=1, low=0.05), 'lambda_l1': LogUniformDistribution(high=10.0, low=1e-08), 'lambda_l2': LogUniformDistribution(high=10.0, low=1e-08), 'num_leaves': IntUniformDistribution(high=256, low=2, step=1), 'feature_fraction': UniformDistribution(high=1.0, low=0.1), 'bagging_fraction': UniformDistributi

In [6]:
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))

print('  Params: ')
for key, value in trial.params.items():
    print('    "{}": {},'.format(key, value))

Best trial:
  Value: 2.0314945784852907
  Params: 
    "boosting_type": gbdt,
    "objective": regression_l2,
    "learning_rate": 0.16558232876243925,
    "lambda_l1": 0.03209018871646033,
    "lambda_l2": 0.00016375740065050355,
    "num_leaves": 7,
    "feature_fraction": 0.726948224763009,
    "bagging_fraction": 0.8718560974989947,
    "bagging_freq": 13,
    "min_child_samples": 17,


In [7]:
model_tuned = joblib.load('OPTIMIZED_MODEL.sav') #call saved model

In [8]:
df_test = pd.read_csv('test_data.csv')
df_test.columns = ['Aerosol', 
          'Blue', 
          'Green', 
          'Red', 
          'NIR', 
          'SWIR-1', 
          'SWIR-2',
          'Chl-a']

In [9]:
X_test, y_test = df_test.iloc[:,:-1], df_test.iloc[:,-1]
X_test['NDVI'] = (X_test['NIR'] - X_test['Red']) / (X_test['NIR'] + X_test['Red']) 

In [10]:
X_test_sc = sc.fit_transform(X_test) #standardize

In [11]:
pred_test = model_tuned.predict(X_test_sc)
rmse = np.sqrt(mean_squared_error(y_test, pred_test))
rmse

2.1926783991837997

In [12]:
r2_score(y_test, pred_test)

0.7656795538974155