### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import tensorflow as tf
from hyperopt import tpe, hp, fmin, space_eval

In [2]:
# Never elide columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [3]:
# Read the input CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("weather_processed.csv")

In [4]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis="index", how="any")

In [5]:
# Cast the session-time column to an integer dtype; all other columns keep theirs.
df = df.astype({'M_SESSION_TIME': int})

In [6]:
# One DataFrame per M_SESSION_UID group, keeping only sessions with at least
# 50 rows. Each kept session has its M_SESSION_UID overwritten with its
# position in the groupby iteration (positions of skipped sessions are not
# reused, so the resulting ids may have gaps).
series = [
    session_rows.assign(M_SESSION_UID=position)
    for position, (_, session_rows) in enumerate(df.groupby(by='M_SESSION_UID'))
    if len(session_rows) >= 50
]

In [7]:
# Split at session granularity (80/20, fixed seed) so that all rows of a
# session land on the same side, then stack each side into one DataFrame.
train_sessions, valid_sessions = train_test_split(series, test_size=0.2, random_state=1)
train = pd.concat(train_sessions)
valid = pd.concat(valid_sessions)

In [8]:
# Fit the standardisation on the training rows only and apply the same
# transform to both splits. transform() returns a bare array, so the column
# labels are restored from df.
scaler = StandardScaler().fit(train)
scaled_train = pd.DataFrame(scaler.transform(train), columns=df.columns)
scaled_valid = pd.DataFrame(scaler.transform(valid), columns=df.columns)

In [9]:
# Target columns: rain percentage and forecast weather for each horizon
# suffix (5, 10, 15, 30, 45, 60).
y_columns = [
    # rain percentage
    'M_RAIN_PERCENTAGE_5',
    'M_RAIN_PERCENTAGE_10',
    'M_RAIN_PERCENTAGE_15',
    'M_RAIN_PERCENTAGE_30',
    'M_RAIN_PERCENTAGE_45',
    'M_RAIN_PERCENTAGE_60',
    # forecast weather
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_5',
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_10',
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_15',
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_30',
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_45',
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_60',
]

# Input feature columns. The order is kept exactly as originally written
# because it defines the feature axis of the model input.
# NOTE(review): 'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_45' is listed here AND in
# y_columns, unlike the weather columns of every other horizon - confirm this
# overlap is intended.
x_columns = [
    # session / current conditions
    'M_SESSION_UID',
    'M_SESSION_TIME',
    'TIMESTAMP',
    'M_TRACK_ID',
    'M_TRACK_TEMPERATURE',
    'M_AIR_TEMPERATURE',
    'M_WEATHER',
    # suffix 5
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_5',
    'M_TRACK_TEMPERATURE_CHANGE_5',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_5',
    'M_AIR_TEMPERATURE_CHANGE_5',
    # suffixes 10 and 15 (interleaved in the original ordering)
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_10',
    'M_TRACK_TEMPERATURE_CHANGE_10',
    'M_AIR_TEMPERATURE_CHANGE_10',
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_15',
    'M_TRACK_TEMPERATURE_CHANGE_15',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_10',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_15',
    'M_AIR_TEMPERATURE_CHANGE_15',
    # suffix 30
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_30',
    'M_TRACK_TEMPERATURE_CHANGE_30',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_30',
    'M_AIR_TEMPERATURE_CHANGE_30',
    # suffix 45
    'M_WEATHER_FORECAST_SAMPLES_M_WEATHER_45',
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_45',
    'M_TRACK_TEMPERATURE_CHANGE_45',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_45',
    'M_AIR_TEMPERATURE_CHANGE_45',
    # suffix 60
    'M_WEATHER_FORECAST_SAMPLES_M_TRACK_TEMPERATURE_60',
    'M_TRACK_TEMPERATURE_CHANGE_60',
    'M_WEATHER_FORECAST_SAMPLES_M_AIR_TEMPERATURE_60',
    'M_AIR_TEMPERATURE_CHANGE_60',
]

In [10]:
# Training inputs and targets, both taken from the scaled training frame.
x_train = scaled_train[x_columns]
y_train = scaled_train[y_columns]

In [11]:
# Validation inputs and targets, both taken from the scaled validation frame.
x_valid = scaled_valid[x_columns]
y_valid = scaled_valid[y_columns]

In [12]:
# Shape of one model input sample: n_input consecutive rows, each with one
# value per feature column.
n_features = len(x_columns)
n_input = 50

In [21]:
# Sliding-window batch generators over the training and validation arrays,
# using identical window length and batch size for both.
# NOTE(review): each array is the concatenation of many sessions, and the
# generator slides over it as one sequence, so some windows can straddle the
# boundary between two sessions - confirm this is acceptable.
TimeseriesGenerator = tf.keras.preprocessing.sequence.TimeseriesGenerator
window_settings = dict(length=n_input, batch_size=10000)
train_ts = TimeseriesGenerator(x_train.to_numpy(), y_train.to_numpy(), **window_settings)
valid_ts = TimeseriesGenerator(x_valid.to_numpy(), y_valid.to_numpy(), **window_settings)

In [22]:
def build_model(lr, dense_layers, gru_layers, first_gru_neurons_num, first_dense_neurons_num, dropout, n_outputs=12):
    """Build and compile a stacked GRU network followed by dense layers.

    Layer ``i`` (0-based) of each stack gets ``int(first_*_neurons_num / (i + 1))``
    units, so both stacks narrow as they deepen. The input shape is taken from
    the notebook-level ``n_input`` and ``n_features``.

    Args:
        lr: Learning rate for the Adam optimizer.
        dense_layers: Number of hidden Dense layers; each is followed by a
            Dropout layer.
        gru_layers: Number of stacked GRU layers (must be at least 1).
        first_gru_neurons_num: Units in the first GRU layer.
        first_dense_neurons_num: Units in the first hidden Dense layer.
        dropout: Rate used both as the GRU ``recurrent_dropout`` and for the
            Dropout layers.
        n_outputs: Width of the final Dense layer. Defaults to 12.

    Returns:
        The compiled Sequential model (MSE loss). Its summary is printed as a
        side effect.

    Raises:
        ValueError: If ``gru_layers`` is less than 1.
    """
    if gru_layers < 1:
        raise ValueError("gru_layers must be at least 1")

    model = tf.keras.models.Sequential()

    last_gru_idx = gru_layers - 1
    for layer_idx in range(gru_layers):
        gru_kwargs = {
            'recurrent_dropout': dropout,
            # Only the last GRU collapses the sequence; every earlier one must
            # hand the full sequence to the next recurrent layer.
            'return_sequences': bool(layer_idx != last_gru_idx),
        }
        if layer_idx == 0:
            # Declare the input shape on the first layer even when it is also
            # the last one. Previously a single-GRU model got no input shape,
            # so it was still unbuilt when model.summary() ran below, which
            # raises ValueError.
            gru_kwargs['input_shape'] = (n_input, n_features)
        model.add(tf.keras.layers.GRU(int(first_gru_neurons_num / (layer_idx + 1)), **gru_kwargs))

    for layer_idx in range(dense_layers):
        # No activation argument is passed to these hidden Dense layers.
        model.add(tf.keras.layers.Dense(int(first_dense_neurons_num / (layer_idx + 1))))
        model.add(tf.keras.layers.Dropout(dropout))
    model.add(tf.keras.layers.Dense(n_outputs))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='mse')
    model.summary()
    return model


In [23]:
def objective(params):
    """Hyperopt objective: train one candidate model and score it.

    Args:
        params: Mapping of keyword arguments for ``build_model``.

    Returns:
        The lowest validation loss recorded during training.
    """
    # Release the Keras global state left behind by earlier trials so that
    # memory use does not keep growing over the course of the search.
    tf.keras.backend.clear_session()

    model = build_model(**params)

    # Single source of truth for the epoch budget (the former unused local said
    # 100 while fit() was actually called with 120).
    epochs = 120
    # Stop after 20 epochs without val_loss improvement and roll back to the
    # best weights seen.
    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    history = model.fit(train_ts, epochs=epochs, validation_data=valid_ts, use_multiprocessing=True, callbacks=[es])
    return np.min(history.history['val_loss'])

# Hyperparameter search space. The keys match build_model's parameter names
# because objective() expands the sampled dict with **params.
search_space = dict(
    dense_layers=hp.randint('dense_layers', 1, 3),
    gru_layers=hp.randint('gru_layers', 1, 4),
    first_gru_neurons_num=hp.randint('first_gru_neurons_num', 50, 1000),
    first_dense_neurons_num=hp.randint('first_dense_neurons_num', 50, 300),
    # Learning rate sampled log-uniformly between 1e-4 and 1e-3.
    lr=hp.loguniform('lr', np.log(0.0001), np.log(0.001)),
    dropout=hp.choice('dropout', [0.0, 0.1, 0.2, 0.3, 0.5]),
)

In [None]:
# Run 50 TPE-guided evaluations of objective() over the search space.
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
)

# fmin reports hp.choice parameters as the index of the selected option, not
# its value (so 'dropout' would print as 0-4 rather than a rate). space_eval
# maps the result back onto the search space to recover the real values.
best_params = space_eval(search_space, best)
print(best_params)

Model: "sequential_2"                                 
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_2 (GRU)                 (None, 50, 732)           1682136   
 gru_3 (GRU)                 (None, 366)               1207800   
 dense_5 (Dense)             (None, 293)               107531    
 dropout_3 (Dropout)         (None, 293)               0         
 dense_6 (Dense)             (None, 12)                3528      
Total params: 3,000,995                               
Trainable params: 3,000,995                           
Non-trainable params: 0                               
_________________________________________________________________
Epoch 1/120                                           
 - ETA: 45s - loss: 0.3766                            
                                                     
 - ETA: 23s - loss: 0.3908                            
                                 