In [1]:
import os.path
from datetime import datetime as dt

import keras.utils.vis_utils
import keras_tuner as kt
import pandas as pd
import plotly.graph_objects as go
import tensorflow as tf
from IPython.display import display, clear_output
from dateutil.relativedelta import relativedelta
from keras import Sequential, layers, losses, metrics
from keras import callbacks
from keras.api.keras import optimizers

import file_helper

In [2]:
# Ask TensorFlow which GPU devices it can see; the cell displays the returned list
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# strptime pattern for day.month.year date strings such as '31.12.2021'
DATE_FORMAT = '%d.%m.%Y'

In [4]:
def convert_string_to_date(string_date: str) -> dt:
    """Parse a date string written in ``DATE_FORMAT`` into a ``datetime``.

    :param string_date: text to parse; it has to match ``DATE_FORMAT``.
    :return: the parsed ``datetime``.
    :raises ValueError: raised by ``strptime`` when the text does not match the format.
    """
    parsed_date = dt.strptime(string_date, DATE_FORMAT)
    return parsed_date

def normalize_columns_in_dataframe(data: pd.DataFrame, columns: list[str] = None) -> None:
    """Min-max scale the given columns of ``data`` to the range [0, 1], in place.

    :param data: frame to modify; the selected columns are overwritten with their scaled values.
    :param columns: names of the columns to scale. When omitted (``None``), every
        numeric column of ``data`` is scaled.
    :return: ``None`` - the result is written back into ``data``.

    A column whose values are all equal has a zero range; it is mapped to ``0.0``
    rather than to ``NaN`` (the result of dividing zero by zero).
    """
    if columns is None:
        columns = list(data.select_dtypes(include='number').columns)
    if len(columns) == 0:
        # Nothing selected, nothing to scale
        return

    data_columns = data[columns]
    min_values = data_columns.min()
    value_ranges = data_columns.max() - min_values
    # Dividing by 1 instead of 0 leaves a constant column at exactly 0.0
    value_ranges = value_ranges.where(value_ranges != 0, 1)
    data[columns] = (data_columns - min_values) / value_ranges

In [5]:
# Load the 'dubki_h_tau' sheet of the workbook whose path file_helper provides
geochem_data = pd.read_excel(
    file_helper.get_geo_chem_file_path(),
    sheet_name='dubki_h_tau',
)

In [6]:
# Read the ';'-separated events catalog and keep the rows from position 10000 onwards.
# NOTE(review): the reason for skipping the first 10000 rows is not visible here - confirm.
# The explicit .copy() makes the slice an independent frame, so the column assignment
# below is not a chained assignment on a slice of the freshly read frame.
events_catalog_data = (
    pd.read_csv(file_helper.get_events_catalog_file_path(), sep=';')
    .iloc[10000:]
    .copy()
)
# Parse the 'Date' strings so they can be compared with the datetime columns of the other frame
events_catalog_data['Date'] = pd.to_datetime(events_catalog_data['Date'], format=DATE_FORMAT)

In [7]:
# Gap between a row's 'to date' and the start of the window searched for catalog events
DATE_TARGET_OFFSET = relativedelta(months=3)
# Length of that window
DATE_TARGET_DURATION = relativedelta(months=1)
# Smallest catalog 'Class' value that counts as a target event
STARTING_EVENT_CLASS = 13

In [8]:
def split_data(date_target_offset: relativedelta, date_target_duration: relativedelta, starting_event_class, end_date: str = '31.12.2021') -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build the labelled data set and the held-back (unlabelled) tail from ``geochem_data``.

    A row is labelled 1 when ``events_catalog_data`` holds at least one event of class
    ``starting_event_class`` or higher whose date lies in the closed window
    ``['to date' + offset, 'to date' + offset + duration]``; otherwise it is labelled 0.

    :param date_target_offset: gap between a row's 'to date' and the start of its window.
    :param date_target_duration: length of the window.
    :param starting_event_class: smallest catalog 'Class' value that counts as an event.
    :param end_date: date string in ``DATE_FORMAT``; rows whose 'to date' is later than
        ``end_date - offset - duration`` go to the held-back frame instead of being labelled.
    :return: ``(preprocessed_data, reserved_data)``. ``preprocessed_data`` has a 'target'
        column and no date columns. ``reserved_data`` keeps 'to date', drops 'from date'
        and has no 'target'.
    """
    preprocessed_data = geochem_data.copy()

    # Scaling happens before the split, so min/max are taken over all rows,
    # including those that end up in the held-back frame.
    normalize_columns_in_dataframe(preprocessed_data, ['events', 'ascend', 'maximum', 'descend', 'minimum'])

    breakpoint_date = convert_string_to_date(end_date) - date_target_offset - date_target_duration
    reserved_data = preprocessed_data[breakpoint_date < preprocessed_data['to date']].drop(['from date'], axis=1)
    # Explicit copy: a new column is added below, which must not be a write to a slice
    preprocessed_data = preprocessed_data[preprocessed_data['to date'] <= breakpoint_date].copy()

    # The class filter does not depend on the row, so apply it once up front
    qualifying_event_dates = events_catalog_data.loc[
        starting_event_class <= events_catalog_data['Class'], 'Date'
    ]

    def has_event_in_window(date) -> int:
        """Return 1 if a qualifying event falls into the window that belongs to ``date``, else 0."""
        window_start = date + date_target_offset
        window_end = window_start + date_target_duration
        in_window = (window_start <= qualifying_event_dates) & (qualifying_event_dates <= window_end)
        return int(in_window.any())

    preprocessed_data['target'] = preprocessed_data['to date'].map(has_event_in_window)
    preprocessed_data = preprocessed_data.drop(['from date', 'to date'], axis=1)

    return preprocessed_data, reserved_data

In [9]:
# Label the data with the configured offset, duration and event class; the second frame holds the unlabelled tail
preprocessed_data, reserved_data = split_data(DATE_TARGET_OFFSET, DATE_TARGET_DURATION, STARTING_EVENT_CLASS)

In [10]:
# display(
#     preprocessed_data.tail(),
#     reserved_data.head()
# )
# geochem_data.head()

In [11]:
# Seed passed to DataFrame.sample so the train/test split is repeatable
RANDOM_STATE = 42
# Fraction of the labelled rows that goes into the training set
TRAIN_FRAC = .8

In [12]:
def generate_train_test(data: pd.DataFrame, train_frac: float = None, random_state: int = None):
    """Randomly split a labelled frame into train and test features/labels.

    The rows are sampled once and features and labels are both derived from that
    single sample, so they stay aligned by construction.

    :param data: frame containing the feature columns and a 'target' column.
    :param train_frac: fraction of rows used for training; defaults to ``TRAIN_FRAC``.
    :param random_state: seed for the sampling; defaults to ``RANDOM_STATE``.
    :return: ``((x_train, y_train), (x_test, y_test))`` where the ``x`` parts are frames
        without 'target' and the ``y`` parts are the matching 'target' series.
    """
    if train_frac is None:
        train_frac = TRAIN_FRAC
    if random_state is None:
        random_state = RANDOM_STATE

    train_rows = data.sample(frac=train_frac, random_state=random_state)
    # Everything that was not drawn for training becomes the test set
    test_rows = data.drop(train_rows.index)

    x_train, y_train = train_rows.drop(['target'], axis=1), train_rows['target']
    x_test, y_test = test_rows.drop(['target'], axis=1), test_rows['target']

    return (x_train, y_train), (x_test, y_test)

In [13]:
# Split the labelled rows into the train and test sets used by the cells below
(x_train, y_train), (x_test, y_test) = generate_train_test(preprocessed_data)

In [14]:
def model_builder(hp: kt.HyperParameters):
    """Build and compile the binary classifier for one point of the search space.

    The search space covers, for each of the three hidden dense layers, its
    activation and its width (8..128 in steps of 8), plus the Adam learning rate.
    The output layer has a single unit without activation, i.e. the model emits
    logits, which is why the loss is configured with ``from_logits=True``.

    :param hp: hyperparameter container supplied by the tuner.
    :return: the compiled ``Sequential`` model.
    """
    activation_options = ('linear', 'relu', 'tanh', 'sigmoid')
    # (activation hyperparameter, units hyperparameter, layer name) per hidden layer
    hidden_layer_keys = (
        ('activation', 'units', 'dense_0'),
        ('activation_1', 'units_1', 'dense_1'),
        ('activation_2', 'units_2', 'dense_2'),
    )

    # Register every hyperparameter first, in a fixed order, before any layer is created
    hidden_layer_specs = []
    for activation_key, units_key, layer_name in hidden_layer_keys:
        chosen_activation = hp.Choice(activation_key, values=list(activation_options))
        chosen_units = hp.Int(units_key, min_value=8, max_value=128, step=8)
        hidden_layer_specs.append((layer_name, chosen_units, chosen_activation))
    chosen_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential([
        layers.InputLayer((5,), name='input'),
        *[
            layers.Dense(units=units, activation=activation, name=layer_name)
            for layer_name, units, activation in hidden_layer_specs
        ],
        layers.Dense(1, name='output'),
    ])

    model.compile(
        optimizer=optimizers.Adam(learning_rate=chosen_learning_rate),
        loss=losses.BinaryCrossentropy(from_logits=True),
        metrics=[metrics.BinaryAccuracy(name='accuracy')],
    )

    return model

In [15]:
# relativedelta keeps whole years separately from months (relativedelta(months=12) has
# years=1, months=0), so the directory name is built from the total number of months.
target_offset_months = DATE_TARGET_OFFSET.years * 12 + DATE_TARGET_OFFSET.months
target_duration_months = DATE_TARGET_DURATION.years * 12 + DATE_TARGET_DURATION.months

# Hyperband tuner for the configured target window; its state lives in a directory
# named after the window, so every (offset, duration) pair keeps its own results.
tuner = kt.Hyperband(
    model_builder,
    objective='val_accuracy',
    max_epochs=200,
    factor=3,
    # Fixed seed so a fresh search is repeatable
    seed=RANDOM_STATE,
    directory=os.path.join(file_helper.get_root_path(), f'data/hypermodel/date_o_d/{target_offset_months}/{target_duration_months}'),
    project_name='geo_analysis'
)

INFO:tensorflow:Reloading Oracle from existing project C:\Users\saaru\PycharmProjects\geo\data/hypermodel/date_o_d/3/1\geo_analysis\oracle.json
INFO:tensorflow:Reloading Tuner from C:\Users\saaru\PycharmProjects\geo\data/hypermodel/date_o_d/3/1\geo_analysis\tuner0.json


In [16]:
# Early-stopping callback: watches 'val_loss' with a patience of 10 epochs
stop_early = callbacks.EarlyStopping(monitor='val_loss', patience=10)

In [17]:
# Run the hyperparameter search on the training set, validating on a 20% split.
# NOTE(review): Hyperband appears to set the epoch count of each trial itself
# (see 'tuner/epochs' in the printed hyperparameters), so `epochs=100` may have no effect - confirm.
tuner.search(x_train, y_train, epochs=100, validation_split=.2, callbacks=[stop_early])

# Keep the single best hyperparameter set found by the search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

INFO:tensorflow:Oracle triggered exit


In [18]:
def search_best_model(seed: int=None):
    """Try every target window (offset, duration) and tune a model for each of them.

    For each offset and each duration from 1 to 12 months the data is re-labelled with
    ``split_data``, a ``kt.Hyperband`` search is run on the training part, and the
    combination whose best trial has the highest score is remembered.

    :param seed: forwarded to every ``kt.Hyperband`` tuner.
    :return: ``(model, preprocessed_data, reserved_data, best_hps)`` of the winning
        combination; ``model`` is built from the winning hyperparameters and is not
        fitted inside this function.
    """
    first_month, final_month = 1, 12
    months = range(first_month, final_month + 1)

    previous_score = -1.
    top_score = -1.
    top_offset, top_duration = -1, -1

    for offset in months:
        for duration in months:
            # Progress report: replaces the previous cell output on every iteration
            clear_output(wait=True)
            print(f'{"Offset":10}: {offset}\n{"Duration":10}: {duration}')
            print(f'Best offset: {top_offset}\nBest duration: {top_duration}')
            print(f'Last score: {previous_score}')
            print(f'Best score: {top_score}')

            # Re-label the data for this window and keep only the training part
            candidate_data, candidate_reserved = split_data(
                relativedelta(months=offset),
                relativedelta(months=duration),
                STARTING_EVENT_CLASS,
            )
            (train_features, train_labels), _ = generate_train_test(candidate_data)

            # Each window gets its own tuner directory
            candidate_tuner = kt.Hyperband(
                model_builder,
                objective='val_accuracy',
                max_epochs=200,
                factor=3,
                seed=seed,
                directory=os.path.join(file_helper.get_root_path(), f'data/hypermodel/date_o_d/{offset}/{duration}'),
                project_name='geo_analysis',
            )
            candidate_tuner.search(
                train_features, train_labels,
                epochs=100,
                validation_split=.2,
                callbacks=[callbacks.EarlyStopping(monitor='val_loss', patience=10)],
            )

            previous_score = candidate_tuner.oracle.get_best_trials(num_trials=1)[0].score
            if previous_score > top_score:
                top_score = previous_score
                top_hps = candidate_tuner.get_best_hyperparameters(num_trials=1)[0]
                top_data, top_reserved = candidate_data, candidate_reserved
                top_tuner = candidate_tuner
                top_offset, top_duration = offset, duration

    model = top_tuner.hypermodel.build(top_hps)

    print(f'{"Best score":10}:{top_score}\n{"Offset":10}:{top_offset}\n{"Duration":10}:{top_duration}')

    return model, top_data, top_reserved, top_hps

In [19]:
# model, preprocessed_data, reserved_data, best_hps = search_best_model(seed=42)

In [20]:
# One "name: value" line per entry of the selected hyperparameter set, names padded to 20 characters
print('\n'.join(f'{name:20}: {value}' for name, value in best_hps.values.items()))

activation          : tanh
units               : 120
activation_1        : relu
units_1             : 72
activation_2        : tanh
units_2             : 64
learning_rate       : 0.001
tuner/epochs        : 200
tuner/initial_epoch : 67
tuner/bracket       : 2
tuner/round         : 2
tuner/trial_id      : 0228


In [21]:
# model = Sequential([
#     layers.Dense(64, name='dense_0'),
#     layers.Dense(32, activation=activations.tanh, name='dense_1'),
#     layers.Dense(16, activation=activations.tanh, name='dense_2'),
#     layers.Dense(1, name='output')
# ])
# model.compile(
#     optimizer=optimizers.Adam(name='adam'),
#     loss=losses.BinaryCrossentropy(name='binary_crossentropy', from_logits=True),
#     metrics=[metrics.BinaryAccuracy(name='accuracy')]
# )

# Build a model from the best hyperparameters found by the tuner
model = tuner.hypermodel.build(best_hps)

In [None]:
# (x_train, y_train), (x_test, y_test) = generate_train_test(preprocessed_data)

# For performance, change later model input
# train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
# test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))

# Train the model on the training set with a 20% validation split; `stop_early`
# (monitor 'val_loss', patience 10) can end the run before the 200-epoch limit.
history = model.fit(
    x_train, y_train,
    # train_dataset,
    epochs=200,
    # epochs=best_hps.get('tuner/epochs'),
    validation_split=.2,
    # verbose=0,
    callbacks=[stop_early],
)
# Number of 'val_loss' values recorded in the training history, shown as the cell output
len(history.history['val_loss'])

In [None]:
# Print the layer-by-layer summary of the trained model
model.summary()

In [None]:
# Draw the model graph with the shapes shown.
# NOTE(review): `keras.utils.vis_utils` looks like an internal module path; newer Keras
# releases expose this as `keras.utils.plot_model` - confirm against the installed version.
keras.utils.vis_utils.plot_model(model, show_shapes=True)

In [None]:
# Evaluate the model on the test split; the cell displays the value returned by `evaluate`
model.evaluate(x_test, y_test)

In [None]:
# Rows of the held-back frame whose 'to date' is on or after 1 January 2021
condition = reserved_data['to date'] >= convert_string_to_date('1.1.2021')

# Model input: the selected rows without the date column
X_pred = reserved_data.loc[condition].drop(columns=['to date'])

# The model emits logits; the sigmoid turns them into probabilities
result_pred = model.predict(X_pred)
result_proba = tf.nn.sigmoid(result_pred).numpy()

In [None]:
def _target_window_label(to_date):
    """Format the target window that belongs to ``to_date`` as '[start..end]'."""
    window_start = to_date + DATE_TARGET_OFFSET
    window_end = window_start + DATE_TARGET_DURATION
    return f'[{window_start.strftime("%d.%m.%Y")}..{window_end.strftime("%d.%m.%Y")}]'


figure_x = reserved_data.loc[condition, 'to date']
figure_x_text = figure_x.map(_target_window_label)

# Settings shared by both stacked traces
shared_trace_options = dict(
    x=figure_x,
    mode='lines',
    stackgroup='one',
    line=dict(width=.25),
    text=figure_x_text,
)
event_probability = result_proba[:, 0]

fig = go.Figure()
# The first trace of the stack group carries the percent normalisation
fig.add_trace(go.Scatter(
    y=1 - event_probability,
    groupnorm='percent',
    name='No event',
    **shared_trace_options,
))
fig.add_trace(go.Scatter(
    y=event_probability,
    name='Event predicted',
    **shared_trace_options,
))
fig.update_layout(title='Probability of event')
fig.show()

In [None]:
# Independent frame with the rows that were fed to the model; later cells add columns to it
result = reserved_data.loc[condition].copy()

In [None]:
def _describe_predicted_window(date):
    """Describe the event class threshold and the date window predicted for ``date``."""
    window_start = date + DATE_TARGET_OFFSET
    window_end = window_start + DATE_TARGET_DURATION
    return f'K >= {STARTING_EVENT_CLASS}; Dates [{window_start.strftime("%d.%m.%Y")}..{window_end.strftime("%d.%m.%Y")}]'


result['predicted'] = result_pred

# A positive logit corresponds to a sigmoid probability above 0.5
predicted_event_dates = result.loc[result['predicted'] > 0, 'to date']
predicted_event_dates.map(_describe_predicted_window)