In [17]:
import os
import datetime
import uuid
import json
import datetime
import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras import layers
from tensorflow.keras import losses

import tensorflow as tf
import sklearn.metrics as metrics

#####################################################
# PARAMETERS
#####################################################

# --- file locations ---------------------------------
data_path = 'data/final_twitter.csv'
results_path = 'data'
embeddings_path = 'data/embeddings/glove.6B.100d.txt'

# --- text / embedding settings ----------------------
embedding_dim = 100
text_max_features = 30000  # number of most frequent words to use
text_sequence_length = 30  # number of words to take from a single record

# --- windowing --------------------------------------
# I checked these inputs: there is hardly ever a case where more than 200 texts
# were posted within 6h.
# window_width: number of rates to use. From that window, split_window keeps only
# the rows whose time is earlier than (current time - hours_window).
window_width = 80
txt_window = 15  # number of texts to use
# series_window = 20
hours_window = 6
shuffle_buffer_size = 74726  # to fit whole training set in buffer

# --- training ---------------------------------------
epochs = 5
early_stopping_min_delta = 1e-3
early_stopping_patience = 2
categories = 3

#####################################################

# Read the raw records, discard the stray CSV index column and order the rows
# chronologically (the date strings are compared as text; the index is rebuilt 0..n-1).
df = (
    pd.read_csv(data_path)
    .drop(columns=['Unnamed: 0'])
    .sort_values('date', ignore_index=True)
)

# Two-class mode: reduce the direction label to its lowest bit (odd -> 1.0, even -> 0.0).
if categories == 2:
    df['direction'] = df['direction'].map(lambda x : float(1 & int(x)))

# Swap the single label column for its one-hot columns, appended on the right.
direction_one_hot = pd.get_dummies(df['direction'])
df = pd.concat([df.drop(columns=['direction']), direction_one_hot], axis=1)

# Timestamp of the first (earliest) row; every other row is measured against it.
beg = datetime.datetime.strptime(df['date'].iloc[0], '%Y-%m-%d %H:%M:%S')

def date_to_hours(date: str):
    """Return the number of hours (as a float) between ``beg`` and ``date``.

    ``date`` must be formatted as ``%Y-%m-%d %H:%M:%S``.
    """
    elapsed = datetime.datetime.strptime(date, '%Y-%m-%d %H:%M:%S') - beg
    return elapsed.total_seconds() / 3600

# Replace the textual date with the elapsed-hours column, appended as the last column.
df = df.assign(hours=df['date'].map(date_to_hours)).drop(columns=['date'])

# Chronological (unshuffled) split: the last 15% of rows form the test set, and the
# last 15% of what remains form the validation set.
shuffle = False
train, test = train_test_split(df, test_size=0.15, shuffle=shuffle)
train, validation = train_test_split(train, test_size=0.15, shuffle=shuffle)

print('input shape\ntrain:    {}\nvalidate: {}\ntest:     {}\n'.format(
    train.shape,
    validation.shape,
    test.shape,
))

input shape
train:    (35653, 5)
validate: (6292, 5)
test:     (7403, 5)



In [18]:
# Per-class sample counts used to balance the loss.
#
# Counts for the full ("all") data set, kept for reference only -- the twitter
# values below are the ones in effect:
#   total = 74726, c0 = 14985, c1 = 17482, c2 = 42259

# twitter
total = 31582
c0 = 14985
c1 = 6362
c2 = 17196
# NOTE(review): c0 + c1 + c2 == 38543, which differs from total (31582), and c0 is
# identical to the "all" value above -- possibly a copy-paste leftover; confirm the counts.

# weight_i = (1 / count_i) * total / 3.0
weight_for_0, weight_for_1, weight_for_2 = [
    (1 / count)*(total)/3.0 for count in (c0, c1, c2)
]

class_weight = dict(enumerate([weight_for_0, weight_for_1, weight_for_2]))
class_weight

{0: 0.7025247469691914, 1: 1.6547207377135071, 2: 0.6121966348763278}

In [19]:
class DataGenerator():
  """Turns the train/validation/test frames into windowed, batched ``tf.data`` datasets.

  Every window spans ``input_width + shift`` consecutive rows. ``split_window`` turns a
  batch of windows into:

  * ``series`` - label columns of the first ``input_width`` rows, filtered by time,
  * ``text``   - vectorized texts of the last ``txt_width`` rows, flattened per window,
  * the label columns of the last ``label_width`` rows as the target.

  The text vectorizer is fitted on ``train_df['text']`` when the instance is created.
  """

  def __init__(self, input_width, txt_width, label_width=1, shift=1, text_max_features=1000, text_sequence_length=40,
               train_df=None, val_df=None, test_df=None):
    """Store the frames, derive the window slices and fit the text vectorizer.

    Args:
      input_width: number of leading rows of a window used as the series input.
      txt_width: number of trailing rows of a window whose texts are used.
      label_width: number of trailing rows of a window used as the target.
      shift: number of rows between the end of the series input and the end of the window.
      text_max_features: vocabulary size limit passed to the vectorizer (``max_tokens``).
      text_sequence_length: number of tokens kept per text (``output_sequence_length``).
      train_df, val_df, test_df: frames with a ``'text'`` column; ``train_df`` is required
        because the vectorizer is adapted on it.
    """
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.text_max_features = text_max_features
    self.text_sequence_length =  text_sequence_length

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.txt_width = txt_width
    self.shift = shift

    self.total_window_size = input_width + shift

    # Series input: the first `input_width` rows of the window.
    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Text input: the last `txt_width` rows of the window.
    self.txt_input_start = self.total_window_size - self.txt_width
    self.txt_input_slice = slice(self.txt_input_start, None)
    self.txt_input_indices = np.arange(self.total_window_size)[self.txt_input_slice]

    # Target: the last `label_width` rows of the window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    self._init_vectorizer()

  def __repr__(self):
    """Summarize the window size and the input/label row indices."""
    return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}'])

  def _init_vectorizer(self):
    """Create the text vectorizer and adapt it on the training texts."""
    self.vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens = self.text_max_features,
        output_sequence_length = self.text_sequence_length
    )

    self.vectorizer.adapt(self.train_df['text'].values)


  def split_window(self, features):
    """Split a batch of windows into model inputs and one-hot labels.

    ``features`` is indexed as (batch, row in window, column). The slicing below assumes
    the column layout produced by ``make_dataset``: token ids first, then ``categories``
    one-hot label columns, then the time in hours as the very last column
    (NOTE(review): this depends on the column order of the incoming frame - confirm).

    Reads the module-level ``categories`` and ``hours_window`` settings.

    Returns:
      A tuple ``({'series': ..., 'text': ...}, labels)`` where ``series`` is a ragged
      tensor, ``text`` is an int64 tensor and ``labels`` has ``categories`` columns.
    """
    series_inputs = features[:, self.input_slice, -categories-1:-1]
    series_times = features[:, self.input_slice, -1]

    text_inputs = features[:, self.txt_input_slice, :-categories-1]
    labels = features[:, self.labels_slice, -categories-1:-1]
    current_time = features[:, self.labels_slice, -1]

    # Slicing does not preserve static shape information, so set the known dims manually.
    series_inputs.set_shape([None, self.input_width, None])
    text_inputs.set_shape([None, self.txt_width, None])
    labels.set_shape([None, self.label_width, None])

    series_times.set_shape([None, self.input_width])
    current_time.set_shape([None, self.label_width])

    # Keep only the series rows whose time is earlier than (label-row time - hours_window).
    # The comparison broadcasts current_time against series_times
    # (NOTE(review): presumably relies on label_width == 1 - confirm).
    mask = tf.math.less(series_times, current_time - hours_window)
    # Each window keeps a different number of rows, hence the ragged result.
    series_inputs = tf.ragged.boolean_mask(series_inputs, mask)

    # Concatenate the token ids of the selected texts into one sequence per window.
    text_inputs = tf.reshape(text_inputs, [-1, self.txt_width*text_inputs.shape[2]])
    labels = tf.reshape(labels, [-1, categories])

    return {'series': series_inputs, 'text': tf.cast(text_inputs, tf.int64)}, labels

  def make_dataset(self, data, shuffle):
    """Build a batched dataset of split windows from a frame.

    Args:
      data: frame with a ``'text'`` column; every other column is cast to float32.
      shuffle: whether the windows are shuffled. The original code ignored this flag
        and always shuffled, including the validation and test sets.

    Returns:
      A dataset of ``(inputs, labels)`` batches (batch size 32) as produced by
      ``split_window``.
    """
    # Token ids go first, followed by the remaining (numeric) columns of the frame.
    txt = np.array(self.vectorizer(data['text']).numpy(), dtype=np.float32)
    data = np.array(data.drop('text', axis=1), dtype=np.float32)
    data = np.concatenate([txt, data], axis=1)

    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=shuffle,
        batch_size=32,)

    ds = ds.map(self.split_window)

    return ds

  def train(self):
    """Return the shuffled training dataset."""
    return self.make_dataset(self.train_df, shuffle=True)

  def val(self):
    """Return the validation dataset in its original order."""
    return self.make_dataset(self.val_df, shuffle=False)

  def test(self):
    """Return the test dataset in its original order."""
    return self.make_dataset(self.test_df, shuffle=False)

In [20]:
# dg = DataGenerator(input_width=window_width, txt_width=txt_window, label_width=1, shift=1, train_df=train, val_df=validation, test_df=test,
#                    text_max_features=text_max_features, text_sequence_length=text_sequence_length)
# tr = dg.train().take(1)

In [25]:

def create_embeddings_matrix(vectorizer, embeddings_path, embedding_dim=100):
    """Build a frozen Keras ``Embedding`` layer initialised from pre-trained word vectors.

    Despite its name, this function returns the embedding *layer*, not the raw matrix.

    Args:
        vectorizer: object exposing ``get_vocabulary()``; the position of a word in that
            vocabulary is the row it gets in the embedding matrix.
        embeddings_path: path to a text file with one ``word v1 v2 ... vN`` entry per line.
        embedding_dim: length of every word vector in the file.

    Returns:
        A non-trainable ``layers.Embedding`` with ``mask_zero=True`` whose weights are the
        vectors found in the file; words without a pre-trained vector keep an all-zero row.
    """
    embeddings_index = {}
    # The encoding is set explicitly so that non-ASCII tokens are read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(embeddings_path, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. at the end of the file) carries no vector.
                continue
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))

    # Two spare rows beyond the vocabulary size.
    num_tokens = len(voc) + 2
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

    print("Converted %d words (%d misses)" % (hits, misses))

    return layers.Embedding(
            num_tokens,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
            mask_zero=True
    )

def create_model_lstm(embedding_layer, window_length, num_labels=3):
    """Text-only classifier: embedding -> bidirectional LSTM -> dropout -> softmax.

    Args:
        embedding_layer: layer applied to the ``'text'`` token-id input.
        window_length: not used by this architecture; kept so that all model factories
            share the same signature.
        num_labels: number of output classes.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single input named ``'text'``.
    """
    # The series branch is disabled in this variant; only the text input is consumed.
    tokens = layers.Input(shape=(None,), name='text')
    embedded = embedding_layer(tokens)

    recurrent = layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)
    encoded = layers.Bidirectional(recurrent)(embedded)

    regularized = layers.Dropout(0.25)(encoded)
    probabilities = layers.Dense(num_labels, activation='softmax')(regularized)

    return tf.keras.Model(inputs=[tokens], outputs=[probabilities])


def create_model_gru(embedding_layer, window_length, num_labels=3):
    """Two-branch classifier: stacked GRUs over the text and over the label series.

    Args:
        embedding_layer: layer applied to the ``'text'`` token-id input.
        window_length: not used by this architecture; kept so that all model factories
            share the same signature.
        num_labels: number of output classes, also the feature size of the series input.

    Returns:
        An uncompiled ``tf.keras.Model`` with inputs named ``'text'`` and ``'series'``.
    """
    # --- text branch: embedding followed by two stacked GRUs ---
    tokens = layers.Input(shape=(None,), name='text')
    text_branch = embedding_layer(tokens)
    text_branch = layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(text_branch)
    text_branch = layers.GRU(64, dropout=0.2, recurrent_dropout=0.2)(text_branch)

    # --- series branch: two stacked GRUs over a variable-length sequence of label vectors ---
    history = layers.Input(shape=(None, num_labels), name='series')
    series_branch = layers.GRU(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(history)
    series_branch = layers.GRU(64, dropout=0.2, recurrent_dropout=0.2)(series_branch)
    series_branch = layers.Reshape([-1])(series_branch)

    # --- head: merge both branches, dense layer without activation, dropout, softmax ---
    merged = layers.concatenate([text_branch, series_branch])
    hidden = layers.Dense(64)(merged)
    hidden = layers.Dropout(0.2)(hidden)
    probabilities = layers.Dense(num_labels, activation='softmax')(hidden)

    return tf.keras.Model(inputs=[tokens, history], outputs=[probabilities])


def create_model_lstm_big(embedding_layer, window_length, num_labels=3):
    """Text-only classifier: embedding -> bidirectional LSTM -> dense(32) -> dropout -> softmax.

    Args:
        embedding_layer: layer applied to the ``'text'`` token-id input.
        window_length: not used by this architecture; kept so that all model factories
            share the same signature.
        num_labels: number of output classes.

    Returns:
        An uncompiled ``tf.keras.Model`` with a single input named ``'text'``.
    """
    # The series branch is disabled in this variant; only the text input is consumed.
    tokens = layers.Input(shape=(None,), name='text')
    embedded = embedding_layer(tokens)

    recurrent = layers.LSTM(64, dropout=0.5, recurrent_dropout=0.5)
    encoded = layers.Bidirectional(recurrent)(embedded)
    projected = layers.Dense(32)(encoded)  # no activation: a plain linear projection

    regularized = layers.Dropout(0.25)(projected)
    probabilities = layers.Dense(num_labels, activation='softmax')(regularized)

    return tf.keras.Model(inputs=[tokens], outputs=[probabilities])


def build_model(embeddings_layer, window_width, model_fn):
    """Create a model with ``model_fn``, print its summary and compile it.

    Args:
        embeddings_layer: embedding layer handed to ``model_fn``.
        window_width: window length handed to ``model_fn``.
        model_fn: factory called as ``model_fn(embeddings_layer, window_width, categories)``,
            where ``categories`` is the module-level number of classes.

    Returns:
        The model compiled with the Adam optimizer, categorical cross-entropy loss and
        a categorical-accuracy metric.
    """
    network = model_fn(embeddings_layer, window_width, categories)
    network.summary()

    compile_options = {
        'optimizer': 'adam',
        'loss': 'categorical_crossentropy',
        'metrics': [tf.keras.metrics.CategoricalAccuracy()],
    }
    network.compile(**compile_options)
    return network

def early_stopping(min_delta=1e-3, patience=3):
    """Create an ``EarlyStopping`` callback that watches the validation accuracy.

    Args:
        min_delta: value forwarded as the callback's ``min_delta``.
        patience: value forwarded as the callback's ``patience``.

    Returns:
        A verbose callback monitoring ``val_categorical_accuracy`` with
        ``restore_best_weights=True``.
    """
    stopper_options = dict(
        monitor='val_categorical_accuracy',
        min_delta=min_delta,
        patience=patience,
        verbose=1,
        restore_best_weights=True,
    )
    return tf.keras.callbacks.EarlyStopping(**stopper_options)


class Analyzer:
    """Trains a model, evaluates it on the validation and test sets and writes a report.

    Every call to ``train_model`` creates a fresh directory below ``reports_path`` that
    receives ``report.md``, ``metrics.json``, ``history.csv`` and the saved model.
    """

    def __init__(self, reports_path):
        """Remember the root directory under which report directories are created."""
        self.root_reports_path = reports_path

    def train_model(self, model, model_name, dg, model_parameters=None):
        """Fit ``model`` on ``dg.train()``, evaluate it and persist all artefacts.

        Args:
            model: compiled Keras model.
            model_name: label used in the report and in the report directory name.
            dg: data generator exposing ``train()``, ``val()`` and ``test()`` datasets.
            model_parameters: optional dict of settings to list in the report.

        Reads the module-level ``epochs``, ``class_weight``, ``early_stopping_min_delta``
        and ``early_stopping_patience`` settings.
        """
        callbacks = [early_stopping(early_stopping_min_delta, early_stopping_patience)]

        # The datasets are already batched by the data generator, so no batch_size is
        # passed here (Keras asks not to specify it for dataset inputs).
        history = model.fit(dg.train(), validation_data=dg.val(), epochs=epochs, class_weight=class_weight, callbacks=callbacks)

        self.model = model

        timestamp = datetime.datetime.now()

        self.reports_path = self._report_directory(model_name, timestamp)

        self._compute_metrics_val(dg)
        self._compute_metrics_test(dg)

        self._print_metrics(model_name)

        self._generate_md_report(model_name, timestamp, model_parameters, history)
        self._generate_metrics_json()

        self._save_training_history(history)

        model.save(os.path.join(self.reports_path, 'model'))


    def _save_training_history(self, history):
        """Write the per-epoch training history to ``history.csv``."""
        pd.DataFrame(history.history).to_csv(os.path.join(self.reports_path, 'history.csv'), index_label='epoch')

    def _evaluate(self, ds):
        """Return ``(accuracy, confusion_matrix)`` of ``self.model`` on dataset ``ds``.

        Raises:
            ValueError: if ``ds`` yields no batches.
        """
        y_true = []
        y_pred = []
        for x, y in ds:
            y_pred.append(self.model.predict(x))
            y_true.append(y.numpy())

        if not y_true:
            raise ValueError('cannot compute metrics: the dataset yielded no batches')

        one_hot_true = np.concatenate(y_true)
        yt = np.argmax(one_hot_true, axis=1)
        yp = np.argmax(np.concatenate(y_pred), axis=1)

        # Pin the label set to the width of the one-hot targets so that the matrix keeps
        # its full size even when a class never occurs in this split.
        labels = np.arange(one_hot_true.shape[1])

        return metrics.accuracy_score(yt, yp), metrics.confusion_matrix(yt, yp, labels=labels)

    def _compute_metrics_val(self, dg):
        """Store accuracy and confusion matrix for the validation set."""
        self.accuracy_val, self.conf_matrix_val = self._evaluate(dg.val())

    def _compute_metrics_test(self, dg):
        """Store accuracy and confusion matrix for the test set."""
        self.accuracy_test, self.conf_matrix_test = self._evaluate(dg.test())

    def map_ds_to_array(self, ds):
        """Concatenate all batches of ``ds`` along the first axis into one array."""
        y = []
        for batch in ds:
            y.append(batch)
        return np.concatenate(y, axis=0)

    def _print_metrics(self, model_name):
        """Print the stored validation/test metrics."""
        print('{} metrics:'.format(model_name))
        print('accuracy val: {:.3f}'.format(self.accuracy_val))
        print('accuracy test: {:.3f}'.format(self.accuracy_test))
        print('confusion matrics val:\n', self.conf_matrix_val)
        print('confusion matrics test:\n', self.conf_matrix_test)

    def _generate_metrics_json(self):
        """Dump the stored metrics to ``metrics.json`` in the report directory."""
        m = {
            'accuracy_val': self.accuracy_val,
            'conf_matrix_val': self.conf_matrix_val.tolist(),
            'accuracy_test': self.accuracy_test,
            'conf_matrix_test': self.conf_matrix_test.tolist()
        }

        with open(os.path.join(self.reports_path, 'metrics.json'), 'w') as f:
            json.dump(m, f)

    def _generate_md_report(self, model_name, timestamp, model_parameters, history):
        """Write ``report.md``: title, model summary, parameters, metrics and history."""
        with open(os.path.join(self.reports_path, 'report.md'), 'w') as f:
            f.write('# {}\n*{}*\n'.format(model_name, timestamp.strftime("%Y-%m-%d %H:%M:%S")))
            self._add_model_summary(f)

            if model_parameters is not None:
                self._add_model_parameters(f, model_parameters)

            self._add_metrics(f)
            self._add_confusion_matrix(f, 'val', self.conf_matrix_val)
            self._add_confusion_matrix(f, 'test', self.conf_matrix_test)
            self._add_history(f, history)

    def _add_confusion_matrix(self, file, mat_type, conf_mat):
        """Append a confusion matrix as a markdown table; columns are headed 1..n."""
        text = '## Confusion matrix {}\n'.format(mat_type)

        class_num = conf_mat.shape[0]
        text += ' | '.join([str(x) for x in range(1, class_num+1)]) + '\n'
        text += ' | '.join(['---']*class_num) + '\n'
        for row in conf_mat:
            text += ' | '.join([str(x) for x in row])
            text += '\n'
        file.write(text)

    def _add_metrics(self, file):
        """Append the accuracy values as a markdown table."""
        text = '## Metrics\n'
        text += '| Metric | Value \n --- | ---\n'
        text += ' {} | {:.3f} \n'.format('accuracy val', self.accuracy_val)
        text += ' {} | {:.3f} \n'.format('accuracy test', self.accuracy_test)
        file.write(text)


    def _add_model_parameters(self, file, parameters):
        """Append the given parameter dict as a markdown table."""
        text = '### Model parameters\n'
        text += '| Prameters | Value \n --- | ---\n'
        for k, v in parameters.items():
            text += '{} | {}\n'.format(k,v)
        file.write(text)

    def _add_model_summary(self, file):
        """Append the Keras model summary inside a fenced code block."""
        # The opening fence needs its own line; otherwise the first summary line is
        # parsed as the fence's info string and is not rendered.
        file.write('## Model\n```\n')
        self.model.summary(print_fn=lambda x : file.write(x + '\n'))
        file.write('```\n')

    def _add_history(self, file, history):
        """Append the training history (values rounded to 4 places) as a markdown table."""
        df = pd.DataFrame(history.history)
        header = ' | '.join(df.columns)
        text = '## History\n'
        text += header + '\n'
        text += ' | '.join(len(df.columns)*['---']) + '\n'
        for _, row in df.iterrows():
            text += ' | '.join([str(round(x,4)) for x in row.values])
            text += '\n'
        file.write(text)

    def _report_directory(self, model_name, timestamp):
        """Create (if needed) and return the directory for this run's artefacts.

        The name combines the model name, the timestamp (month-day_hour-minute) and a
        short uuid-derived suffix. Missing parent directories are created as well.
        """
        dir_name = 'results_{}_{}_{}/'.format(model_name, timestamp.strftime("%m-%d_%H-%M"), uuid.uuid1().hex[:7])
        path = os.path.join(self.root_reports_path, dir_name)
        os.makedirs(path, exist_ok=True)
        return path

In [26]:
# Assemble the windowed data pipeline; creating the generator also fits the text
# vectorizer on the training split.
dg = DataGenerator(
    input_width=window_width,
    txt_width=txt_window,
    label_width=1,
    shift=1,
    text_max_features=text_max_features,
    text_sequence_length=text_sequence_length,
    train_df=train,
    val_df=validation,
    test_df=test,
)

print("creating embeddings matrix")
# Despite its name, emb_matrix holds the value returned by create_embeddings_matrix,
# which is handed to the model factory as the embedding layer.
emb_matrix = create_embeddings_matrix(dg.vectorizer, embeddings_path, embedding_dim)

print("building model")
model = build_model(emb_matrix, window_width, create_model_gru)

# Settings recorded in the generated report.
params = dict(
    window_width=window_width,
    embeddings_path=embeddings_path,
    embedding_dim=embedding_dim,
    epochs=epochs,
    early_stopping_min_delta=early_stopping_min_delta,
    early_stopping_patience=early_stopping_patience,
    text_max_features=text_max_features,
    text_sequence_length=text_sequence_length,
)

an = Analyzer(results_path)
print("training and validating model")
an.train_model(model, 'gru', dg, params)

creating embeddings matrix
Converted 15157 words (14843 misses)
building model
Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
series (InputLayer)             [(None, None, 3)]    0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, None, 100)    3000200     text[0][0]                       
__________________________________________________________________________________________________
gru_19 (GRU)                    (None, None, 64)     13248       series[0][0]                     
_____________