In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras import layers
from tensorflow.keras import losses

import tensorflow as tf

In [2]:
def collect_data_from_directory(path):
    """Load every CSV file in ``path`` into a single DataFrame.

    Each file is read with ``pd.read_csv``, its ``title`` and ``url`` columns
    are dropped, and any row containing a missing value is removed.

    Args:
        path: Directory whose entries are all readable as CSV files that
            contain ``title`` and ``url`` columns.

    Returns:
        One DataFrame with the rows of all files and a fresh 0..n-1 index.
        An empty DataFrame is returned when the directory has no entries.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order reproducible from run to run.
    frames = [
        pd.read_csv(os.path.join(path, name)).drop(['title', 'url'], axis=1).dropna()
        for name in sorted(os.listdir(path))
    ]
    # pd.concat() raises on an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(frames, ignore_index=True)

In [13]:
# Load all tweet CSVs and order the rows chronologically with a fresh index.
df = (
    collect_data_from_directory('./data/news_with_directions/twitter/')
    .sort_values('date', ignore_index=True)
)
# Append one indicator column per `direction` value, then drop the columns
# that are no longer needed downstream.
df = pd.concat([df, pd.get_dummies(df['direction'])], axis=1).drop(columns=['direction', 'date'])

In [14]:
# Show the prepared frame: the tweet text plus one indicator column per direction value.
df

Unnamed: 0,text,0.0,1.0,2.0
0,Bitcoin Groups and Law Enforcement Unite to Fo...,0,0,1
1,Prominent #Bitcoin Industry Players Form ‘#Blo...,0,0,1
2,CoinDesk's New BitLicense Report Released Toda...,0,0,1
3,Australia's government is set to review how th...,0,0,1
4,Bitcoin Startup Abra Moves to Launch Mobile Re...,0,0,1
...,...,...,...,...
49343,Drinks in Quarantine - Funding BTC Development...,0,0,1
49344,A popular narrative says that the actions take...,0,0,1
49345,Consumer advocacy group @Public_Citizen is try...,0,0,1
49346,Yield Farming Pool Concept May Solidify Ethere...,0,0,1


In [22]:
# Peek at the full, untruncated text of the first row (column 0 is `text`).
df.values[0,0]

'Bitcoin Groups and Law Enforcement Unite to Form Blockchain Alliance https://t.co/Fl0729Qne2 https://t.co/JBx8Pu2gdA'

In [12]:
# NOTE(review): the saved output of this cell still lists a `date` column and has a
# lower execution count than the cell that drops `date`, so it looks stale —
# re-run the notebook top to bottom to refresh it.
df

Unnamed: 0,date,text,0.0,1.0,2.0
0,2015-10-22 13:33:00,Bitcoin Groups and Law Enforcement Unite to Fo...,0,0,1
1,2015-10-22 14:39:00,Prominent #Bitcoin Industry Players Form ‘#Blo...,0,0,1
2,2015-10-22 14:43:00,CoinDesk's New BitLicense Report Released Toda...,0,0,1
3,2015-10-22 15:01:00,Australia's government is set to review how th...,0,0,1
4,2015-10-22 15:02:00,Bitcoin Startup Abra Moves to Launch Mobile Re...,0,0,1
...,...,...,...,...,...
49343,2020-06-26 21:47:00,Drinks in Quarantine - Funding BTC Development...,0,0,1
49344,2020-06-26 21:56:00,A popular narrative says that the actions take...,0,0,1
49345,2020-06-26 22:17:00,Consumer advocacy group @Public_Citizen is try...,0,0,1
49346,2020-06-26 22:35:00,Yield Farming Pool Concept May Solidify Ethere...,0,0,1


In [24]:
# Hold out the last 20% of rows as the test set; shuffle=False keeps the row order
# (sorted by date earlier) so the split is chronological.
train, test = train_test_split(df, shuffle=False, test_size=0.2)

In [95]:
class DataGenerator():
  """Builds windowed ``tf.data`` datasets of ``({'series', 'text'}, labels)``.

  Each frame must have a ``text`` column; every other column is cast to
  float32 and the last ``num_labels`` of them are treated as the label columns.
  The text is tokenised by a ``TextVectorization`` layer adapted on the
  training frame.

  Args:
    input_width: Number of consecutive rows fed to the model as the series input.
    label_width: Number of rows at the end of each window used as labels.
    shift: Offset between the end of the input and the end of the window.
    text_max_features: Vocabulary size of the text vectorizer.
    text_sequence_length: Token sequence length produced for each text.
    train_df: Training frame (required; the vectorizer is adapted on it).
    val_df: Validation frame (only needed for ``val()``).
    batch_size: Batch size of the produced datasets.
    num_labels: Number of trailing label columns in the frames.

  Raises:
    ValueError: If ``train_df`` is missing or ``num_labels`` is smaller than 1.
  """

  def __init__(self, input_width, label_width=1, shift=1, text_max_features=1000, text_sequence_length=40,
               train_df=None, val_df=None, batch_size=32, num_labels=3):
    # Fail early with a clear message instead of an obscure TypeError on None.
    if train_df is None:
      raise ValueError("train_df is required: the text vectorizer is adapted on its 'text' column")
    if num_labels < 1:
      raise ValueError('num_labels must be at least 1')

    self.train_df = train_df
    self.val_df = val_df

    self.text_max_features = text_max_features
    self.text_sequence_length = text_sequence_length

    self.batch_size = batch_size
    self.num_labels = num_labels

    # Work out the window parameters.
    self.input_width = input_width
    self.label_width = label_width
    self.shift = shift

    self.total_window_size = input_width + shift

    self.input_slice = slice(0, input_width)
    self.input_indices = np.arange(self.total_window_size)[self.input_slice]

    # Labels are the last `label_width` rows of each window.
    self.label_start = self.total_window_size - self.label_width
    self.labels_slice = slice(self.label_start, None)
    self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    self._init_vectorizer()

  def __repr__(self):
    return '\n'.join([
        f'Total window size: {self.total_window_size}',
        f'Input indices: {self.input_indices}',
        f'Label indices: {self.label_indices}'])

  def _init_vectorizer(self):
    """Create the text vectorizer and fit its vocabulary on the training texts."""
    self.vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens = self.text_max_features,
        output_sequence_length = self.text_sequence_length
    )

    self.vectorizer.adapt(self.train_df['text'].values)

  def split_window(self, features):
    """Split a batch of windows into model inputs and labels.

    Args:
      features: Batch of windows shaped (batch, window, columns), where the
        trailing ``num_labels`` columns are labels and the rest are token ids.

    Returns:
      A tuple ``({'series': ..., 'text': ...}, labels)``.
    """
    # Past label rows are the series input; the token ids and the labels are
    # both taken from the label rows at the end of the window.
    series_inputs = features[:, self.input_slice, -self.num_labels:]
    text_inputs = features[:, self.labels_slice, :-self.num_labels]
    labels = features[:, self.labels_slice, -self.num_labels:]

    # Slicing loses static shape information; restore the known time dimensions.
    series_inputs.set_shape([None, self.input_width, None])
    text_inputs.set_shape([None, self.label_width, None])
    labels.set_shape([None, self.label_width, None])

    # (batch, label_width, tokens) -> (batch, tokens, label_width)
    text_inputs = tf.transpose(text_inputs, [0,2,1])

    return {'series': series_inputs, 'text': text_inputs}, labels

  def make_dataset(self, data):
    """Vectorize ``data`` and return a batched dataset of split windows."""
    # Token ids are stored as float32 so they can share one array with the labels.
    txt = np.array(self.vectorizer(data['text']).numpy(), dtype=np.float32)
    data = np.array(data.drop('text', axis=1), dtype=np.float32)
    data = np.concatenate([txt, data], axis=1)

    ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=self.total_window_size,
        sequence_stride=1,
        shuffle=False,
        batch_size=self.batch_size,)

    ds = ds.map(self.split_window)

    return ds

  def train(self):
    """Return the windowed dataset built from the training frame."""
    return self.make_dataset(self.train_df)

  def val(self):
    """Return the windowed dataset built from the validation frame.

    Raises:
      ValueError: If no ``val_df`` was given to the constructor.
    """
    if self.val_df is None:
      raise ValueError('val_df was not provided')
    return self.make_dataset(self.val_df)

In [116]:
# Windows of 100 past label rows, each paired with the single row that follows them;
# the held-out test frame serves as the validation set.
dg = DataGenerator(input_width=100, label_width=1, shift=1, train_df=train, val_df=test)

In [117]:
# Build the training dataset; its repr shows the element shapes and dtypes.
dg.train()

<MapDataset shapes: ({series: (None, 100, 3), text: (None, 40, 1)}, (None, 1, 3)), types: ({series: tf.float32, text: tf.float32}, tf.float32)>

In [118]:
# Location of the pre-trained word vectors and the length of each vector.
# NOTE(review): the file name suggests 100-d GloVe vectors — keep `embedding_dim` in sync with the file.
embeddings_path = 'data/embeddings/glove.6B.100d.txt'
embedding_dim = 100

def create_embeddings_matrix(vectorizer, embeddings_path, embedding_dim=100):
    """Build a frozen Keras ``Embedding`` layer from pre-trained word vectors.

    Note that, despite its name, this returns a layer and not a raw matrix.

    Args:
        vectorizer: Object exposing ``get_vocabulary()``; a word's position in
            that list is used as its row in the embedding matrix.
        embeddings_path: Text file with one ``word v1 v2 ...`` entry per line.
        embedding_dim: Length of each vector in the file.

    Returns:
        A non-trainable ``layers.Embedding`` with ``len(vocabulary) + 2`` rows,
        initialised from the file. Words missing from the file keep all-zero rows.
    """
    embeddings_index = {}
    # Decode explicitly as UTF-8: the platform default encoding may not be able
    # to read non-ASCII tokens.
    with open(embeddings_path, encoding="utf8") as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline) instead of failing
            # on the tuple unpack below.
            if not line.strip():
                continue
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

    voc = vectorizer.get_vocabulary()
    word_index = {word: i for i, word in enumerate(voc)}

    num_tokens = len(voc) + 2
    hits = 0
    misses = 0

    # Prepare embedding matrix. Rows that are never assigned stay all-zeros,
    # which covers every vocabulary entry absent from the embeddings file.
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

    print("Converted %d words (%d misses)" % (hits, misses))

    return layers.Embedding(
            num_tokens,
            embedding_dim,
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            trainable=False,
    )

In [119]:
def crate_test_and_series_model(embedding_layer, window_length, num_labels=3):
    """Build a two-branch model combining tweet text with the past label series.

    The function name (including its spelling) is kept so existing callers
    continue to work.

    Args:
        embedding_layer: Layer applied to the ``text`` input (token ids).
        window_length: Number of time steps in the ``series`` input.
        num_labels: Number of classes; also the feature width of ``series``.

    Returns:
        A ``tf.keras.Model`` with inputs named ``text`` and ``series`` and a
        softmax output reshaped to ``(batch, 1, num_labels)``.
    """
    # Text branch: token ids -> embeddings -> bidirectional LSTM summary.
    # NOTE(review): the input is declared as (batch, tokens); confirm it lines
    # up with the rank of the `text` tensors the dataset actually yields.
    text_input = layers.Input(shape=(None,), name='text')
    txt = embedding_layer(text_input)
    txt = layers.Bidirectional(layers.LSTM(64, recurrent_dropout=0.5, dropout=0.5))(txt)

    # Series branch: two stacked LSTMs over the window of past labels.
    series_input = layers.Input(shape=(window_length, num_labels), name='series')
    series = layers.LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)(series_input)
    series = layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)(series)
    series = layers.Reshape([-1])(series)

    x = layers.concatenate([txt, series])

    # Actually apply the dropout: previously the layer was constructed as a
    # bare expression and never connected to the graph.
    x = layers.Dropout(0.25)(x)
    out = layers.Dense(num_labels, activation='softmax')(x)
    # Add a length-1 time axis to the class probabilities.
    out = layers.Reshape([1, -1])(out)

    return tf.keras.Model(inputs=[text_input, series_input], outputs=[out])

In [120]:
# NOTE: despite the names, this returns a frozen Keras Embedding layer, not a raw matrix.
emb_matrix = create_embeddings_matrix(dg.vectorizer, embeddings_path, embedding_dim)

Converted 935 words (65 misses)


In [121]:
# The window length (100) has to match the DataGenerator's input_width; 3 is the
# number of direction classes.
model = crate_test_and_series_model(emb_matrix, 100, 3)
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
series (InputLayer)             [(None, 100, 3)]     0                                            
__________________________________________________________________________________________________
text (InputLayer)               [(None, None)]       0                                            
__________________________________________________________________________________________________
lstm_19 (LSTM)                  (None, 100, 64)      17408       series[0][0]                     
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    100200      text[0][0]                       
____________________________________________________________________________________________

In [115]:
# Softmax output against one-hot targets -> categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[tf.keras.metrics.CategoricalAccuracy()])
# The datasets are already batched by DataGenerator.make_dataset, so `batch_size`
# must not be passed to fit() for tf.data inputs.
model.fit(dg.train(), validation_data=dg.val(), epochs=2)

Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 