In [0]:
# BigQuery statement kept for reference; this notebook never executes the string.
# For every day (SQLDATE >= 20130101) and every (Actor1, Actor2) country-code pair
# with both codes present, it computes the mention-weighted average tone:
#   SUM(AvgTone * NumMentions) / SUM(NumMentions)
# NOTE(review): presumably the source of the bq-results CSV loaded further down - confirm.
query = """
SELECT SQLDATE AS Date, 
    Actor1Geo_CountryCode AS Actor1Geo,
    Actor2Geo_CountryCode AS Actor2Geo,
    SUM(AvgTone * NumMentions) / SUM(NumMentions) AS AvgTone
FROM `gdelt-bq.full.events`
WHERE SQLDATE >= 20130101
AND Actor1Geo_CountryCode IS NOT NULL
AND Actor2Geo_CountryCode IS NOT NULL
GROUP BY SQLDATE, Actor1Geo_CountryCode, Actor2Geo_CountryCode
ORDER BY SQLDATE, Actor1Geo_CountryCode, Actor2Geo_CountryCode
"""

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab VM (prompts for an authorization code).
drive.mount('/content/drive')
# Older export, kept for reference:
#file = '/content/drive/My Drive/bq-results-20190719-084806-7mgsp0nfduak/bq-results-20190719-084806-7mgsp0nfduak.csv'
# CSV export that the rest of the notebook reads into `df`.
file = '/content/drive/My Drive/bq-results-20190721-123458-idt59xkni8d5/bq-results-20190721-123458-idt59xkni8d5.csv'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
from itertools import product

In [3]:
# Read the exported table and preview its first five rows.
df = pd.read_csv(file)
df.head()

Unnamed: 0,Date,Actor1Geo,Actor2Geo,AvgTone
0,20130101,AC,AC,1.796449
1,20130101,AC,HK,2.845528
2,20130101,AE,AE,3.267259
3,20130101,AE,AS,4.061859
4,20130101,AE,BA,2.690583


In [4]:
# Countries whose 'Count' is below this value in BOTH count tables get dropped.
treshold = 250000

# One count table per actor role.
actor1_geo_count = pd.read_csv('actor1_geo_count.csv')
actor2_geo_count = pd.read_csv('actor2_geo_count.csv')

# Country codes falling below the threshold in each role.
a1_is_rare = actor1_geo_count['Count'] < treshold
a2_is_rare = actor2_geo_count['Count'] < treshold
a1_countries = set(actor1_geo_count.loc[a1_is_rare, 'Actor1Geo_CountryCode'])
a2_countries = set(actor2_geo_count.loc[a2_is_rare, 'Actor2Geo_CountryCode'])

# Only codes that are rare in both roles are removed.
countries_to_remove = list(a1_countries & a2_countries)
len(countries_to_remove), df.Actor1Geo.nunique(), df.Actor2Geo.nunique()

(113, 262, 262)

In [5]:
# Discard every row in which either actor's country is on the removal list.
print("before", df.shape[0])
touches_removed = df['Actor1Geo'].isin(countries_to_remove) | df['Actor2Geo'].isin(countries_to_remove)
df = df[~touches_removed]
print("after", df.shape[0])

before 8128850
after 7105127


In [0]:
# Shell escape: create the `data` folder that receives the per-year .npy files;
# stderr is discarded, so an "already exists" error stays silent.
!mkdir data 2>/dev/null

In [0]:
# Dates are YYYYMMDD integers: dividing by 10000 and truncating leaves the year.
first_date, last_date = df['Date'].min(), df['Date'].max()
start_year, end_year = int(first_date / 10000), int(last_date / 10000)

In [0]:
# Build one dense (days, Actor1, Actor2) AvgTone tensor per calendar year and save it
# to data/data_<year>.npy. Missing (day, pair) cells are imputed with, in order:
#   1. the pair's mean for that month,
#   2. the pair's mean for that year,
#   3. 0.0 when the pair never occurs in the year.
data_files = []
count_of_day = set()
# (rows, cols) of the daily Actor1 x Actor2 matrix.
shape = (df['Actor1Geo'].nunique(), df['Actor2Geo'].nunique())

# The country axes are identical for every year, so compute them once.
actor1_codes = df['Actor1Geo'].unique()
actor2_codes = df['Actor2Geo'].unique()
grid_keys = ['Date', 'Actor1Geo', 'Actor2Geo']

for year in np.arange(start_year, end_year + 1):
  start_of_year, end_of_year = year * 10000 + 101, year * 10000 + 1231
  year_df = df[(df['Date'] >= start_of_year) & (df['Date'] <= end_of_year)]
  if year_df.empty:
    # Nothing to store for a year without any rows; `dates` has no entries for it either.
    continue

  # Full cartesian grid of (day, actor1, actor2). from_product builds it directly
  # instead of materialising millions of Python tuples first.
  index = pd.MultiIndex.from_product(
      (year_df['Date'].unique(), actor1_codes, actor2_codes),
      names=grid_keys)
  year_df = year_df.groupby(grid_keys).first().reindex(index).reset_index()

  year_df['MonthYear'] = (year_df['Date'] / 100).astype('int32')
  year_df['Year'] = (year_df['MonthYear'] / 100).astype('int32')

  # Yearly mean per pair; pairs never observed in this year fall back to 0.0.
  year_avg_tone_mapping = year_df.groupby(['Year', 'Actor1Geo', 'Actor2Geo'])['AvgTone'].mean()
  year_avg_tone_mapping = year_avg_tone_mapping.fillna(0.0)

  # Monthly mean per pair; empty months fall back to the yearly mean.
  month_avg_tone_mapping = year_df.groupby(['MonthYear', 'Actor1Geo', 'Actor2Geo'])['AvgTone'].mean()
  month_index = month_avg_tone_mapping.index

  month_avg_tone = pd.DataFrame({
                "Year": (pd.Series(month_index.get_level_values('MonthYear')) / 100).astype('int32'),
                "Actor1Geo": month_index.get_level_values('Actor1Geo'),
                "Actor2Geo": month_index.get_level_values('Actor2Geo')}).join(
      year_avg_tone_mapping,
      on=['Year', 'Actor1Geo', 'Actor2Geo'],
      how='left'
  )['AvgTone']

  # Re-label the looked-up values so fillna aligns them with the monthly series.
  month_avg_tone.index = month_index
  month_avg_tone_mapping = month_avg_tone_mapping.fillna(month_avg_tone)

  # Daily gaps fall back to the (already completed) monthly mean.
  day_avg_tone = pd.DataFrame({
                "MonthYear": (year_df['Date'] / 100).astype('int32'),
                "Actor1Geo": year_df['Actor1Geo'],
                "Actor2Geo": year_df['Actor2Geo']}).join(
      month_avg_tone_mapping,
      on=['MonthYear', 'Actor1Geo', 'Actor2Geo'],
      how='left'
  )['AvgTone']

  day_avg_tone.index = year_df.index
  # Assign back explicitly: an in-place fillna on a column selection is not
  # guaranteed to modify year_df (pandas copy-on-write).
  year_df['AvgTone'] = year_df['AvgTone'].fillna(day_avg_tone)

  assert year_df['AvgTone'].isna().sum() == 0

  # Sorted order defines the tensor layout: day, then Actor1Geo, then Actor2Geo.
  year_df = year_df.sort_values(grid_keys).reset_index(drop=True)

  # Every day must hold the same number of cells, in this year and across years.
  day_counts = np.unique(year_df.groupby('Date')['AvgTone'].count())
  assert day_counts.shape[0] == 1
  day_count = day_counts[0]
  count_of_day.add(day_count)
  assert len(count_of_day) == 1

  year_data = year_df['AvgTone'].values.reshape((-1, shape[0], shape[1]))
  file_name = 'data/data_{year}.npy'.format(year=year)
  np.save(file_name, year_data)
  data_files.append(file_name)
  # Free the large per-year objects before the next iteration.
  del year_data, year_df

In [0]:
# Frame listing every (Actor1Geo, Actor2Geo) pair in sorted order; persisted for later cells.
actor1_sorted = sorted(df['Actor1Geo'].unique())
actor2_sorted = sorted(df['Actor2Geo'].unique())
pair_index = pd.MultiIndex.from_product((actor1_sorted, actor2_sorted),
                                        names=['Actor1Geo', 'Actor2Geo'])
day_index = pair_index.to_frame()
day_index.to_pickle('day_index.pickle')

In [0]:
# Sorted list of distinct days. Later cells take `dates[...].index[0]` and use it as a
# row position in `data`, so the index must be 0..n-1 in sorted order: reset it after
# sorting instead of keeping the order-of-appearance labels.
dates = pd.Series(df['Date'].unique()).sort_values().reset_index(drop=True)

dates.to_pickle('dates.pickle')

In [0]:
import pandas as pd
import numpy as np
import os

In [0]:
# Reload the artefacts written by the preprocessing cells, so the modelling part can
# start from a fresh kernel. Unpickling can execute arbitrary code; only load files
# this notebook produced itself.
day_index = pd.read_pickle('day_index.pickle')
dates = pd.read_pickle('dates.pickle')

In [0]:
import glob

# Stack the per-year tensors along the day axis; sorting the paths keeps years in order.
year_paths = sorted(glob.glob("data/data_*.npy"))
data = np.concatenate([np.load(year_path) for year_path in year_paths])

# One slice per known day.
assert data.shape[0] == dates.shape[0]

In [0]:
# Number of consecutive days in one input window.
num_steps = 10
# Number of windows per batch.
batch_size = 10
# YYYYMMDD date at which the train/validation split is made.
train_data_treshold = 20180601
# Shape of a single day's matrix (everything after the day axis of `data`).
input_size = data.shape[1:]

In [0]:
import keras.backend as K
import keras.layers as L
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1_l2

# Experimental model: two parallel ConvLSTM2D branches over the same input window,
# one with a (1, width) kernel and one with a (height, 1) kernel, summed element-wise.
conv_num = input_size[0]

input0 = L.Input(shape=(num_steps, input_size[0], input_size[1]))
# Add a trailing channel axis of size 1 for ConvLSTM2D.
reshape0 = L.Reshape((num_steps, input_size[0], input_size[1], 1))(input0)

# Branch 1: kernel spans a full row.
conv_ltsm1 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(1, input_size[1])
)(reshape0)

# Branch 2: kernel spans a full column.
conv_ltsm2 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(input_size[0], 1)
)(reshape0)
# Drop the singleton spatial axis left by each full-width / full-height kernel.
reshape1 = L.Reshape((input_size[0], conv_num))(conv_ltsm1)
reshape2 = L.Reshape((input_size[1], conv_num))(conv_ltsm2)
# NOTE(review): `transposed` is only referenced by the commented-out Dot variant below;
# the active Add merge does not use it.
transposed = L.Permute((2, 1))(reshape2)
# merge =  L.Dot(axes=(2, 1))([reshape1, transposed])
# Add needs both operands to share a shape, so this relies on input_size[0] == input_size[1].
merge = L.Add()([reshape1, reshape2])

model = Model(inputs=[input0], outputs=merge)

optimizer = Adam(lr=0.001)
model.compile(loss='mean_squared_error', optimizer=optimizer)
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 10, 151, 151) 0                                            
__________________________________________________________________________________________________
reshape_22 (Reshape)            (None, 10, 151, 151, 0           input_8[0][0]                    
__________________________________________________________________________________________________
conv_lst_m2d_15 (ConvLSTM2D)    (None, 151, 1, 151)  13863612    reshape_22[0][0]                 
__________________________________________________________________________________________________
conv_lst_m2d_16 (ConvLSTM2D)    (None, 1, 151, 151)  13863612    reshape_22[0][0]                 
__________________________________________________________________________________________________
reshape_23

In [0]:
import keras.backend as K
import keras.layers as L
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1_l2

# Experimental model: two stacked stages. Each stage runs a row-kernel and a
# column-kernel ConvLSTM2D over its input and combines the two branches with a Dot layer.
conv_num = input_size[0]//4

input0 = L.Input(shape=(num_steps, input_size[0], input_size[1]))
# Add a trailing channel axis of size 1 for ConvLSTM2D.
reshape0 = L.Reshape((num_steps, input_size[0], input_size[1], 1))(input0)

# Stage 1 keeps the time axis (return_sequences=True) so stage 2 can recur over it.
conv_ltsm1_1 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(1, input_size[1]),
    return_sequences=True,
)(reshape0)

conv_ltsm1_2 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(input_size[0], 1),
    return_sequences=True,
)(reshape0)
# -1 lets Keras infer the time axis while the singleton spatial axis is dropped.
reshape1_1 = L.Reshape((-1, input_size[0], conv_num))(conv_ltsm1_1)
reshape1_2 = L.Reshape((-1, input_size[1], conv_num))(conv_ltsm1_2)
# Swap the last two axes so the filter axis can be contracted by the Dot layer.
transposed1 = L.Permute((1, 3, 2))(reshape1_2)
# NOTE(review): intended to give one (height, width) map per time step - confirm
# Dot's output shape for 4-D inputs.
merge1 =  L.Dot(axes=(3, 2))([reshape1_1, transposed1])


# Restore the channel axis for the second ConvLSTM2D stage.
reshape1 = L.Reshape((-1, input_size[0], input_size[1], 1))(merge1)

# Stage 2 returns only the last step (return_sequences is left at its default).
conv_ltsm2_1 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(1, input_size[1])
)(reshape1)

conv_ltsm2_2 = L.ConvLSTM2D(
    filters=conv_num, 
    kernel_size=(input_size[0], 1)
)(reshape1)
reshape2_1 = L.Reshape((input_size[0], conv_num))(conv_ltsm2_1)
reshape2_2 = L.Reshape((input_size[1], conv_num))(conv_ltsm2_2)
transposed2 = L.Permute((2, 1))(reshape2_2)
# Contract the filter axis of both branches.
merge2 =  L.Dot(axes=(2, 1))([reshape2_1, transposed2])

model = Model(inputs=[input0], outputs=merge2)

optimizer = Adam(lr=0.0005)
model.compile(loss='mean_squared_error', optimizer=optimizer)
print(model.summary())

In [37]:
# Display the per-day matrix shape the models are built for.
input_size

(151, 151)

In [35]:
import keras.backend as K
import keras.layers as L
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1_l2

# Model: ConvLSTM2D over the window -> per-step flatten -> LSTM -> Dense, producing a
# vector with `output_shape` entries.
# NOTE(review): `shape` is not used in this cell, yet it overwrites the (rows, cols)
# tuple named `shape` that the preprocessing loop reshapes with; re-running that loop
# after this cell would fail.
shape = input_size[0] * input_size[1]

# Length of the predicted vector.
output_shape = input_size[0]

input0 = L.Input(shape=(num_steps, input_size[0], input_size[1]))
# Earlier variant (plain LSTM on the flattened matrix), kept for reference:
# reshape0 = L.Reshape((num_steps, input_size[0] * input_size[1]))(input0)
#
# ltsm1 = L.LSTM(
#     units=output_shape,
#     kernel_regularizer=l1_l2(l1=0.01, l2=0.1),
#     return_sequences=True,
# )(reshape0)

# Add a trailing channel axis of size 1 for ConvLSTM2D.
reshape0 = L.Reshape((num_steps, input_size[0], input_size[1], 1))(input0)

# Row-wide kernel; the time axis is kept for the following LSTM.
ltsm1 = L.ConvLSTM2D(
    filters=output_shape, 
    kernel_size=(1, input_size[1]),
    return_sequences=True,
)(reshape0)

print(ltsm1.shape)
# Flatten each time step. The target size input_size[0] * input_size[1] only matches the
# ConvLSTM output (input_size[0] * 1 * output_shape values per step) because
# output_shape == input_size[0] and the input matrix is square.
reshape1 = L.Reshape((num_steps, input_size[0] * input_size[1]))(ltsm1)
print(reshape1.shape)

# Collapse the sequence to a single vector (return_sequences is left at its default).
ltsm2 = L.LSTM(
    units=output_shape,   
    kernel_regularizer=l1_l2(l1=0.01, l2=0.1),
)(reshape1)

# Linear read-out (the relu activation is commented out).
dense1 = L.Dense(
    units=output_shape,
#     activation='relu',
    kernel_regularizer=l1_l2(l1=0.01, l2=0.1),
)(ltsm2)


model = Model(inputs=[input0], outputs=dense1)

optimizer = Adam(
    lr=0.001,
    decay=0.001,
    amsgrad=True,
)
model.compile(loss='mean_squared_error', optimizer=optimizer)
print(model.summary())

(?, ?, 151, 1, 151)
(?, 10, 22801)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 10, 151, 151)      0         
_________________________________________________________________
reshape_14 (Reshape)         (None, 10, 151, 151, 1)   0         
_________________________________________________________________
conv_lst_m2d_9 (ConvLSTM2D)  (None, 10, 151, 1, 151)   13863612  
_________________________________________________________________
reshape_15 (Reshape)         (None, 10, 22801)         0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 151)               13863612  
_________________________________________________________________
dense_2 (Dense)              (None, 151)               22952     
Total params: 27,750,176
Trainable params: 27,750,176
Non-trainable params: 0
_____________________________

In [0]:
from keras.utils.vis_utils import plot_model
# Render the current model's layer graph (with shapes and layer names) to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [0]:
from keras.utils import Sequence

class DataSequence(Sequence):
  """Batches of sliding windows over the first axis of `data`.

  Sample `i` has input `data[i : i + num_steps]` and target `data[i + num_steps]`
  (optionally reduced to one row or column), so `data.shape[0] - num_steps` samples exist.

  Args:
    data: array whose first axis is the step (time) axis.
    batch_size: maximum number of samples per batch.
    num_steps: number of consecutive steps in one input window.
    index: if given, only this slice of the target is returned.
    index_axis: 0 selects `target[index]`; any other value selects `target[:, index]`.
  """

  def __init__(self, data, batch_size, num_steps, index=None, index_axis=0):
    self.data = data
    self.batch_size = batch_size
    self.num_steps = num_steps
    self.index = index
    self.index_axis = index_axis

  def _num_samples(self):
    """Number of windows that still have a target step after them (never negative)."""
    return max(self.data.shape[0] - self.num_steps, 0)

  def __len__(self):
    """Number of batches.

    Counts exactly the windows `__getitem__` can build, so the last batch is never
    empty (the previous `+ 1` produced an empty batch whenever the sample count
    was a multiple of `batch_size`).
    """
    return int(np.ceil(self._num_samples() / self.batch_size))

  def __getitem__(self, idx):
    """Return `(batch_x, batch_y)` arrays for batch number `idx`."""
    first = self.batch_size * idx
    last = min(first + self.batch_size, self._num_samples())

    batch_x, batch_y = [], []
    for window_start in range(first, last):
      target_pos = window_start + self.num_steps
      batch_x.append(self.data[window_start:target_pos])
      if self.index is None:
        batch_y.append(self.data[target_pos])
      elif self.index_axis == 0:
        batch_y.append(self.data[target_pos, self.index])
      else:
        batch_y.append(self.data[target_pos, :, self.index])

    return np.array(batch_x), np.array(batch_y)

In [14]:
# Position of country code 'PL' among the unique Actor1Geo codes (argmax of the match mask).
np.argmax(day_index.Actor1Geo.unique() == 'PL')

108

In [0]:
# Index label of the split date; it is used as a row position in `data`.
train_end = dates.index[dates == train_data_treshold][0]

# Training windows end before the split. Validation starts num_steps earlier so that
# its first target is the split day itself. Both predict row 108 of the daily matrix.
train_data = DataSequence(
    data[:train_end], batch_size, num_steps, index=108, index_axis=0)
valid_data = DataSequence(
    data[train_end - num_steps:], batch_size, num_steps, index=108, index_axis=0)

In [0]:
# Folder receiving the model definition and the periodic weight checkpoints.
model_dir_name = 'model1'

# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs(model_dir_name, exist_ok=True)


from keras.callbacks import ModelCheckpoint
# Every 10th epoch, save the model if the monitored metric improved.
checkpointer = ModelCheckpoint(filepath=model_dir_name + '/model-{epoch:02d}.hdf5',
                               save_best_only=True,
                               verbose=1,
                               period=10,
                              )
# Store the (still untrained) model so checkpoints can be loaded into it later.
model.save(model_dir_name + '/model.h5')

In [0]:
# Upper bound on the number of training epochs.
num_epochs = 100

# Train from the window generators; the checkpoint callback writes weights to disk.
model.fit_generator(generator=train_data, 
                    epochs=num_epochs,
                    validation_data=valid_data,
                    callbacks=[checkpointer],
                   )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100

Epoch 00010: val_loss improved from inf to 12.63970, saving model to model1/model-10.hdf5
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
  9/197 [>.............................] - ETA: 4:58 - loss: 10.2111

In [5]:
from keras.models import load_model
# NOTE(review): loads 'model.h5' from the working directory, whereas the training cell
# saves to '<model_dir_name>/model.h5' - confirm this is the intended file.
model = load_model('model.h5')

Using TensorFlow backend.
W0726 11:12:22.869940 139981180176256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0726 11:12:22.889797 139981180176256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0726 11:12:22.907044 139981180176256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0726 11:12:23.393334 139981180176256 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0726 

In [0]:
from keras.models import load_model

# Rebuild the model saved under model4/ and load its epoch-10 checkpoint weights.
# NOTE(review): the training cell above writes to 'model1'; 'model4' presumably comes
# from another run - confirm.
model = load_model('model4/model.h5')
model.load_weights('model4/model-10.hdf5')

In [0]:
def make_data_to_predict(data, dates, date_to_predict=None, steps=None):
  """Build a one-sample input batch holding the window that precedes a target day.

  Args:
    data: array whose first axis is aligned position-by-position with `dates`.
    dates: sequence of day identifiers (e.g. a Series of YYYYMMDD integers).
    date_to_predict: day to predict; `None` means the day after the last one in `data`.
    steps: window length; defaults to the notebook-level `num_steps`.

  Returns:
    Array of shape `(1, steps, *data.shape[1:])`. Both branches carry the leading
    batch axis (the `None` branch previously omitted it).

  Raises:
    IndexError: `date_to_predict` does not occur in `dates`.
    ValueError: fewer than `steps` days precede the target day.
  """
  if steps is None:
    steps = num_steps

  if date_to_predict is None:
    window_end = len(data)
  else:
    # Positional lookup: `data` is indexed by position, not by the Series' labels.
    positions = np.flatnonzero(np.asarray(dates) == date_to_predict)
    if positions.size == 0:
      raise IndexError('date {} not found in dates'.format(date_to_predict))
    window_end = int(positions[0])

  if window_end < steps:
    # A negative slice start would silently wrap around or give an empty window.
    raise ValueError('need {} preceding days, only {} available'.format(steps, window_end))

  return np.array([data[window_end - steps:window_end]])

In [0]:
# Despite its name this holds the index label of 20180601 within `dates`, not the date
# itself; later cells use it both with `dates.loc[...]` and as a row position in `data`.
date_to_predict = dates[dates == 20180601].index[0]
country_to_predict = 'PL'

In [0]:
# Convert the stored index label back to its date value and build the input window for it.
data_to_predict = make_data_to_predict(data, dates, dates.loc[date_to_predict])

In [0]:
# Run the model on the single prepared window.
predicted = model.predict(data_to_predict)

# Label the first (and only) predicted vector with the sorted country codes.
country_codes = sorted(day_index.Actor1Geo.unique())
predicted_df = pd.DataFrame({'country': country_codes, 'values': predicted[0]})

In [0]:
# Observed values for the predicted day: row 108 of that day's matrix.
observed_row = data[date_to_predict, 108]
target_df = pd.DataFrame({
    'country': sorted(day_index.Actor1Geo.unique()),
    'values': observed_row,
})

In [25]:
import plotly.graph_objects as go

# Overlay predicted and observed values as two semi-transparent bar series.
fig = go.Figure()

bar_series = (
    (predicted_df, 'crimson', 'predicted'),
    (target_df, 'lightslategrey', 'target'),
)
for frame, bar_colour, label in bar_series:
  fig.add_trace(go.Bar(x=frame['country'],
                       y=frame['values'],
                       marker=dict(color=bar_colour),
                       opacity=0.5,
                       name=label))

fig.show()