In [2]:
import pandas as pd
import datetime
import json
import numpy as np
import string
import math
import re

import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from model import TransformerClassification, weights_init

# データのロード

In [3]:
df1 = pd.read_csv('./data/news/2011.csv', encoding='cp932')
# df1 = df1[df1['Company_IDs(TSE)'] == '7203']
df1['News_Source'].value_counts()

日経       144749
ＮＱＮ       77146
発表        30245
ＱＵＩＣＫ      5796
日銀         2256
Ｒ＆Ｉ        1692
財務省         748
Name: News_Source, dtype: int64

In [4]:
df1['Company_Relevance'].value_counts()

100                                                80749
86:86                                               1600
50                                                  1205
49                                                   790
48                                                   701
                                                   ...  
44:40:40:40:40:40                                      1
75:29:29:29:28:28:26:26:26:26:26:26                    1
31:29:29:29                                            1
65:65:27:27:27:27:27:27:27:27:26:26:26:26:26:26        1
78:30:29:29:29:26:26:26:26:26                          1
Name: Company_Relevance, Length: 8801, dtype: int64

In [83]:
for i, date in enumerate(range(2011, 2019)):
    tmp = pd.read_csv('./data/news/' + str(date) + '.csv', encoding='cp932')
    tmp = tmp[tmp['Company_IDs(TSE)'] == '7203']
    tmp = tmp[['Time_Stamp_Original(JST)', 
                        'Company_Code(TSE)', 
                        'Headline', 
                        'News_Source',
                        'Company_Relevance', 
                        'Keyword_Article']]

    # 欠損除去
    tmp = tmp[~tmp["Keyword_Article"].isnull()]

    # タグ除去
    tmp = tmp[(tmp['News_Source'] == '日経') | 
                        (tmp['News_Source'] == 'ＮＱＮ') |
                        (tmp['News_Source'] == 'ＱＵＩＣＫ') | 
                        (tmp['News_Source'] == 'Ｒ＆Ｉ')]

    tmp.index = pd.to_datetime(tmp["Time_Stamp_Original(JST)"])
    tmp = tmp.drop("Time_Stamp_Original(JST)", axis=1)
    
    if i == 0:
        df1 = tmp.copy()
    else:
        df1 = pd.concat([df1, tmp])

# インデックスを設定

In [4]:
def norm_time(x):
    if x.hour > 15:
        return x + datetime.timedelta(days=1)
    return x

time = pd.to_datetime(df1.index.values)
df1.index = df1.index.map(norm_time)
df1.index = df1.index.date

# 株価を挿入する

In [5]:
# 株価を取り出す
df2 = pd.read_csv('./data/stock_price/7203.csv', index_col=0)
df2.index = pd.to_datetime(df2['date'])
df2.index = df2.index.date
df2 = df2.drop(['date'], axis=1)
df2.head(10)

Unnamed: 0,adj_close
2011-01-04,3265.0
2011-01-05,3295.0
2011-01-06,3380.0
2011-01-07,3455.0
2011-01-11,3455.0
2011-01-12,3500.0
2011-01-13,3535.0
2011-01-14,3550.0
2011-01-17,3500.0
2011-01-18,3510.0


# 時系列をくっつける

In [62]:
df3 = pd.concat([df1,df2], axis=1, join_axes=[df1.index])
df3['price'] = np.round(df2.pct_change().shift(-1) * 100, 3)
df3.loc[df3['price'] > 0, 'price'] = 1
df3.loc[df3['price'] < 0, 'price'] = 0
df3['Keyword_Article'] = \
    df3.groupby(level=0).apply(lambda x: ':<pad>:'.join(list(x['Keyword_Article'])))
df3 = df3.dropna()

df3 = df3[~df3.duplicated(subset=['Keyword_Article'])]

  """Entry point for launching an IPython kernel.


In [63]:
df3.head()

Unnamed: 0,Company_Code(TSE),Headline,News_Source,Company_Relevance,Keyword_Article,adj_close,price
2011-01-05,7203.0,<日経>◇12月の中国新車販売、トヨタが単月で過去最高,日経,100,北京:中国:１２月:新車販売台数:前年同月比:増:過去最高:制限:受け:全国:各地:乗用車:...,3295.0,1.0
2011-01-06,7203.0,<NQN>◇トヨタ社長「今年は後半に晴れ間」　為替は１ドル＝90円を期待,ＮＱＮ,100,豊田:見通し:販売:エコカー補助金:安定的:伸び:株価:為替:水準:日経平均株価:最低:ライ...,3380.0,1.0
2011-01-07,7203.0,<日経>◇福岡県、自動車の技術者育成へ新組織　年内、中小向け,日経,37,自動車産業:強化:福岡:先端:設置:方針:技術:調査:ニーズ:カリキュラム:大学:受け:生産...,3455.0,1.0
2011-01-11,7203.0,<日経>◇トヨタ、米ミシガン州に安全研究センター新設,日経,100,先進:安全:子供:高齢者:事故:向上:目指す:米国:大規模:リコール:回収:問題:開催:豊田...,3455.0,0.0
2011-01-12,7203.0,<NQN>◇12日の予定（時間は日本時間）国際収支、米地区連銀経済報告,ＮＱＮ,10,１１月:財務省:１２月:対外:契約:貸出:動向:マネーストック:国庫短期証券:受注:環境装置...,3500.0,1.0


In [64]:
# json_data = {}
# for i, (_, row) in enumerate(df3.iterrows()):
#     json_data[i] = {
#         'date': str(row.name),
#         'Keyword_Article': row['Keyword_Article'].split(':'),
#         'price': row['price']
#     }

# csvファイルに保存する

In [65]:
train_date = 2015
test_date = 2017

In [66]:
date_year = df3.index.map(lambda x: x.year)

In [67]:
df3[date_year <= train_date][['Keyword_Article', 'price']].to_csv(
        './data/news/text_train.tsv',
        header=None,
        index=None,
        sep='\t')

In [68]:
df3[(train_date < date_year) & (date_year > test_date)][['Keyword_Article', 'price']].to_csv(
        './data/news/text_val.tsv',
        header=None,
        index=None,
        sep='\t')

In [69]:
df3[test_date <= date_year][['Keyword_Article', 'price']].to_csv(
        './data/news/text_test.tsv',
        header=None,
        index=None,
        sep='\t')

# Dataの作成

In [70]:
# 前処理
def preprocessing_text(text):
    # カンマ、ピリオド以外の記号をスペースに置換
    for p in string.punctuation:
        if (p == ".") or (p == ",") or (p == ":") or (p == "<")or (p == ">"):
            continue
        else:
            text = text.replace(p, " ")

    # ピリオドなどの前後にはスペースを入れておく
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    text = re.sub(r'[0-9 ０-９]', '0', text)
    
    return text

# 分かち書き（今回はデータが英語で、簡易的にスペースで区切る）
def tokenizer_punctuation(text):
    return text.strip().split(':')

# 前処理と分かち書きをまとめた関数を定義
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_punctuation(text)
    return ret

In [71]:
max_length = 256
batch_size = 32

# 読み込んだ内容に対して行う処理を定義
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, 
                            use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, 
                            init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.float)

In [72]:
train_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_train.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
train_ds = train_ds[0]
# print(vars(train_ds[1]))

val_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_val.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
val_ds = val_ds[0]

test_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_test.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
test_ds = test_ds[0]

japanese_fasttext_vectors = Vectors(name='./data/news/cc.ja.300.vec')
TEXT.build_vocab(train_ds, 
                                 vectors=japanese_fasttext_vectors,
                                 min_freq=10)
TEXT.vocab.freqs

train_dl = torchtext.data.Iterator(
    train_ds, batch_size=batch_size, train=True)
val_dl = torchtext.data.Iterator(
    val_ds, batch_size=batch_size, train=False, sort=False)
test_dl = torchtext.data.Iterator(
    test_ds, batch_size=batch_size, train=False, sort=False)

In [73]:
# # 動作確認
# batch = next(iter(train_dl))
# print(batch.Text[0])
# print(batch.Label)

# モデル構築

In [74]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# モデルの構築
net = TransformerClassification(
    text_embedding_vectors=TEXT.vocab.vectors, 
    d_model=300,
    max_seq_len=256, 
    output_dim=1)

# 訓練モード
net.train()

# パラメータ初期化
net.net3_1.apply(weights_init)
net.net3_2.apply(weights_init)

TransformerBlock(
  (norm_1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (norm_2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (attn): Attention(
    (q_linear): Linear(in_features=300, out_features=300, bias=True)
    (v_linear): Linear(in_features=300, out_features=300, bias=True)
    (k_linear): Linear(in_features=300, out_features=300, bias=True)
    (out): Linear(in_features=300, out_features=300, bias=True)
  )
  (ff): FeedForward(
    (linear_1): Linear(in_features=300, out_features=1024, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear_2): Linear(in_features=1024, out_features=300, bias=True)
  )
  (dropout_1): Dropout(p=0.1, inplace=False)
  (dropout_2): Dropout(p=0.1, inplace=False)
)

# 最適化

In [75]:
# 最適化手法
learning_rate = 2e-5
optimizer = optim.Adam(net.parameters(), lr=learning_rate)


criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [76]:
def accuracy(scores, y):    
    correct = (scores == y)
    acc = correct.sum() / len(correct)
    return acc

def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()
    return acc

In [77]:
num_epochs = 50
dataloaders_dict = {'train': train_dl, 'val':val_dl}


print('----start----')
net.to(device)

torch.backends.cudnn.benchmark = True

for epoch in range(num_epochs):
#         for phase in ['train', 'val']:
    for phase in ['train', 'val']:
        if phase == 'train':
            net.train()
        else:
            net.eval()

        epoch_loss = 0.0
        epoch_corrects = 0

        for batch in (dataloaders_dict[phase]):
            inputs = batch.Text[0].to(device)
            labels = batch.Label.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(phase == 'train'):

                # maskの作成
                input_pad = 1
                input_mask = (inputs != input_pad)

                # Transformerに入力
                preds, _, _ = net(inputs, input_mask)
                preds = preds.view(-1)
#                 loss = torch.mean((preds - labels)**2)
                loss = criterion(preds, labels)

#                     _, preds = torch.max(outputs, 1)

                # 更新
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # 結果の計算
                epoch_loss += loss.item() * inputs.size(0)
                preds[preds > 0] = 1
                preds[preds < 0] =0
                epoch_corrects += binary_accuracy(preds, labels) #torch.sum(preds == labels.data)

        # epochごとのlossと正解率
        epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)

        print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
                                                                     epoch+1,
                                                                     num_epochs,
                                                                     phase,
                                                                     epoch_loss,
                                                                     epoch_acc))

net_trained = net

----start----
Epoch 1/50 | train | Loss: 0.7550 Acc: 0.4980
Epoch 1/50 |  val  | Loss: 0.7642 Acc: 0.4819
Epoch 2/50 | train | Loss: 0.7274 Acc: 0.5314
Epoch 2/50 |  val  | Loss: 0.7560 Acc: 0.4601
Epoch 3/50 | train | Loss: 0.7140 Acc: 0.5442
Epoch 3/50 |  val  | Loss: 0.7553 Acc: 0.4710
Epoch 4/50 | train | Loss: 0.7107 Acc: 0.5481
Epoch 4/50 |  val  | Loss: 0.7487 Acc: 0.4783
Epoch 5/50 | train | Loss: 0.7327 Acc: 0.5324
Epoch 5/50 |  val  | Loss: 0.7395 Acc: 0.4384
Epoch 6/50 | train | Loss: 0.7170 Acc: 0.5491
Epoch 6/50 |  val  | Loss: 0.8130 Acc: 0.5072
Epoch 7/50 | train | Loss: 0.7129 Acc: 0.5540
Epoch 7/50 |  val  | Loss: 0.7552 Acc: 0.4746
Epoch 8/50 | train | Loss: 0.6881 Acc: 0.5580
Epoch 8/50 |  val  | Loss: 0.7490 Acc: 0.4457
Epoch 9/50 | train | Loss: 0.6754 Acc: 0.5825
Epoch 9/50 |  val  | Loss: 0.7555 Acc: 0.4565
Epoch 10/50 | train | Loss: 0.6721 Acc: 0.5845
Epoch 10/50 |  val  | Loss: 0.7768 Acc: 0.4420
Epoch 11/50 | train | Loss: 0.6681 Acc: 0.5894
Epoch 11/50 |  va

In [78]:
binary_accuracy(preds, labels)

tensor(8., device='cuda:0')

# AttentionMap

In [79]:
# HTMLを作成する関数を実装


def highlight(word, attn):
    "Attentionの値が大きいと文字の背景が濃い赤になるhtmlを出力させる関数"

    html_color = '#%02X%02X%02X' % (
        255, int(255*(1 - attn)), int(255*(1 - attn)))
    return '<span style="background-color: {}"> {}</span>'.format(html_color, word)


def mk_html(index, batch, preds, normlized_weights_1, normlized_weights_2, TEXT):
    "HTMLデータを作成する"

    # indexの結果を抽出
    sentence = batch.Text[0][index]  # 文章
    label = batch.Label[index]  # ラベル
    pred = preds[index]  # 予測
    
    pred[pred > 0] = 1
    pred[pred < 0] =0

    # indexのAttentionを抽出と規格化
    attens1 = normlized_weights_1[index, 0, :]  # 0番目の<cls>のAttention
    attens1 /= attens1.max()

    attens2 = normlized_weights_2[index, 0, :]  # 0番目の<cls>のAttention
    attens2 /= attens2.max()

    # ラベルと予測結果を文字に置き換え
    if label == 0:
        label_str = "Negative"
    else:
        label_str = "Positive"

    if pred == 0:
        pred_str = "Negative"
    else:
        pred_str = "Positive"

    # 表示用のHTMLを作成する
    html = '正解ラベル：{}<br>推論ラベル：{}<br><br>'.format(label_str, pred_str)

    # 1段目のAttention
    html += '[TransformerBlockの1段目のAttentionを可視化]<br>'
    for word, attn in zip(sentence, attens1):
        html += highlight(TEXT.vocab.itos[word], attn)
    html += "<br><br>"

    # 2段目のAttention
    html += '[TransformerBlockの2段目のAttentionを可視化]<br>'
    for word, attn in zip(sentence, attens2):
        html += highlight(TEXT.vocab.itos[word], attn)

    html += "<br><br>"

    return html

In [81]:
from IPython.display import HTML

batch = next(iter(test_dl))

inputs = batch.Text[0].to(device)
labels = batch.Label.to(device)

input_pad = 1
input_mask = (inputs != input_pad)

outputs, normilized_weights_1, normilized_weights_2 = net_trained(inputs, input_mask)
_, preds = torch.max(outputs, 1)

index = 0
html_output = mk_html(index, batch, preds, normilized_weights_1, normilized_weights_2, TEXT)
HTML(html_output)  # HTML形式で出力

In [82]:
outputs

tensor([[ -3.3292],
        [ -2.0865],
        [ -0.0925],
        [  4.0126],
        [  0.1893],
        [  2.3993],
        [ -5.9244],
        [ -0.3561],
        [ -0.4535],
        [ -2.1270],
        [ -5.2448],
        [ -2.4705],
        [ -2.6276],
        [ -0.1763],
        [  4.3505],
        [ -4.0077],
        [ -0.7202],
        [ -1.2223],
        [  5.6277],
        [ -4.8390],
        [ -1.5581],
        [  6.4960],
        [  2.5633],
        [ -0.1113],
        [ -2.9691],
        [  0.6919],
        [  2.5171],
        [ -7.9466],
        [-10.5502],
        [ -2.5874],
        [ -3.2385],
        [ -7.1231]], device='cuda:0', grad_fn=<AddmmBackward>)