In [1]:
import pandas as pd
import datetime
import json
import numpy as np
import string
import math
import re

import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from model import TransformerClassification, weights_init, CNN

# データのロード

In [2]:
df1 = pd.read_csv('./data/news/2011.csv', encoding='cp932')
# df1 = df1[df1['Company_IDs(TSE)'] == '7203']
df1['News_Source'].value_counts()

日経       144749
ＮＱＮ       77146
発表        30245
ＱＵＩＣＫ      5796
日銀         2256
Ｒ＆Ｉ        1692
財務省         748
Name: News_Source, dtype: int64

In [3]:
df1['Company_Relevance'].value_counts()

100                                                            80749
86:86                                                           1600
50                                                              1205
49                                                               790
48                                                               701
                                                               ...  
29:29:28:28:28:28:28:28:27:27:27:27:26:26:26:26:26:25              1
28:28:27:27:27:27:25:25:25:25:25:25:25:25:25:25:25:25:25:25        1
85:38:38:35:33                                                     1
31:31:31:29:27:26:26                                               1
37:37:37:37:35:32:27                                               1
Name: Company_Relevance, Length: 8801, dtype: int64

In [4]:
for i, date in enumerate(range(2011, 2019)):
    tmp = pd.read_csv('./data/news/' + str(date) + '.csv', encoding='cp932')
    tmp = tmp[tmp['Company_IDs(TSE)'] == '7203']
    tmp = tmp[['Time_Stamp_Original(JST)', 
                        'Company_Code(TSE)', 
                        'Headline', 
                        'News_Source',
                        'Company_Relevance', 
                        'Keyword_Article']]

    # 欠損除去
    tmp = tmp[~tmp["Keyword_Article"].isnull()]

    # タグ除去
    tmp = tmp[(tmp['News_Source'] == '日経') | 
                        (tmp['News_Source'] == 'ＮＱＮ') |
                        (tmp['News_Source'] == 'ＱＵＩＣＫ') | 
                        (tmp['News_Source'] == 'Ｒ＆Ｉ')]

    tmp.index = pd.to_datetime(tmp["Time_Stamp_Original(JST)"])
    tmp = tmp.drop("Time_Stamp_Original(JST)", axis=1)
    
    if i == 0:
        df1 = tmp.copy()
    else:
        df1 = pd.concat([df1, tmp])

# インデックスを設定

In [5]:
def norm_time(x):
    if x.hour > 15:
        return x + datetime.timedelta(days=1)
    return x

time = pd.to_datetime(df1.index.values)
df1.index = df1.index.map(norm_time)
df1.index = df1.index.date

# 株価を挿入する

In [6]:
# 株価を取り出す
df2 = pd.read_csv('./data/stock_price/7203.csv', index_col=0)
df2.index = pd.to_datetime(df2['date'])
df2.index = df2.index.date
df2 = df2.drop(['date'], axis=1)
df2.head(10)

Unnamed: 0,adj_close
2011-01-04,3265.0
2011-01-05,3295.0
2011-01-06,3380.0
2011-01-07,3455.0
2011-01-11,3455.0
2011-01-12,3500.0
2011-01-13,3535.0
2011-01-14,3550.0
2011-01-17,3500.0
2011-01-18,3510.0


# 時系列をくっつける

In [7]:
df3 = pd.concat([df1,df2], axis=1, join_axes=[df1.index])
df3['price'] = np.round(df2.pct_change().shift(-1) * 100, 3)
df3.loc[df3['price'] > 0, 'price'] = 1
df3.loc[df3['price'] < 0, 'price'] = 0
df3['Keyword_Article'] = \
    df3.groupby(level=0).apply(lambda x: ':<pad>:'.join(list(x['Keyword_Article'])))
df3 = df3.dropna()

df3 = df3[~df3.duplicated(subset=['Keyword_Article'])]

  """Entry point for launching an IPython kernel.


In [8]:
df3.head()

Unnamed: 0,Company_Code(TSE),Headline,News_Source,Company_Relevance,Keyword_Article,adj_close,price
2011-01-04,7203.0,<日経>◇次世代車の研究開発　名大に国内最大拠点,日経,38,安全:環境:負荷:開発:目指す:開所式:研究拠点:効率:簡素化:次世代:電気自動車:電気:幅...,3265.0,1.0
2011-01-05,7203.0,<日経>◇12月の中国新車販売、トヨタが単月で過去最高,日経,100,北京:中国:１２月:新車販売台数:前年同月比:増:過去最高:制限:受け:全国:各地:乗用車:...,3295.0,1.0
2011-01-06,7203.0,<NQN>◇トヨタ社長「今年は後半に晴れ間」　為替は１ドル＝90円を期待,ＮＱＮ,100,豊田:見通し:販売:エコカー補助金:安定的:伸び:株価:為替:水準:日経平均株価:最低:ライ...,3380.0,1.0
2011-01-07,7203.0,<日経>◇福岡県、自動車の技術者育成へ新組織　年内、中小向け,日経,37,自動車産業:強化:福岡:先端:設置:方針:技術:調査:ニーズ:カリキュラム:大学:受け:生産...,3455.0,0.0
2011-01-11,7203.0,<日経>◇トヨタ、米ミシガン州に安全研究センター新設,日経,100,先進:安全:子供:高齢者:事故:向上:目指す:米国:大規模:リコール:回収:問題:開催:豊田...,3455.0,1.0


# csvファイルに保存する

In [9]:
train_date = 2015
test_date = 2017

In [10]:
date_year = df3.index.map(lambda x: x.year)

In [11]:
df3[date_year <= train_date][['Keyword_Article', 'price']].to_csv(
        './data/news/text_train.tsv',
        header=None,
        index=None,
        sep='\t')

In [12]:
df3[(train_date < date_year) & (date_year > test_date)][['Keyword_Article', 'price']].to_csv(
        './data/news/text_val.tsv',
        header=None,
        index=None,
        sep='\t')

In [13]:
df3[test_date <= date_year][['Keyword_Article', 'price']].to_csv(
        './data/news/text_test.tsv',
        header=None,
        index=None,
        sep='\t')

# Dataの作成

In [14]:
# 前処理
def preprocessing_text(text):
    # カンマ、ピリオド以外の記号をスペースに置換
    for p in string.punctuation:
        if (p == ".") or (p == ",") or (p == ":") or (p == "<")or (p == ">"):
            continue
        else:
            text = text.replace(p, " ")

    # ピリオドなどの前後にはスペースを入れておく
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    text = re.sub(r'[0-9 ０-９]', '0', text)
    
    return text

# 分かち書き（今回はデータが英語で、簡易的にスペースで区切る）
def tokenizer_punctuation(text):
    return text.strip().split(':')

# 前処理と分かち書きをまとめた関数を定義
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_punctuation(text)
    return ret

In [15]:
max_length = 1000
batch_size = 32

# 読み込んだ内容に対して行う処理を定義
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, 
                            use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, 
                            init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.float)

In [44]:
train_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_train.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
train_ds = train_ds[0]
# print(vars(train_ds[1]))

val_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_val.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
val_ds = val_ds[0]

test_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_test.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
test_ds = test_ds[0]

japanese_fasttext_vectors = Vectors(name='./data/news/cc.ja.300.vec')
TEXT.build_vocab(train_ds, 
                                 vectors=japanese_fasttext_vectors,
                                 min_freq=10)
TEXT.vocab.freqs

train_dl = torchtext.data.Iterator(
    train_ds, batch_size=batch_size, train=True)
val_dl = torchtext.data.Iterator(
    val_ds, batch_size=batch_size, train=False, sort=False)
test_dl = torchtext.data.Iterator(
    test_ds, batch_size=len(vars(test_ds)['examples']), train=False, sort=False)

403

In [17]:
# # 動作確認
# batch = next(iter(train_dl))
# print(batch.Text[0])
# print(batch.Label)

# モデル構築

In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [19]:


# # モデルの構築
# net = TransformerClassification(
#     text_embedding_vectors=TEXT.vocab.vectors, 
#     d_model=300,
#     max_seq_len=256, 
#     output_dim=1)

# # 訓練モード
# net.train()

# # パラメータ初期化
# net.net3_1.apply(weights_init)
# net.net3_2.apply(weights_init)

In [20]:
INPUT_DIM = len(TEXT.vocab.freqs)
EMBEDDING_DIM = 300
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.1
PAD_IDX = 1

net = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
net.convs.apply(weights_init)
net.fc.apply(weights_init)

Linear(in_features=300, out_features=1, bias=True)

# 最適化

In [21]:
# 最適化手法
learning_rate = 2e-5
optimizer = optim.Adam(net.parameters(), lr=learning_rate)


criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [22]:
def accuracy(scores, y):    
    correct = (scores == y)
    acc = correct.sum() / len(correct)
    return acc

def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum()
    return acc

In [23]:
num_epochs = 100
dataloaders_dict = {'train': train_dl, 'val':val_dl}


print('----start----')
net.to(device)

torch.backends.cudnn.benchmark = True

for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        if phase == 'train':
            net.train()
        else:
            net.eval()

        epoch_loss = 0.0
        epoch_corrects = 0

        for batch in (dataloaders_dict[phase]):
            inputs = batch.Text[0].to(device)
            labels = batch.Label.to(device)

            optimizer.zero_grad()

            with torch.set_grad_enabled(phase == 'train'):

                # Transformerに入力
                preds = net(inputs)
                preds = preds.view(-1)
#                 loss = torch.mean((preds - labels)**2)
                loss = criterion(preds, labels)

#                     _, preds = torch.max(outputs, 1)

                # 更新
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # 結果の計算
                epoch_loss += loss.item() * inputs.size(0)
                preds[preds > 0.3] = 1
                preds[preds < 0.3] =0
                epoch_corrects += binary_accuracy(preds, labels) #torch.sum(preds == labels.data)

        # epochごとのlossと正解率
        epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)

        print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
                                                                     epoch+1,
                                                                     num_epochs,
                                                                     phase,
                                                                     epoch_loss,
                                                                     epoch_acc))

net_trained = net

----start----
Epoch 1/100 | train | Loss: 0.9380 Acc: 0.5074
Epoch 1/100 |  val  | Loss: 0.7400 Acc: 0.5330
Epoch 2/100 | train | Loss: 0.7651 Acc: 0.5132
Epoch 2/100 |  val  | Loss: 0.7104 Acc: 0.5279
Epoch 3/100 | train | Loss: 0.7002 Acc: 0.5633
Epoch 3/100 |  val  | Loss: 0.7078 Acc: 0.5431
Epoch 4/100 | train | Loss: 0.6820 Acc: 0.5898
Epoch 4/100 |  val  | Loss: 0.7065 Acc: 0.5482
Epoch 5/100 | train | Loss: 0.6566 Acc: 0.5918
Epoch 5/100 |  val  | Loss: 0.7044 Acc: 0.5431
Epoch 6/100 | train | Loss: 0.6388 Acc: 0.6330
Epoch 6/100 |  val  | Loss: 0.7029 Acc: 0.5482
Epoch 7/100 | train | Loss: 0.5944 Acc: 0.6614
Epoch 7/100 |  val  | Loss: 0.7006 Acc: 0.5482
Epoch 8/100 | train | Loss: 0.5988 Acc: 0.6575
Epoch 8/100 |  val  | Loss: 0.7021 Acc: 0.5482
Epoch 9/100 | train | Loss: 0.5475 Acc: 0.7242
Epoch 9/100 |  val  | Loss: 0.6985 Acc: 0.5431
Epoch 10/100 | train | Loss: 0.5270 Acc: 0.7448
Epoch 10/100 |  val  | Loss: 0.6983 Acc: 0.5533
Epoch 11/100 | train | Loss: 0.4990 Acc: 0.7

Epoch 87/100 | train | Loss: 0.0881 Acc: 0.9961
Epoch 87/100 |  val  | Loss: 0.6929 Acc: 0.5381
Epoch 88/100 | train | Loss: 0.0852 Acc: 0.9971
Epoch 88/100 |  val  | Loss: 0.6928 Acc: 0.5431
Epoch 89/100 | train | Loss: 0.0806 Acc: 0.9980
Epoch 89/100 |  val  | Loss: 0.6928 Acc: 0.5482
Epoch 90/100 | train | Loss: 0.0827 Acc: 0.9980
Epoch 90/100 |  val  | Loss: 0.6931 Acc: 0.5381
Epoch 91/100 | train | Loss: 0.0808 Acc: 0.9971
Epoch 91/100 |  val  | Loss: 0.6933 Acc: 0.5482
Epoch 92/100 | train | Loss: 0.0809 Acc: 0.9980
Epoch 92/100 |  val  | Loss: 0.6938 Acc: 0.5381
Epoch 93/100 | train | Loss: 0.0767 Acc: 0.9980
Epoch 93/100 |  val  | Loss: 0.6940 Acc: 0.5482
Epoch 94/100 | train | Loss: 0.0763 Acc: 0.9961
Epoch 94/100 |  val  | Loss: 0.6941 Acc: 0.5431
Epoch 95/100 | train | Loss: 0.0795 Acc: 0.9971
Epoch 95/100 |  val  | Loss: 0.6938 Acc: 0.5482
Epoch 96/100 | train | Loss: 0.0787 Acc: 0.9971
Epoch 96/100 |  val  | Loss: 0.6947 Acc: 0.5381
Epoch 97/100 | train | Loss: 0.0755 Acc:

In [24]:
binary_accuracy(preds, labels)

tensor(4., device='cuda:0')

# AttentionMap

In [71]:
from IPython.display import HTML

batch = next(iter(test_dl))

inputs = batch.Text[0].to(device)
labels = batch.Label.to(device)

preds = net_trained(inputs)
preds = preds.view(-1)

In [72]:
preds[preds > 0.3] = 1
preds[preds < 0.3] =0

In [73]:
correct = (labels == preds).detach().cpu().sum().numpy().item()
correct / len(labels)

0.5062034739454094

In [74]:
correct

204