In [1]:
import pandas as pd
import datetime
import json
import numpy as np
import string
import math
import re

import torchtext
from torchtext.vocab import Vectors
from torchtext import data, datasets

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from itertools import chain

# データのロード

In [2]:
df1 = pd.read_csv('./data/news/2011.csv', encoding='cp932')
# df1 = df1[df1['Company_IDs(TSE)'] == '7203']
df1['News_Source'].value_counts()

日経       144749
ＮＱＮ       77146
発表        30245
ＱＵＩＣＫ      5796
日銀         2256
Ｒ＆Ｉ        1692
財務省         748
Name: News_Source, dtype: int64

In [3]:
df1['Company_Relevance'].value_counts()

100                                                   80749
86:86                                                  1600
50                                                     1205
49                                                      790
48                                                      701
                                                      ...  
87:37:35:28                                               1
34:34:27:27:27:27:27:26:26:25:25:25:25                    1
85:79:30                                                  1
34:34:34:32:32:32:30:29                                   1
28:28:28:28:28:26:26:26:26:26:25:25:25:25:25:25:25        1
Name: Company_Relevance, Length: 8801, dtype: int64

In [4]:
for i, date in enumerate(range(2011, 2019)):
    tmp = pd.read_csv('./data/news/' + str(date) + '.csv', encoding='cp932')
    tmp = tmp[tmp['Company_IDs(TSE)'] == '7203']
    tmp = tmp[['Time_Stamp_Original(JST)', 
                        'Company_Code(TSE)', 
                        'Headline', 
                        'News_Source',
                        'Company_Relevance', 
                        'Keyword_Article']]

    # 欠損除去
    tmp = tmp[~tmp["Keyword_Article"].isnull()]

    # タグ除去
    tmp = tmp[(tmp['News_Source'] == '日経') | 
                        (tmp['News_Source'] == 'ＮＱＮ') |
                        (tmp['News_Source'] == 'ＱＵＩＣＫ') | 
                        (tmp['News_Source'] == 'Ｒ＆Ｉ')]

    tmp.index = pd.to_datetime(tmp["Time_Stamp_Original(JST)"])
    tmp = tmp.drop("Time_Stamp_Original(JST)", axis=1)
    
    if i == 0:
        df1 = tmp.copy()
    else:
        df1 = pd.concat([df1, tmp])

# インデックスを設定

In [5]:
def norm_time(x):
    if x.hour > 15:
        return x + datetime.timedelta(days=1)
    return x

time = pd.to_datetime(df1.index.values)
df1.index = df1.index.map(norm_time)
df1.index = df1.index.date

# 株価を挿入する

In [6]:
# 株価を取り出す
df2 = pd.read_csv('./data/stock_price/7203.csv', index_col=0)
df2.index = pd.to_datetime(df2['date'])
df2.index = df2.index.date
df2 = df2.drop(['date'], axis=1)
df2.head(10)

Unnamed: 0,adj_close
2011-01-04,3265.0
2011-01-05,3295.0
2011-01-06,3380.0
2011-01-07,3455.0
2011-01-11,3455.0
2011-01-12,3500.0
2011-01-13,3535.0
2011-01-14,3550.0
2011-01-17,3500.0
2011-01-18,3510.0


# 時系列をくっつける

In [7]:
df3 = pd.concat([df1,df2], axis=1, join_axes=[df1.index])
df3['price'] = np.round(df2.pct_change().shift(-1) * 100, 3)
df3.loc[df3['price'] > 0, 'price'] = int(1)
df3.loc[df3['price'] <= 0, 'price'] = int(0)
df3['Keyword_Article'] = \
    df3.groupby(level=0).apply(lambda x: ':<pad>:'.join(list(x['Keyword_Article'])))
df3 = df3.dropna()
df3['price'] = df3['price'].astype(np.int)

df3 = df3[~df3.duplicated(subset=['Keyword_Article'])]

  """Entry point for launching an IPython kernel.


In [8]:
df3['price']

2011-01-04    1
2011-01-05    1
2011-01-06    1
2011-01-07    0
2011-01-11    1
             ..
2018-12-21    0
2018-12-25    1
2018-12-26    1
2018-12-27    0
2018-12-28    0
Name: price, Length: 1629, dtype: int64

# csvファイルに保存する

In [9]:
train_date = 2015
test_date = 2017

In [10]:
date_year = df3.index.map(lambda x: x.year)

In [11]:
df3[date_year <= train_date][['Keyword_Article', 'price']].to_csv(
        './data/news/text_train.tsv',
        header=None,
        index=None,
        sep='\t')

In [12]:
df3[(train_date < date_year) & (date_year < test_date)][['Keyword_Article', 'price']].to_csv(
        './data/news/text_val.tsv',
        header=None,
        index=None,
        sep='\t')

In [13]:
df3[test_date <= date_year][['Keyword_Article', 'price']].to_csv(
        './data/news/text_test.tsv',
        header=None,
        index=None,
        sep='\t')

# Dataの作成

In [14]:
# 前処理
def preprocessing_text(text):
    # カンマ、ピリオド以外の記号をスペースに置換
    for p in string.punctuation:
        if (p == ".") or (p == ",") or (p == ":") or (p == "<")or (p == ">"):
            continue
        else:
            text = text.replace(p, " ")

    # ピリオドなどの前後にはスペースを入れておく
    text = text.replace(".", " . ")
    text = text.replace(",", " , ")
    text = re.sub(r'[0-9 ０-９]', '0', text)
    
    return text

# 分かち書き（今回はデータが英語で、簡易的にスペースで区切る）
def tokenizer_punctuation(text):
    return text.strip().split(':')

# 前処理と分かち書きをまとめた関数を定義
def tokenizer_with_preprocessing(text):
    text = preprocessing_text(text)
    ret = tokenizer_punctuation(text)
    return ret

In [15]:
max_length = 256
batch_size = 64

# 読み込んだ内容に対して行う処理を定義
TEXT = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, 
                            use_vocab=True,
                            lower=True, include_lengths=True, batch_first=True, fix_length=max_length, 
                            init_token="<cls>", eos_token="<eos>")
LABEL = torchtext.data.Field(sequential=False, use_vocab=False, dtype=torch.long)

In [16]:
train_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_train.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
train_ds = train_ds[0]
# print(vars(train_ds[1]))

val_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_val.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
val_ds = val_ds[0]

test_ds = torchtext.data.TabularDataset.splits(
    path='./data/news', train='text_test.tsv',
    format='tsv',
    fields=[('Text', TEXT), ('Label', LABEL)])
test_ds = test_ds[0]

japanese_fasttext_vectors = Vectors(name='./data/news/cc.ja.300.vec')
TEXT.build_vocab(train_ds, 
                                 vectors=japanese_fasttext_vectors,
                                 min_freq=10)
TEXT.vocab.freqs

train_dl = torchtext.data.Iterator(
    train_ds, batch_size=batch_size, train=True)
val_dl = torchtext.data.Iterator(
    val_ds, batch_size=batch_size, train=False, sort=False)
test_dl = torchtext.data.Iterator(
    test_ds, batch_size=len(vars(test_ds)['examples']), train=False, sort=False)

In [17]:
next(iter(train_dl))


[torchtext.data.batch.Batch of size 64]
	[.Text]:('[torch.LongTensor of size 64x256]', '[torch.LongTensor of size 64]')
	[.Label]:[torch.LongTensor of size 64]

In [18]:
# # 動作確認
# batch = next(iter(train_dl))
# print(batch.Text[0])
# print(batch.Label)

# モデル構築

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [38]:
class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, h_dim, v_size, device='cpu', v_vec=None, batch_first=True):
        super(EncoderRNN, self).__init__()
        self.device = device
        self.h_dim = h_dim
        self.embed = nn.Embedding(v_size, emb_dim)
        if v_vec is not None:
            self.embed.weight.data.copy_(v_vec)
        self.lstm = nn.LSTM(emb_dim, h_dim, batch_first=batch_first,
                            bidirectional=True)

    def init_hidden(self, b_size):
        h0 = torch.zeros(1*2, b_size, self.h_dim).to(self.device)
        c0 = torch.zeros(1*2, b_size, self.h_dim).to(self.device)
        return (h0, c0)

    def forward(self, sentence, lengths=None):
        self.hidden = self.init_hidden(sentence.size(0))
        emb = self.embed(sentence)
        packed_emb = emb

        if lengths is not None:
            lengths = lengths.view(-1).tolist()
            packed_emb = nn.utils.rnn.pack_padded_sequence(emb, lengths)

        out, hidden = self.lstm(packed_emb, self.hidden)

        if lengths is not None:
            out = nn.utils.rnn.pad_packed_sequence(output)[0]

        out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]

        return out

In [39]:
class Attn(nn.Module):
    def __init__(self, h_dim):
        super(Attn, self).__init__()
        self.h_dim = h_dim
        self.main = nn.Sequential(
            nn.Linear(h_dim, 24),
            nn.ReLU(True),
            nn.Linear(24,1)
        )

    def forward(self, encoder_outputs):
        b_size = encoder_outputs.size(0)
        attn_ene = self.main(encoder_outputs.reshape(-1, self.h_dim)) # (b, s, h) -> (b * s, 1)
        return F.softmax(attn_ene.view(b_size, -1), dim=1).unsqueeze(2) # (b*s, 1) -> (b, s, 1)

In [40]:
class AttnClassifier(nn.Module):
    def __init__(self, h_dim, c_num):
        super(AttnClassifier, self).__init__()
        self.attn = Attn(h_dim)
        self.main = nn.Linear(h_dim, c_num)


    def forward(self, encoder_outputs):
        attns = self.attn(encoder_outputs) #(b, s, 1)
        feats = (encoder_outputs * attns).sum(dim=1) # (b, s, h) -> (b, h)
        return F.log_softmax(self.main(feats)), attns

In [41]:
torch.manual_seed(0)
emb_dim = 300
h_dim = 32
learning_rate = 1e-3

# make model
encoder = EncoderRNN(emb_dim, h_dim, len(TEXT.vocab), 
                     device=device, v_vec = TEXT.vocab.vectors).to(device)
classifier = AttnClassifier(h_dim, 2).to(device)

In [42]:
# init model
def weights_init(m):
    classname = m.__class__.__name__
    if hasattr(m, 'weight') and (classname.find('Embedding') == -1):
        nn.init.xavier_uniform(m.weight.data, gain=nn.init.calculate_gain('relu'))

for m in encoder.modules():
    print(m.__class__.__name__)
    weights_init(m)

for m in classifier.modules():
    print(m.__class__.__name__)
    weights_init(m)

optimizer = optim.Adam(
    chain(encoder.parameters(),classifier.parameters()), lr=learning_rate)

EncoderRNN
Embedding
LSTM
AttnClassifier
Attn
Sequential
Linear
ReLU
Linear
Linear


  """


In [43]:
def binary_accuracy(pred, y):
    #round predictions to the closest integer
    correct = (pred == y).float() #convert into float for division 
#     print(correct)
    acc = correct.sum()
    return acc

# 損失関数
criterion = nn.CrossEntropyLoss()

# 辞書オブジェクトにまとめる
dataloaders_dict = {'train': train_dl, 'val': val_dl}

In [44]:
# train model 
num_epochs = 10
for epoch in range(num_epochs):
    for phase in ['train', 'val']:
        if phase == 'train':
            encoder.train()
            classifier.train()
        else:
            encoder.eval()
            classifier.eval()

        epoch_loss = 0
        epoch_corrects = 0

        for idx, batch in enumerate(dataloaders_dict[phase]):
            x = batch.Text[0].to(device)
            y = batch.Label.to(device)

            optimizer.zero_grad()
            encoder_outputs = encoder(x)
            output, attn = classifier(encoder_outputs)
            loss = criterion(output, y)

            if phase == 'train':
                loss.backward()
                optimizer.step()

            pred = output.data.max(1, keepdim=True)[1]

            epoch_loss += loss.item() * x.size(0)
            epoch_corrects += binary_accuracy(pred.view(-1), y)


        # epochごとのlossと正解率
        epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
        epoch_acc = epoch_corrects.double() / len(dataloaders_dict[phase].dataset)

        print('Epoch {}/{} | {:^5} | Loss: {:.4f} Acc: {:.4f}'.format(
            epoch+1,
            num_epochs,
            phase,
            epoch_loss,
            epoch_acc))


  # This is added back by InteractiveShellApp.init_path()


Epoch 1/10 | train | Loss: 0.6974 Acc: 0.4632
Epoch 1/10 |  val  | Loss: 0.6914 Acc: 0.5797
Epoch 2/10 | train | Loss: 0.6903 Acc: 0.5240
Epoch 2/10 |  val  | Loss: 0.6868 Acc: 0.5411
Epoch 3/10 | train | Loss: 0.6861 Acc: 0.5859
Epoch 3/10 |  val  | Loss: 0.6904 Acc: 0.5169
Epoch 4/10 | train | Loss: 0.6648 Acc: 0.6163
Epoch 4/10 |  val  | Loss: 0.6879 Acc: 0.5266
Epoch 5/10 | train | Loss: 0.6101 Acc: 0.6801
Epoch 5/10 |  val  | Loss: 0.7781 Acc: 0.4928
Epoch 6/10 | train | Loss: 0.4956 Acc: 0.7782
Epoch 6/10 |  val  | Loss: 0.8338 Acc: 0.5024
Epoch 7/10 | train | Loss: 0.3140 Acc: 0.8803
Epoch 7/10 |  val  | Loss: 1.1254 Acc: 0.4928
Epoch 8/10 | train | Loss: 0.1759 Acc: 0.9293
Epoch 8/10 |  val  | Loss: 1.4235 Acc: 0.4734
Epoch 9/10 | train | Loss: 0.0854 Acc: 0.9686
Epoch 9/10 |  val  | Loss: 1.6436 Acc: 0.4976
Epoch 10/10 | train | Loss: 0.0572 Acc: 0.9823
Epoch 10/10 |  val  | Loss: 1.8177 Acc: 0.5072


In [96]:
def highlight(word, attn):
    html_color = '#%02X%02X%02X' % (255, int(255*(1 - attn)), int(255*(1 - attn)))
    return '<span style="background-color: {}">{}</span>'.format(html_color, word)

def mk_html(sentence, attns):
    html = ""
    for word, attn in zip(sentence, attns):
        html += ' ' + highlight(
            TEXT.vocab.itos[word].encode('utf-8'),
            attn
        )
    return html + "<br><br>"

In [97]:
f = open("attn.html", "w")
for batch in test_dl:
    x = batch.Text[0].to(device)
    y = batch.Label.to(device)
    encoder_outputs = encoder(x)
    output, attn = classifier(encoder_outputs)
    pred = output.data.max(1, keepdim=True)[1]
    a = attn.data[0,:,0]
    f.write( 
                         mk_html(x[0].cpu().detach().numpy(), a))
f.close()

  # This is added back by InteractiveShellApp.init_path()


In [98]:
TEXT.vocab.itos[x.data[0][10]]

'先行'

In [101]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import datetime
import cgitb

cgitb.enable()

to_day = str(datetime.date.today())
print('Content-type: text/html; charset=utf-8;\n\n')
print("""
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
""")
print("""<title>CGIsipt_test</title>
</head>
<body>
""")
print("""これはpythonで動的に生成されたHTMLです<br>
今日は%sです<br>
</body></html>
""").encode('utf-8')%to_day

Content-type: text/html; charset=utf-8;



<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">

<title>CGIsipt_test</title>
</head>
<body>

これはpythonで動的に生成されたHTMLです<br>
今日は%sです<br>
</body></html>



AttributeError: 'NoneType' object has no attribute 'encode'