In [None]:
# 形態素分析ライブラリーMeCab と 辞書(mecab-ipadic-NEologd)のインストール 
# !apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
# !git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
# !echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
# !pip install mecab-python3 > /dev/null

# シンボリックリンクによるエラー回避
# !ln -s /etc/mecabrc /usr/local/etc/mecabrc
# 辞書のパス
# path = "-d /usr/lib/x86_64-linux-gnu/mecab/dic/mecab-ipadic-neologd"


# pytorchのバージョン指定
!pip uninstall torch torchvision torchaudio
!pip install torchtext==0.8.1

# ライブラリのインストール
!pip install janome
!pip install japanize-matplotlib

In [None]:
# Mount Google Drive at /content/drive so files under it can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%matplotlib inline
import gc
import pdb
import re
from collections import Counter
from glob import glob
from warnings import simplefilter

# import MeCab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from gensim import matutils
from gensim.corpora import Dictionary
from janome.tokenizer import Tokenizer
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pack_padded_sequence
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset, Pipeline
# tqdm wraps the batch loops in train_model / evaluation; the original cell had a stray `t` here.
from tqdm.auto import tqdm

simplefilter('ignore', FutureWarning)
import japanize_matplotlib
japanize_matplotlib.japanize()

# Project directory on the mounted Drive; input/ and output/ paths are built from it.
root = "/content/drive/MyDrive/データ分析/dl4e/"

# Select the device: CUDA when available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Device: {DEVICE}")

# mecab
# mecab = MeCab.Tagger("-Owakati")

In [6]:
# Load the house-price table from the project's input directory.
data = pd.read_csv(root+"input/house_price.csv")
# data["target"] = data["target"].map(lambda d: d/10000)

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Inspect the "others_add_features_list" value of the row labelled 0.
# NOTE(review): the train/valid split reads "others_add_features" (no "_list" suffix) — confirm both columns exist.
data.loc[0,"others_add_features_list"]

In [9]:
# data = data[(data["target"] >= 60000) & (data["target"] <= 120000)]

In [10]:
# Hold out 25% of the (text, target) rows for validation; the fixed seed keeps the split reproducible.
train, valid = train_test_split(
    data[["others_add_features", "target"]],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Write both splits without index or header row, so every line is a plain (text, target) record.
# `header=False` is the documented way to omit the header (the original passed `None`).
train.to_csv(root+"output/house_price_train.csv", index=False, header=False)
valid.to_csv(root+"output/house_price_valid.csv", index=False, header=False)
# del train, valid, temp

In [11]:
def tokenizer(text):
    """Split ``text`` on the Japanese comma "、" and return the pieces as a list."""
    return text.split("、")

In [None]:
# Define the fields
# TEXT tokenizes with `tokenizer`; include_lengths=True is set, and the model's forward()
# documents the resulting batch.text as (text, text_length).
TEXT = Field(
    sequential=True, 
    tokenize=tokenizer, 
    # lower=True, 
    include_lengths=True, 
    # preprocessing=pipe, 
    # stop_words=rm
    )

# LABEL = LabelField(dtype=torch.float)
# The regression target is read as a float, without a vocabulary.
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)


# Read the header-less CSV files written by the split cell; columns map to (text, label) in order.
# NOTE: `train` is rebound here from the pandas split to a torchtext dataset.
train, val = TabularDataset.splits(
    path=root+"output/", 
    train="house_price_train.csv", 
    validation="house_price_valid.csv",
    format="csv",
    fields=[("text", TEXT), ("label", LABEL)]
)

# Assign an index to every token
# min_freq sets the minimum number of occurrences for a token to get its own index
TEXT.build_vocab(train, min_freq=2)
# NOTE(review): LABEL is declared with use_vocab=False, so this vocab is presumably unused — confirm before removing.
LABEL.build_vocab(train)

# Create the iterators
batch_size = 256
train_iter, val_iter = BucketIterator.splits(
    (train, val), batch_size=batch_size, device=DEVICE,
    sort=False
)

In [13]:
class HousePriceModel(nn.Module):
    """Bag-of-embeddings regressor: average the token embeddings, then apply a 3-layer MLP.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        output_dim: Size of the final linear layer's output.
        padding_idx: Vocabulary index of the padding token. ``nn.Embedding`` keeps
            that row at zero, and it is excluded from the average in ``forward``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, padding_idx):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.padding_idx = padding_idx

        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.linear1 = nn.Linear(embedding_dim, 32)
        self.linear2 = nn.Linear(32, 16)
        self.linear3 = nn.Linear(16, output_dim)
        # For binary classification, a Sigmoid would be applied to the final output instead.

    def forward(self, inputs):
        """Predict one output vector per sequence.

        Args:
            inputs: ``(text, text_length)`` pair; only ``inputs[0]`` is read. ``text``
                holds token ids shaped [sentence length, batch_size].

        Returns:
            Tensor shaped [batch_size, output_dim] (raw linear output, no activation).
        """
        token_ids = inputs[0]

        # [sentence length, batch_size, embedding_dim]
        embeds = self.embeddings(token_ids)

        # Average over real tokens only. A plain mean over dim 0 also divides by the
        # padded positions, so a sample's vector would change with the longest text in
        # its batch. Padding rows embed to zero, so the sum already ignores them.
        token_counts = (token_ids != self.padding_idx).sum(dim=0)
        # clamp avoids 0/0 for a column that is entirely padding
        denom = token_counts.clamp(min=1).unsqueeze(1).to(embeds.dtype)
        sentence_vec = embeds.sum(dim=0) / denom  # [batch_size, embedding_dim]

        hidden = F.relu(self.linear1(sentence_vec))  # [batch_size, 32]
        hidden = F.relu(self.linear2(hidden))  # [batch_size, 16]
        output = self.linear3(hidden)  # [batch_size, output_dim]

        return output

In [14]:
def train_model(model, loss_function, optimizer, num_epochs=20):
    """Train ``model`` on the notebook-level ``train_iter`` and return it.

    Args:
        model: Module called as ``model(batch.text)``.
        loss_function: Callable ``loss_function(prediction, target)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_iter``.

    Returns:
        The same ``model`` object after training.
    """
    # Switch to training mode
    model.train()

    for epoch in range(num_epochs):
        epoch_loss, num_batches = 0.0, 0
        for batch in tqdm(train_iter):
            optimizer.zero_grad()  # reset gradients
            output = model(batch.text)
            target = batch.label.float()

            # With output_dim=1 the model emits [batch_size, 1] while the labels are
            # [batch_size]; without dropping the trailing axis the loss broadcasts the
            # pair to [batch_size, batch_size] and compares every prediction to every label.
            if output.dim() == target.dim() + 1 and output.size(-1) == 1:
                output = output.squeeze(-1)

            loss = loss_function(output, target)
            epoch_loss += loss.item()
            num_batches += 1

            loss.backward()  # backpropagation
            optimizer.step()  # parameter update

        # Each accumulated value is one batch's loss, so average over batches
        # (the original divided by the number of examples, len(train)).
        avg_train_loss = epoch_loss / max(num_batches, 1)

        print('Epoch [{}/{}], train_loss: {train_loss:.5f}'
              .format(epoch+1, num_epochs, train_loss=avg_train_loss))

    return model

In [15]:
def evaluation(model):
    """Print the MSE and MAE of ``model`` over the notebook-level ``val_iter``.

    Args:
        model: Module called as ``model(batch.text)``.

    Returns:
        None. The metrics are printed.
    """
    model.eval()  # inference mode
    # Sum reductions let the totals be divided by the exact sample count; averaging
    # per-batch means would over-weight a short final batch.
    mse = nn.MSELoss(reduction="sum")
    mae = nn.L1Loss(reduction="sum")

    squared_error, absolute_error, num_samples = 0.0, 0.0, 0

    # No autograd graph is needed for evaluation
    with torch.no_grad():
        for batch in tqdm(val_iter):
            target = batch.label
            output = model(batch.text)
            # Drop only the trailing size-1 axis; a bare squeeze() would also collapse
            # a batch of one sample to a 0-d tensor.
            if output.dim() == target.dim() + 1 and output.size(-1) == 1:
                output = output.squeeze(-1)
            squared_error += mse(output, target).item()
            absolute_error += mae(output, target).item()
            num_samples += target.numel()

    if num_samples == 0:
        print('\nNo validation samples to evaluate.')
        return

    print('\nValid MSE: {:.2f}'.format(squared_error / num_samples))
    print('Valid MAE: {:.2f}'.format(absolute_error / num_samples))


In [None]:
# Model hyperparameters; the embedding table is sized from the vocabulary built on the training set.
vocab_size = len(TEXT.vocab)
embedding_dim = 16
output_dim = 1
padding_idx = TEXT.vocab.stoi["<pad>"]

# Define the model, loss and optimizer
house_price_model = HousePriceModel(vocab_size, embedding_dim, output_dim, padding_idx).to(DEVICE)
loss_function = nn.MSELoss().to(DEVICE)
optimizer = optim.Adam(house_price_model.parameters(), lr=0.0075)

# Train the model
house_price_model = train_model(house_price_model, loss_function, optimizer, num_epochs=500)

# Evaluate the model
evaluation(house_price_model)

In [27]:
# Convert a sequence of token ids back into text
def token2text(tokens, TEXT):
    """Join the vocabulary strings for ``tokens``, stopping at the first "<pad>".

    Args:
        tokens: Iterable of token ids usable as indices into ``TEXT.vocab.itos``.
        TEXT: Field-like object exposing ``vocab.itos``.

    Returns:
        The looked-up strings concatenated with no separator.
    """
    itos = TEXT.vocab.itos
    words = []
    for token_id in tokens:
        word = itos[token_id]
        if word == "<pad>":
            # everything from the first padding token onward is dropped
            break
        words.append(word)
    return "".join(words)


# Print up to `data_num` examples from the first validation batch with their predictions
def print_result(model, data_num=5):
    """Show input text, true label and prediction for samples of the first ``val_iter`` batch.

    Args:
        model: Module called as ``model(batch.text)``.
        data_num: Maximum number of examples to print. It is capped at the batch
            size, so asking for more than one batch holds does not raise IndexError.
    """
    model.eval()  # inference mode
    batch = next(iter(val_iter))
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        predicts = model(batch.text)

    num_shown = min(data_num, len(predicts))
    for i in range(num_shown):
        # batch.text is (token ids, lengths); column i holds the i-th example
        tokens = batch.text[0][:, i]
        print("input text: {}".format(token2text(tokens, TEXT)))
        print("answer label: {}".format(batch.label[i]))
        print("predicted label: {}\n".format(predicts[i]))

In [None]:
# Check example outputs
print_result(house_price_model, data_num=10)

In [29]:
def returan_all_result(model):
    """Collect text, prediction and true label for every example in ``val_iter``.

    Args:
        model: Module called as ``model(batch.text)``.

    Returns:
        dict with keys "text", "predict" and "answer", each a list with one entry
        per validation example.
    """
    model.eval()

    r = {
        "text": [], "predict": [], "answer": [],
        }
    # Inference only: skip building the autograd graph
    with torch.no_grad():
        for b in val_iter:
            predicts = model(b.text)

            for i in range(len(predicts)):
                tokens = b.text[0][:, i]
                r["text"].append(token2text(tokens, TEXT))
                r["predict"].append(float(predicts[i]))
                # Read the label from the batch being processed; the original read it
                # from the first batch, mislabelling every later batch.
                r["answer"].append(float(b.label[i]))

    return r


In [None]:
# Gather text / prediction / answer lists for the validation iterator.
result = returan_all_result(house_price_model)

In [31]:
# One row per validation example, with columns text / predict / answer.
result_df = pd.DataFrame(result)

In [None]:
# Summary statistics of the result table.
result_df.describe()

In [None]:
# Order the rows by the true target value, then preview the first rows.
result_df = result_df.sort_values('answer')
result_df.head()

In [34]:
# Save the predictions as CSV in the project root, without the index column.
result_df.to_csv(root+"nlp_predicts.csv", index=False)

In [None]:
# Scatter plot of the true value (x) against the prediction (y).
result_df.plot.scatter(x='answer', y='predict', alpha=0.5)

In [None]:
# Number of rows in the result table.
len(result_df)