特徴量エンジニアリング(name列)
 - tfidf → SVD(50次元)
 - tfidf → NMF(50次元)
 - BERTを用いた特徴量抽出
 - Universal Sentence Encoderを用いた特徴量抽出

In [None]:
# Mount Google Drive so that files stored there are visible to this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Set the current working directory to the competition folder on Drive,
# so that relative paths are resolved against it.
import os
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install tensorflow_text

In [None]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

import torch
import transformers
from transformers import BertTokenizer
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD, PCA, NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

from src.config import *
# import src.preprocessing as pr

warnings.filterwarnings('ignore')

In [None]:
import re
import datetime
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from geopy.distance import geodesic


def nearest_station(df, df_station):
    """
    Return, for every row of ``df``, the position of the closest station.

    Distances are geodesic distances in kilometres between the row's
    (latitude, longitude) and each station's (latitude, longitude).

    Args:
        df: DataFrame with COL_LATITUDE / COL_LONGITUDE columns.
        df_station: DataFrame of stations with the same two columns.

    Returns:
        A list with one integer per row of ``df``: the 0-based row position
        in ``df_station`` of the nearest station (the first one on ties).

    Raises:
        ValueError: if ``df`` has rows but ``df_station`` is empty.

    Notes:
        Rows are read positionally, so neither frame needs a default
        0..n-1 index. The work is len(df) * len(df_station) geodesic calls;
        the original notebook reported about 7 min 24 s for ~14,000 rows.
    """
    # Station coordinates do not depend on the listing, so build them once
    # instead of re-reading them from the frame for every row of ``df``.
    station_coordinates = list(
        zip(df_station[COL_LATITUDE].tolist(), df_station[COL_LONGITUDE].tolist())
    )

    nearest_indices = []
    for coordinate in zip(df[COL_LATITUDE].tolist(), df[COL_LONGITUDE].tolist()):
        distances = [
            geodesic(coordinate, station_coordinate).km
            for station_coordinate in station_coordinates
        ]
        nearest_indices.append(distances.index(min(distances)))
    return nearest_indices


def remove_symbol(text):
    """
    Replace unwanted symbols in a listing name with spaces.

    The literal substring 'wi-fi' is first collapsed to 'wifi' (the match is
    case-sensitive), then every character listed in the symbol class below
    is replaced by a single space.
    """
    symbol_class = '[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％★✣♪◎☆￫〜◇✋▲△⭐︎丨❤▶☀️※《》☕️✦♯♬♡]'
    # Merge 'wi-fi' before stripping symbols; otherwise the hyphen would be
    # turned into a space and split the word in two.
    without_hyphenated_wifi = text.replace(r'wi-fi', 'wifi')
    return re.sub(symbol_class, ' ', without_hyphenated_wifi)


def create_elapsed_days(df):
    """
    Add the number of days elapsed up to 2020-04-30.

    The difference between 2020-04-30 and the COL_LAST_REVIEW column is
    written, in whole days, to the COL_ELAPSED_DAYS column. ``df`` is
    modified in place and also returned.
    """
    reference_date = datetime.datetime(2020, 4, 30)
    elapsed = reference_date - df[COL_LAST_REVIEW]
    df[COL_ELAPSED_DAYS] = elapsed.dt.days
    return df


def enc_categorical(df, col_list, method):
    """
    Encode categorical columns with one-hot or label encoding.

    Args:
        df: DataFrame holding the columns to encode.
        col_list: names of the categorical columns.
        method: 'one-hot' (pd.get_dummies with drop_first=True) or
            'label-enc' (a separate LabelEncoder fitted per column).

    Returns:
        The encoded DataFrame. With 'one-hot' this is the frame returned by
        pd.get_dummies; with 'label-enc' the columns of ``df`` are
        overwritten and ``df`` itself is returned.

    Raises:
        ValueError: if ``method`` is neither 'one-hot' nor 'label-enc'.
    """
    if method == 'one-hot':
        return pd.get_dummies(df, columns=col_list, drop_first=True)

    if method == 'label-enc':
        for col in col_list:
            df[col] = LabelEncoder().fit_transform(df[col])
        return df

    # Previously an unknown method fell through and silently returned None,
    # which only surfaced later as an unrelated error in the caller.
    raise ValueError(
        f"Unknown encoding method {method!r}; expected 'one-hot' or 'label-enc'."
    )

In [None]:
class BertSequenceVectorizer:
    """
    Turn a sentence into a fixed-size vector using a pretrained BERT model.

    The tokenizer and model are loaded with ``BertTokenizer`` and
    ``transformers.BertModel``. The sentence vector is the last-layer hidden
    state at position 0 (the [CLS] token).
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        """
        Args:
            model_name: checkpoint name passed to ``from_pretrained``.
            max_len: number of token ids fed to the model; longer inputs are
                truncated and shorter ones are padded to this length.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for feature extraction, so make inference
        # mode explicit (dropout disabled).
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """
        Encode ``sentence`` and return its [CLS] hidden state.

        Args:
            sentence: text to encode.

        Returns:
            1-D numpy array holding the last hidden state of the first token
            (768 values for BERT-base sized models).
        """
        # Keep at most max_len ids, then right-pad to exactly max_len.
        token_ids = self.tokenizer.encode(sentence)[:self.max_len]
        pad_len = self.max_len - len(token_ids)
        # Padding uses id 0 and is masked out via the attention mask.
        # NOTE(review): assumes the tokenizer's pad token id is 0 -- confirm
        # before using a vocabulary other than BERT's.
        inputs = token_ids + [0] * pad_len
        masks = [1] * len(token_ids) + [0] * pad_len

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping autograd
        # avoids building a graph for every sentence.
        with torch.no_grad():
            bert_out = self.bert_model(inputs_tensor, attention_mask=masks_tensor)

        # Batch item 0, token 0 is the [CLS] token.
        cls_vector = bert_out['last_hidden_state'][0][0]
        # .cpu() is a no-op for tensors already on the CPU, so a single code
        # path serves both devices.
        return cls_vector.detach().cpu().numpy()

In [None]:
# Read the training and test CSVs with identical parsing options:
# the last-review column is parsed as a date and dtypes come from DICT_DTYPES.
csv_options = dict(parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
df_train = pd.read_csv('input/train_data_augument.csv', **csv_options)
df_test = pd.read_csv('input/test_data.csv', **csv_options)

In [None]:
# Stack the name column of train and test (train rows first) and renumber
# the rows from 0 so that both sets can be featurized in one pass.
name_frames = [frame[[COL_NAME]] for frame in (df_train, df_test)]
df_all_name = pd.concat(name_frames, axis=0).reset_index(drop=True)
df_all_name.head()

Unnamed: 0,name
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...
1,Downtown Tokyo Iriya next to Ueno
2,"Japan Style,Private,Affordable,4min to Sta."
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi
4,LICENSED SHINJUKU HOUSE: Heart of the action!


In [None]:
# Lower-case the names first (remove_symbol's 'wi-fi' replacement is
# case-sensitive), then replace symbols with spaces.
lowered_names = df_all_name[COL_NAME].str.lower()
df_all_name[COL_CLEAN_NAME] = lowered_names.apply(remove_symbol)

In [None]:
# Preview the first rows to compare the raw and the cleaned names.
df_all_name.head()

Unnamed: 0,name,clean_name
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,kiyosumishirakawa 3min skytree wifi max4 tre...
1,Downtown Tokyo Iriya next to Ueno,downtown tokyo iriya next to ueno
2,"Japan Style,Private,Affordable,4min to Sta.",japan style private affordable 4min to sta
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,4 min to shinjuku sta by train 2 ppl wifi
4,LICENSED SHINJUKU HOUSE: Heart of the action!,licensed shinjuku house heart of the action


#### BERTによる特徴量抽出

In [None]:
# Embed every cleaned name with multilingual BERT (inputs capped/padded to
# 64 token ids) and stack the per-name vectors into one 2-D array.
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=64)
features = np.stack(
    [BSV.vectorize(name).reshape(-1) for name in df_all_name[COL_CLEAN_NAME]]
)

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# NOTE(review): BertSequenceVectorizer loads its tokenizer and weights through
# BertTokenizer / BertModel, so the "roberta-base" checkpoint is presumably
# not a drop-in fit for it -- confirm this cell behaves as intended before
# relying on its output.
roBSV = BertSequenceVectorizer(
    model_name="roberta-base",
    max_len=64)
# Keep the RoBERTa vectors in their own variable. The following cell builds
# the `name_bert_enb_*` columns from `features`, so overwriting `features`
# here would silently replace the multilingual-BERT embeddings.
features_roberta = np.stack(
    df_all_name[COL_CLEAN_NAME].map(lambda x: roBSV.vectorize(x).reshape(-1)).values
)

In [None]:
# Wrap the embedding matrix in a DataFrame with one numbered column per
# embedding dimension (numbering starts at 1).
bert_columns = [f'name_bert_enb_{i + 1}' for i in range(features.shape[1])]
df_name_emb = pd.DataFrame(features, columns=bert_columns)
df_name_emb.head()

Unnamed: 0,name_bert_enb_1,name_bert_enb_2,name_bert_enb_3,name_bert_enb_4,name_bert_enb_5,name_bert_enb_6,name_bert_enb_7,name_bert_enb_8,name_bert_enb_9,name_bert_enb_10,...,name_bert_enb_759,name_bert_enb_760,name_bert_enb_761,name_bert_enb_762,name_bert_enb_763,name_bert_enb_764,name_bert_enb_765,name_bert_enb_766,name_bert_enb_767,name_bert_enb_768
0,-0.036676,-0.049738,0.050754,-0.02562,-0.331241,0.013945,-0.067799,-0.020603,-1.920738,-0.024602,...,-0.00852,2.122303,-0.113638,-0.033221,0.078959,0.016577,-0.089147,-0.006365,-0.010549,-0.050055
1,0.069604,-0.127225,0.024435,-0.011773,-0.351301,0.142089,-0.040712,0.042105,-1.932358,-0.063492,...,-0.113498,2.09374,-0.07674,0.017097,0.146325,-0.094669,-0.218213,0.041695,0.007427,0.01232
2,-0.116778,-0.017075,0.004824,-0.087936,-0.345614,0.037181,-0.043724,-0.075973,-1.831783,-0.07497,...,0.113531,2.09504,-0.079617,-0.148137,0.08085,-0.025728,-0.097848,0.044289,-0.027747,-0.017916
3,-0.054565,-0.04879,0.066115,0.034154,-0.338605,0.081742,-0.155228,-0.137337,-1.803986,-0.045013,...,-0.069991,2.176921,-0.117206,-0.168948,0.041346,0.045783,-0.080928,0.068311,-0.186026,0.033761
4,-0.021847,0.017521,0.082993,-0.03214,-0.223052,0.050514,-0.031871,-0.007857,-1.833061,-0.029889,...,0.008655,2.055185,-0.093976,-0.035631,0.095189,-0.012795,-0.08965,0.086061,0.015115,-0.058719


#### Universal Sentence Encoderによる特徴量抽出

In [None]:
# Load the multilingual Universal Sentence Encoder from TF Hub.
embedder = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Register `progress_apply` on pandas objects so the loop shows a progress bar.
tqdm.pandas()


def _embed_name(text):
    # Encode one name and flatten the model output to a 1-D vector.
    return embedder(text).numpy().reshape(-1)


sentence_vectors = df_all_name[COL_CLEAN_NAME].progress_apply(_embed_name)
features = np.stack(sentence_vectors.values)

100%|██████████| 24796/24796 [03:01<00:00, 136.51it/s]


In [None]:
# Wrap the sentence-encoder matrix in a DataFrame with one numbered column
# per embedding dimension (numbering starts at 1).
sentence_enc_columns = [f'name_sentence_enc_{i + 1}' for i in range(features.shape[1])]
df_name_sentence_enc = pd.DataFrame(features, columns=sentence_enc_columns)
df_name_sentence_enc.head()

Unnamed: 0,name_sentence_enc_1,name_sentence_enc_2,name_sentence_enc_3,name_sentence_enc_4,name_sentence_enc_5,name_sentence_enc_6,name_sentence_enc_7,name_sentence_enc_8,name_sentence_enc_9,name_sentence_enc_10,...,name_sentence_enc_503,name_sentence_enc_504,name_sentence_enc_505,name_sentence_enc_506,name_sentence_enc_507,name_sentence_enc_508,name_sentence_enc_509,name_sentence_enc_510,name_sentence_enc_511,name_sentence_enc_512
0,-0.020817,-0.024469,0.009413,0.002318,-0.02266,0.066125,0.048065,-0.052013,-0.021806,0.09154,...,-0.060178,-0.020622,0.068885,0.062525,-0.05517,-0.016464,-0.014358,0.002798,0.012901,-0.019419
1,-0.016604,0.03226,0.021041,0.022471,-0.031093,0.037494,-0.038235,-0.012227,-0.071438,0.088568,...,-0.038681,0.007019,0.043597,0.022773,-0.045433,0.018652,0.015996,-0.02,-0.019207,-0.057829
2,-0.051247,0.014204,0.004854,-0.077494,-0.007085,0.031117,-0.07205,0.089297,-0.03022,0.087944,...,-0.055185,-0.026338,0.045335,0.055183,0.006401,-0.026624,0.067077,-0.050112,0.052876,-0.046238
3,-0.027279,-0.023753,0.003625,-0.025298,-0.074566,-0.002388,-0.062128,-0.033083,-0.001888,0.083536,...,-0.04772,-0.048412,0.057754,0.016479,-0.073248,0.050621,0.007236,-0.084796,0.065109,0.003366
4,0.043166,0.033854,0.02448,-0.020102,0.013765,0.037079,0.004255,0.002165,-0.02713,0.088773,...,-0.034981,-0.002184,-0.006184,0.055101,0.019394,0.013484,-0.021094,-0.028609,-0.008339,-0.058638


#### tfidf → 次元削減

In [None]:
def _tfidf_then(reducer_name, reducer):
    # TF-IDF vectorization followed by the given dimensionality reducer.
    return Pipeline(steps=[
        ("TfidfVectorizer", TfidfVectorizer()),
        (reducer_name, reducer),
    ])


# Two 50-component reductions of the same TF-IDF representation.
tfidf_svd = _tfidf_then('TruncatedSVD', TruncatedSVD(n_components=50, random_state=0))
tfidf_nmf = _tfidf_then('NMF', NMF(n_components=50, random_state=0))

# Both pipelines are fitted on the combined train + test names.
clean_names = df_all_name[COL_CLEAN_NAME]
svd_features = tfidf_svd.fit_transform(clean_names)
nmf_features = tfidf_nmf.fit_transform(clean_names)

In [None]:
# One numbered column per reduced component (numbering starts at 1).
svd_columns = [f'name_svd_{i + 1}' for i in range(svd_features.shape[1])]
nmf_columns = [f'name_nmf_{i + 1}' for i in range(nmf_features.shape[1])]
df_name_svd = pd.DataFrame(svd_features, columns=svd_columns)
df_name_nmf = pd.DataFrame(nmf_features, columns=nmf_columns)
# Show the first rows of each frame, SVD first.
for preview in (df_name_svd, df_name_nmf):
    display(preview.head())

Unnamed: 0,name_svd_1,name_svd_2,name_svd_3,name_svd_4,name_svd_5,name_svd_6,name_svd_7,name_svd_8,name_svd_9,name_svd_10,...,name_svd_41,name_svd_42,name_svd_43,name_svd_44,name_svd_45,name_svd_46,name_svd_47,name_svd_48,name_svd_49,name_svd_50
0,0.085872,0.076671,-0.006118,-0.035097,0.089097,-8.7e-05,-0.026398,0.001729,-0.0194,-0.036538,...,-0.076041,-0.012677,0.070628,0.081684,0.085562,-0.117829,0.103666,0.014454,-0.026341,-0.049899
1,0.141998,-0.102595,0.010942,-0.135544,0.059983,-0.076209,0.127684,0.014239,0.035072,-0.05706,...,0.002216,0.035651,-0.029237,-0.007453,0.006159,-0.003905,0.003355,0.024152,-0.045912,0.013117
2,0.171557,-0.020864,0.042284,-0.04299,-0.041844,-0.174367,-0.001621,0.061557,0.008615,0.006035,...,0.044219,-0.019584,-0.050702,-0.038299,0.007712,-0.078284,0.036298,-0.005927,0.02873,0.031145
3,0.391594,0.10029,-0.179886,-0.178896,-0.187117,-0.25127,-0.091131,-0.036105,-0.041105,-0.020006,...,0.105183,0.071906,0.071762,-0.110729,0.048166,-0.076391,-0.028395,-0.029964,-0.054129,-0.047571
4,0.123735,-0.016868,0.07286,-0.00344,-0.146279,0.083058,0.126626,-0.069506,-0.040334,-0.058586,...,-0.00977,-0.003196,-0.015742,0.020123,-0.013877,0.014267,-0.022825,-0.014977,0.01328,-0.023412


Unnamed: 0,name_nmf_1,name_nmf_2,name_nmf_3,name_nmf_4,name_nmf_5,name_nmf_6,name_nmf_7,name_nmf_8,name_nmf_9,name_nmf_10,...,name_nmf_41,name_nmf_42,name_nmf_43,name_nmf_44,name_nmf_45,name_nmf_46,name_nmf_47,name_nmf_48,name_nmf_49,name_nmf_50
0,0.0,0.027505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.063755,0.00576,0.093923,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.001052,0.0,0.0,0.0,0.036223,0.0,...,0.0,0.0,0.0,0.0,0.001224,0.001658,0.0,0.0,0.0,0.0
2,0.012813,0.0,0.0,0.0,0.0,0.0,0.0,0.000727,0.032873,0.000453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002689
3,0.013943,0.036904,0.0,0.0,0.0,0.0,0.0,0.036643,0.038691,0.0,...,0.0,0.004349,0.0,0.015473,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.058476,0.026783,0.0,0.033969,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Join the four feature groups side by side; rows stay aligned because every
# frame was built from df_all_name in the same row order.
name_feature_blocks = [df_name_emb, df_name_sentence_enc, df_name_svd, df_name_nmf]
df_name_features = pd.concat(name_feature_blocks, axis=1)
df_name_features.shape

(24796, 1380)

In [None]:
# Row count of the training set; it is the boundary used to split the
# combined feature frame back into train and test.
df_train.shape

(19800, 13)

In [None]:
# The combined frame holds the train rows first, then the test rows, so a
# positional split at the training row count recovers both parts.
n_train = len(df_train)
df_train_name_features = df_name_features.iloc[:n_train].reset_index(drop=True)
df_test_name_features = df_name_features.iloc[n_train:].reset_index(drop=True)

In [None]:
# Sanity check: report the shapes of the train and test feature frames.
print(f'df_train shape: {df_train_name_features.shape}')
print(f'df_test shape: {df_test_name_features.shape}')

df_train shape: (19800, 1380)
df_test shape: (4996, 1380)


In [None]:
# Write the train and test name features to CSV, omitting the index column.
df_train_name_features.to_csv('input/train_data_name_features_augumentation.csv', index=False)
df_test_name_features.to_csv('input/test_data_name_features_augumentation.csv', index=False)