特徴量エンジニアリング(name列)
 - tfidf → SVD(50次元)
 - tfidf → NMF(50次元)
 - BERTを用いた特徴量抽出
 - Universal Sentence Encoderを用いた特徴量抽出

In [1]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Set the working directory to the competition folder; the data files in
# later cells are addressed relative to it (e.g. 'input/train_data.csv').
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [None]:
# Install transformers and tensorflow_text; both are imported in the next cell.
!pip install transformers
!pip install tensorflow_text

In [3]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

import torch
import transformers
from transformers import BertTokenizer
import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

from src.config import *
import src.preprocessing as pr

# Suppress every Python warning from here on. This keeps cell output short,
# but it also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [4]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained BERT model.

    The vector is the last-layer hidden state of the first token ([CLS]).

    Args:
        model_name: Name or path handed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_len: Number of token ids fed to the model per sentence. Shorter
            inputs are padded, longer inputs are truncated.
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the [CLS] hidden state of ``sentence`` as a 1-D NumPy array.

        Args:
            sentence: Text to encode.

        Returns:
            The first-token vector of ``last_hidden_state`` (768 values for
            BERT-base checkpoints), always located on the CPU.
        """
        token_ids = self.tokenizer.encode(sentence)
        n_tokens = len(token_ids)

        if n_tokens >= self.max_len:
            inputs = token_ids[:self.max_len]
            # ``encode`` ends the sequence with a closing special token
            # ([SEP]); a plain slice would cut it off, so put it back in the
            # last slot whenever there is room besides the leading [CLS].
            if self.max_len > 1:
                inputs[-1] = token_ids[-1]
            masks = [1] * self.max_len
        else:
            # Pad with the tokenizer's own padding id (0 for BERT vocabularies)
            # and mask the padded positions out of attention.
            pad_id = getattr(self.tokenizer, "pad_token_id", None)
            if pad_id is None:
                pad_id = 0
            n_pad = self.max_len - n_tokens
            inputs = token_ids + [pad_id] * n_pad
            masks = [1] * n_tokens + [0] * n_pad

        inputs_tensor = torch.tensor([inputs], dtype=torch.long).to(self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)

        # Pure inference: skip building the autograd graph for every sentence.
        with torch.no_grad():
            # Second positional argument is the attention mask.
            bert_out = self.bert_model(inputs_tensor, masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # Position 0 is the [CLS] token. ``.cpu()`` is a no-op for tensors that
        # already live on the CPU, so one code path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()

In [5]:
# Train and test are read with identical options: the COL_LAST_REVIEW column
# is parsed as a date and column dtypes are taken from DICT_DTYPES.
_read_opts = dict(parse_dates=[COL_LAST_REVIEW], dtype=DICT_DTYPES)
df_train = pd.read_csv('input/train_data.csv', **_read_opts)
df_test = pd.read_csv('input/test_data.csv', **_read_opts)

In [6]:
# Stack the name column of train on top of test (train rows first) and
# renumber the rows 0..n-1 so train/test can later be split by position.
_name_frames = [df_train[[COL_NAME]], df_test[[COL_NAME]]]
df_all_name = pd.concat(_name_frames, axis=0, ignore_index=True)
df_all_name.head()

Unnamed: 0,name
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...
1,Downtown Tokyo Iriya next to Ueno
2,"Japan Style,Private,Affordable,4min to Sta."
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi
4,LICENSED SHINJUKU HOUSE: Heart of the action!


In [7]:
# Build the cleaned name: lower-case first, then run every value through
# pr.remove_symbol.
_lowered_names = df_all_name[COL_NAME].str.lower()
df_all_name[COL_CLEAN_NAME] = _lowered_names.apply(pr.remove_symbol)

In [8]:
# Preview the first rows to compare the raw names with the cleaned ones.
df_all_name.head()

Unnamed: 0,name,clean_name
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,kiyosumishirakawa 3min skytree wifi max4 tre...
1,Downtown Tokyo Iriya next to Ueno,downtown tokyo iriya next to ueno
2,"Japan Style,Private,Affordable,4min to Sta.",japan style private affordable 4min to sta
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,4 min to shinjuku sta by train 2 ppl wifi
4,LICENSED SHINJUKU HOUSE: Heart of the action!,licensed shinjuku house heart of the action


#### BERTによる特徴量抽出

In [9]:
# Multilingual BERT, 64 token ids per listing name.
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=64)

# One flattened vector per cleaned name, stacked into (n_rows, n_dims).
features = np.stack(
    [BSV.vectorize(text).reshape(-1) for text in df_all_name[COL_CLEAN_NAME]]
)

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# One column per embedding dimension, numbered from 1. The 'enb' spelling is
# kept as-is because these column names end up in the saved feature CSVs.
_bert_columns = [f'name_bert_enb_{i + 1}' for i in range(features.shape[1])]
df_name_emb = pd.DataFrame(data=features, columns=_bert_columns)
df_name_emb.head()

Unnamed: 0,name_bert_enb_1,name_bert_enb_2,name_bert_enb_3,name_bert_enb_4,name_bert_enb_5,name_bert_enb_6,name_bert_enb_7,name_bert_enb_8,name_bert_enb_9,name_bert_enb_10,...,name_bert_enb_759,name_bert_enb_760,name_bert_enb_761,name_bert_enb_762,name_bert_enb_763,name_bert_enb_764,name_bert_enb_765,name_bert_enb_766,name_bert_enb_767,name_bert_enb_768
0,-0.036676,-0.049738,0.050754,-0.02562,-0.331241,0.013945,-0.067799,-0.020603,-1.920739,-0.024602,...,-0.00852,2.122303,-0.113638,-0.033221,0.078959,0.016577,-0.089147,-0.006365,-0.010549,-0.050055
1,0.069604,-0.127225,0.024435,-0.011773,-0.351301,0.142089,-0.040712,0.042105,-1.932358,-0.063492,...,-0.113498,2.09374,-0.07674,0.017098,0.146325,-0.094669,-0.218214,0.041695,0.007428,0.01232
2,-0.116778,-0.017075,0.004824,-0.087936,-0.345614,0.037181,-0.043724,-0.075973,-1.831782,-0.07497,...,0.113531,2.095039,-0.079617,-0.148137,0.08085,-0.025728,-0.097848,0.044289,-0.027747,-0.017916
3,-0.054566,-0.048791,0.066115,0.034154,-0.338604,0.081742,-0.155228,-0.137337,-1.803986,-0.045013,...,-0.069991,2.176922,-0.117207,-0.168948,0.041346,0.045783,-0.080928,0.068312,-0.186026,0.033761
4,-0.021847,0.017521,0.082993,-0.03214,-0.223052,0.050514,-0.031871,-0.007857,-1.833062,-0.029889,...,0.008656,2.055184,-0.093976,-0.035631,0.095189,-0.012796,-0.08965,0.086061,0.015116,-0.058719


#### Universal Sentence Encoderによる特徴量抽出

In [11]:
tqdm.pandas()  # enables Series.progress_apply

embedder = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")


def _encode_name(text):
    """Embed one string with the loaded encoder and flatten the result to 1-D."""
    return embedder(text).numpy().reshape(-1)


# One vector per cleaned name (with a progress bar), stacked into a 2-D array.
_encoded_names = df_all_name[COL_CLEAN_NAME].progress_apply(_encode_name)
features = np.stack(_encoded_names.values)

100%|██████████| 14986/14986 [01:38<00:00, 152.13it/s]


In [12]:
# One column per sentence-encoder dimension, numbered from 1.
_sentence_columns = [f'name_sentence_enc_{i + 1}' for i in range(features.shape[1])]
df_name_sentence_enc = pd.DataFrame(data=features, columns=_sentence_columns)
df_name_sentence_enc.head()

Unnamed: 0,name_sentence_enc_1,name_sentence_enc_2,name_sentence_enc_3,name_sentence_enc_4,name_sentence_enc_5,name_sentence_enc_6,name_sentence_enc_7,name_sentence_enc_8,name_sentence_enc_9,name_sentence_enc_10,...,name_sentence_enc_503,name_sentence_enc_504,name_sentence_enc_505,name_sentence_enc_506,name_sentence_enc_507,name_sentence_enc_508,name_sentence_enc_509,name_sentence_enc_510,name_sentence_enc_511,name_sentence_enc_512
0,-0.020817,-0.024469,0.009413,0.002318,-0.02266,0.066125,0.048065,-0.052013,-0.021806,0.09154,...,-0.060178,-0.020622,0.068885,0.062525,-0.05517,-0.016464,-0.014358,0.002798,0.012901,-0.019419
1,-0.016604,0.03226,0.021041,0.022471,-0.031093,0.037494,-0.038235,-0.012227,-0.071438,0.088568,...,-0.038681,0.007019,0.043597,0.022773,-0.045433,0.018652,0.015996,-0.02,-0.019207,-0.057829
2,-0.051247,0.014204,0.004854,-0.077494,-0.007085,0.031117,-0.07205,0.089297,-0.03022,0.087944,...,-0.055185,-0.026338,0.045335,0.055183,0.006401,-0.026624,0.067077,-0.050111,0.052876,-0.046238
3,-0.027279,-0.023753,0.003625,-0.025298,-0.074566,-0.002388,-0.062128,-0.033083,-0.001888,0.083536,...,-0.04772,-0.048412,0.057754,0.016479,-0.073248,0.050621,0.007236,-0.084796,0.065109,0.003366
4,0.043166,0.033854,0.02448,-0.020102,0.013765,0.037079,0.004255,0.002165,-0.02713,0.088773,...,-0.034981,-0.002184,-0.006184,0.055101,0.019394,0.013484,-0.021094,-0.028609,-0.008339,-0.058638


#### tfidf → 次元削減

In [13]:
def _tfidf_then(step_name, reducer):
    """Return a pipeline: default TfidfVectorizer followed by ``reducer``."""
    return Pipeline(steps=[
        ("TfidfVectorizer", TfidfVectorizer()),
        (step_name, reducer),
    ])


# Two 50-component reductions of the same TF-IDF representation, each with its
# own vectorizer and a fixed seed.
tfidf_svd = _tfidf_then('TruncatedSVD', TruncatedSVD(n_components=50, random_state=0))
tfidf_nmf = _tfidf_then('NMF', NMF(n_components=50, random_state=0))

# Fit on all cleaned names (train and test together) and transform in one go.
_clean_names = df_all_name[COL_CLEAN_NAME]
svd_features = tfidf_svd.fit_transform(_clean_names)
nmf_features = tfidf_nmf.fit_transform(_clean_names)

In [14]:
# Wrap both reduced matrices in DataFrames, columns numbered from 1.
_svd_columns = [f'name_svd_{i + 1}' for i in range(svd_features.shape[1])]
_nmf_columns = [f'name_nmf_{i + 1}' for i in range(nmf_features.shape[1])]
df_name_svd = pd.DataFrame(data=svd_features, columns=_svd_columns)
df_name_nmf = pd.DataFrame(data=nmf_features, columns=_nmf_columns)

# Show the first rows of each frame, SVD first.
display(df_name_svd.head())
display(df_name_nmf.head())

Unnamed: 0,name_svd_1,name_svd_2,name_svd_3,name_svd_4,name_svd_5,name_svd_6,name_svd_7,name_svd_8,name_svd_9,name_svd_10,...,name_svd_41,name_svd_42,name_svd_43,name_svd_44,name_svd_45,name_svd_46,name_svd_47,name_svd_48,name_svd_49,name_svd_50
0,0.095187,0.031978,-0.042307,0.10403,0.047375,0.021559,0.0082,0.012486,-0.023821,0.02688,...,-0.011494,0.032136,0.051379,0.062842,-0.050891,0.005841,-0.024622,0.004007,0.051061,-0.033168
1,0.15036,-0.107559,0.089944,0.10645,-0.057372,-0.084118,-0.006647,-0.053472,-0.113307,-0.129811,...,0.053653,0.047106,0.01202,0.032039,0.005947,0.029388,0.030427,-0.004934,0.013696,0.014055
2,0.183825,-0.024951,0.038542,-0.033043,-0.001392,-0.178047,0.057977,0.044514,0.065284,-0.026418,...,0.012248,-0.11065,0.038608,0.022863,0.021413,-0.029799,-0.012886,-0.049166,0.024616,-0.073765
3,0.44077,-0.172714,-0.209239,-0.138462,-0.037927,-0.079738,-0.069956,0.239671,0.051533,-0.25937,...,0.036328,-0.019215,0.112628,-0.017056,0.024349,-0.013463,0.006439,-0.022742,-0.013591,0.017317
4,0.11035,0.051847,0.044703,-0.089136,-0.144313,0.058196,-0.01391,-0.006677,-0.076028,-0.02645,...,-0.029885,-0.021146,-0.026566,0.105954,-0.13149,0.029993,-0.046088,0.149089,-0.110803,0.137139


Unnamed: 0,name_nmf_1,name_nmf_2,name_nmf_3,name_nmf_4,name_nmf_5,name_nmf_6,name_nmf_7,name_nmf_8,name_nmf_9,name_nmf_10,...,name_nmf_41,name_nmf_42,name_nmf_43,name_nmf_44,name_nmf_45,name_nmf_46,name_nmf_47,name_nmf_48,name_nmf_49,name_nmf_50
0,0.0,0.023941,0.0,0.023301,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.003259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00057
1,0.0,0.0,0.0,0.0,4.7e-05,0.0,0.0,0.038311,0.05405,0.000177,...,0.0,0.001156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000379,0.0,0.0,0.0,0.000424,0.0,0.044866,0.035054,0.001815,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.026034,0.031678,0.0,0.0,0.0,0.0,0.04746,0.04187,0.0,0.0,...,0.007228,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000614,...,0.003686,0.0,0.0,0.121278,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Place the four feature groups side by side: BERT, sentence encoder, SVD, NMF.
_feature_blocks = [df_name_emb, df_name_sentence_enc, df_name_svd, df_name_nmf]
df_name_features = pd.concat(_feature_blocks, axis='columns')
df_name_features.shape

(14986, 1380)

In [16]:
# Rows were stacked train-first, so the first len(df_train) rows are train and
# the remainder is test; each part gets a fresh 0-based index.
_n_train = df_train.shape[0]
df_train_name_features = df_name_features.iloc[:_n_train].reset_index(drop=True)
df_test_name_features = df_name_features.iloc[_n_train:].reset_index(drop=True)

In [17]:
# Report the shapes of the two feature tables, one per line.
print(
    f'df_train shape: {df_train_name_features.shape}',
    f'df_test shape: {df_test_name_features.shape}',
    sep='\n',
)

df_train shape: (9990, 1380)
df_test shape: (4996, 1380)


In [18]:
# Write both feature tables to CSV without the index column.
for _frame, _csv_path in (
    (df_train_name_features, 'input/train_data_name_features.csv'),
    (df_test_name_features, 'input/test_data_name_features.csv'),
):
    _frame.to_csv(_csv_path, index=False)