特徴量エンジニアリング(name列)
 - tfidf → SVD(50次元)
 - tfidf → NMF(50次元)
 - BERTを用いた特徴量抽出
 - Universal Sentence Encoderを用いた特徴量抽出

In [1]:
import os

from google.colab import drive

# Mount Google Drive so files stored there are reachable from this Colab session.
drive.mount('/content/drive')

# Use the competition folder as the working directory; later cells read and
# write relative paths such as 'input/train_data.csv'.
os.chdir('/content/drive/MyDrive/分析コンペ/05_ProbSpace/民泊サービスの宿泊料金予測/')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install tensorflow_text

In [9]:
import re
import scipy.sparse as sp
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import BertTokenizer
import warnings

import tensorflow as tf
import tensorflow_text
import tensorflow_hub as hub

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD, PCA, NMF
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import _document_frequency

from src.config import *
import src.preprocessing as pr

# Suppress all Python warnings for the rest of the session. This keeps cell
# output short but also hides library warnings raised by later cells.
warnings.filterwarnings('ignore')

In [13]:
# Column that holds the lower-cased, symbol-stripped version of the listing name.
COL_CLEAN_TEXT = "clean_name"

In [10]:
# Compiled once at definition time rather than inside the function, because
# remove_symbol is applied to every row of the name column.
_SYMBOL_PATTERN = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％★✣♪◎☆￫〜◇✋▲△⭐︎丨❤▶☀️※《》☕️✦♯♬♡]')


def remove_symbol(text):
    """Normalise "wi-fi" to "wifi" and replace symbol characters with spaces.

    Args:
        text: String to clean. Only the lower-case spelling "wi-fi" is
            rewritten, so the caller is expected to lower-case beforehand.

    Returns:
        The text with every "wi-fi" replaced by "wifi" and every character
        matched by ``_SYMBOL_PATTERN`` (ASCII punctuation plus a set of
        full-width and decorative symbols) replaced by a single space. Runs of
        spaces are not collapsed and the result is not stripped.
    """
    # Join "wi-fi" first; otherwise its hyphen would be replaced by a space
    # and the word would be split in two.
    normalised = text.replace('wi-fi', 'wifi')
    return _SYMBOL_PATTERN.sub(' ', normalised)

In [4]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector with a pretrained BERT model.

    The vector is the last-layer hidden state of the first token ([CLS]).
    """

    def __init__(self, model_name="bert-base-uncased", max_len=128):
        """Load tokenizer and model, and move the model to the GPU if one is available.

        Args:
            model_name: Model identifier passed to ``from_pretrained``.
            max_len: Number of token ids fed to the model per sentence; shorter
                inputs are padded and longer ones truncated to this length.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # The model is only used for inference; make evaluation mode explicit
        # so dropout is disabled.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Return the [CLS] hidden state of ``sentence`` as a 1-D NumPy array.

        Args:
            sentence: Text to encode.

        Returns:
            The last-layer hidden state of the first token, moved to the CPU.
            Its length is the model's hidden size.
        """
        token_ids = self.tokenizer.encode(sentence)

        if len(token_ids) > self.max_len:
            if self.max_len >= 2:
                # Keep the final special token ([SEP]) when truncating; a plain
                # slice would feed the model a sequence without its terminator.
                token_ids = token_ids[:self.max_len - 1] + token_ids[-1:]
            else:
                token_ids = token_ids[:self.max_len]

        # Pad with the tokenizer's own pad id instead of a hard-coded 0.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0
        n_pad = self.max_len - len(token_ids)
        input_ids = token_ids + [pad_id] * n_pad
        attention_mask = [1] * len(token_ids) + [0] * n_pad

        input_tensor = torch.tensor([input_ids], dtype=torch.long, device=self.device)
        mask_tensor = torch.tensor([attention_mask], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping the autograd
        # graph saves memory and time on every call.
        with torch.no_grad():
            bert_out = self.bert_model(input_ids=input_tensor, attention_mask=mask_tensor)

        # Batch item 0, token 0 is the [CLS] position.
        cls_vector = bert_out['last_hidden_state'][0][0]
        # .cpu() is a no-op on CPU tensors, so one code path serves both devices.
        return cls_vector.detach().cpu().numpy()

In [5]:
# Both files are read with the same options: COL_LAST_REVIEW is parsed as a
# date and column dtypes come from DICT_DTYPES (neither is defined in this
# notebook; presumably star-imported from src.config — TODO confirm).
csv_options = {'parse_dates': [COL_LAST_REVIEW], 'dtype': DICT_DTYPES}

df_train = pd.read_csv('input/train_data.csv', **csv_options)
df_test = pd.read_csv('input/test_data.csv', **csv_options)

In [6]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390


In [7]:
# Stack the name column of train and test (train rows first) under a fresh
# 0..n-1 index so the text features are computed once for all rows.
name_frames = [df_train[[COL_NAME]], df_test[[COL_NAME]]]
df_all_name = pd.concat(name_frames, axis=0).reset_index(drop=True)

In [14]:
# Start the cleaned-text column as the lower-cased name.
df_all_name[COL_CLEAN_TEXT] = df_all_name[COL_NAME].str.lower()

In [15]:
# Replace symbols in the lower-cased names with spaces (and "wi-fi" with "wifi").
df_all_name[COL_CLEAN_TEXT] = df_all_name[COL_CLEAN_TEXT].apply(remove_symbol)

In [16]:
# Show raw and cleaned names side by side to check the cleaning result.
df_all_name

Unnamed: 0,name,clean_name
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,kiyosumishirakawa 3min skytree wifi max4 tre...
1,Downtown Tokyo Iriya next to Ueno,downtown tokyo iriya next to ueno
2,"Japan Style,Private,Affordable,4min to Sta.",japan style private affordable 4min to sta
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,4 min to shinjuku sta by train 2 ppl wifi
4,LICENSED SHINJUKU HOUSE: Heart of the action!,licensed shinjuku house heart of the action
...,...,...
14981,Stylish Pad In Nishi Shinjuku - Free WiFi!,stylish pad in nishi shinjuku free wifi
14982,U-6 鶯谷 BasaeInn Uguisudani Tokyo,u 6 鶯谷 basaeinn uguisudani tokyo
14983,A convenient room! 〜SHINJUKU〜 【Pocket WiFi】,a convenient room shinjuku pocket wifi
14984,Shinjuku ShareHouse Dormitory1 with Balcony,shinjuku sharehouse dormitory1 with balcony


In [17]:
# Encode every cleaned name with multilingual BERT. Each row of `features` is
# the flattened vector returned by BSV.vectorize for one name.
BSV = BertSequenceVectorizer(model_name="bert-base-multilingual-uncased", max_len=64)
features = np.stack(
    [BSV.vectorize(clean_name).reshape(-1) for clean_name in df_all_name[COL_CLEAN_TEXT]]
)

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Wrap the current `features` matrix (the BERT vectors when cells run in order)
# in a DataFrame with 1-based column names: name_emb_1, name_emb_2, ...
emb_columns = [f'name_emb_{i + 1}' for i in range(features.shape[1])]
df_name_emb = pd.DataFrame(data=features, columns=emb_columns)

In [19]:
# Preview the first rows of the BERT embedding features.
df_name_emb.head()

Unnamed: 0,name_emb_1,name_emb_2,name_emb_3,name_emb_4,name_emb_5,name_emb_6,name_emb_7,name_emb_8,name_emb_9,name_emb_10,...,name_emb_759,name_emb_760,name_emb_761,name_emb_762,name_emb_763,name_emb_764,name_emb_765,name_emb_766,name_emb_767,name_emb_768
0,-0.036676,-0.049738,0.050754,-0.02562,-0.331241,0.013945,-0.067799,-0.020603,-1.920739,-0.024602,...,-0.00852,2.122303,-0.113638,-0.033221,0.078959,0.016577,-0.089147,-0.006365,-0.010549,-0.050055
1,0.069604,-0.127225,0.024435,-0.011773,-0.351301,0.142089,-0.040712,0.042105,-1.932358,-0.063492,...,-0.113498,2.09374,-0.07674,0.017098,0.146325,-0.094669,-0.218214,0.041695,0.007428,0.01232
2,-0.116778,-0.017075,0.004824,-0.087936,-0.345614,0.037181,-0.043724,-0.075973,-1.831782,-0.07497,...,0.113531,2.095039,-0.079617,-0.148137,0.08085,-0.025728,-0.097848,0.044289,-0.027747,-0.017916
3,-0.054566,-0.048791,0.066115,0.034154,-0.338604,0.081742,-0.155228,-0.137337,-1.803986,-0.045013,...,-0.069991,2.176922,-0.117207,-0.168948,0.041346,0.045783,-0.080928,0.068312,-0.186026,0.033761
4,-0.021847,0.017521,0.082993,-0.03214,-0.223052,0.050514,-0.031871,-0.007857,-1.833062,-0.029889,...,0.008656,2.055184,-0.093976,-0.035631,0.095189,-0.012796,-0.08965,0.086061,0.015116,-0.058719


In [20]:
# Register `progress_apply` on pandas objects so the loop below shows a progress bar.
tqdm.pandas()

# Multilingual Universal Sentence Encoder loaded from TF Hub.
embedder = hub.load(
    "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

# Embed the cleaned names one at a time and stack the flattened vectors row-wise.
# NOTE: this rebinds `features`, which the BERT cell above also assigns.
features = np.stack(
    df_all_name[COL_CLEAN_TEXT]
    .progress_apply(lambda text: embedder(text).numpy().reshape(-1))
    .values
)

100%|██████████| 14986/14986 [01:35<00:00, 156.52it/s]


In [21]:
# Wrap the current `features` matrix (the sentence-encoder vectors when cells
# run in order) in a DataFrame with 1-based column names.
# NOTE(review): the prefix 'name_entence_enc_' looks like a typo for
# 'name_sentence_enc_'. It is kept unchanged because these columns end up in
# the exported feature CSVs; rename only together with any consumers.
sentence_enc_columns = [f'name_entence_enc_{i + 1}' for i in range(features.shape[1])]
df_name_sentence_enc = pd.DataFrame(data=features, columns=sentence_enc_columns)

In [22]:
# Preview the first rows of the sentence-encoder features.
df_name_sentence_enc.head()

Unnamed: 0,name_entence_enc_1,name_entence_enc_2,name_entence_enc_3,name_entence_enc_4,name_entence_enc_5,name_entence_enc_6,name_entence_enc_7,name_entence_enc_8,name_entence_enc_9,name_entence_enc_10,...,name_entence_enc_503,name_entence_enc_504,name_entence_enc_505,name_entence_enc_506,name_entence_enc_507,name_entence_enc_508,name_entence_enc_509,name_entence_enc_510,name_entence_enc_511,name_entence_enc_512
0,-0.020817,-0.024469,0.009413,0.002318,-0.02266,0.066125,0.048065,-0.052013,-0.021806,0.09154,...,-0.060178,-0.020622,0.068885,0.062525,-0.05517,-0.016464,-0.014358,0.002798,0.012901,-0.019419
1,-0.016604,0.03226,0.021041,0.022471,-0.031093,0.037494,-0.038235,-0.012227,-0.071438,0.088568,...,-0.038681,0.007019,0.043597,0.022773,-0.045433,0.018652,0.015996,-0.02,-0.019207,-0.057829
2,-0.051247,0.014204,0.004854,-0.077494,-0.007085,0.031117,-0.07205,0.089297,-0.03022,0.087944,...,-0.055185,-0.026338,0.045335,0.055183,0.006401,-0.026624,0.067077,-0.050111,0.052876,-0.046238
3,-0.027279,-0.023753,0.003625,-0.025298,-0.074566,-0.002388,-0.062128,-0.033083,-0.001888,0.083536,...,-0.04772,-0.048412,0.057754,0.016479,-0.073248,0.050621,0.007236,-0.084796,0.065109,0.003366
4,0.043166,0.033854,0.02448,-0.020102,0.013765,0.037079,0.004255,0.002165,-0.02713,0.088773,...,-0.034981,-0.002184,-0.006184,0.055101,0.019394,0.013484,-0.021094,-0.028609,-0.008339,-0.058638


In [23]:
# TF-IDF over the names followed by a 50-component reduction, once with
# TruncatedSVD and once with NMF (both with random_state=0).
# NOTE(review): both pipelines are fitted on the raw COL_NAME column, not on
# COL_CLEAN_TEXT — confirm this is intended.
raw_names = df_all_name[COL_NAME]

tfidf_svd = Pipeline(steps=[
    ("TfidfVectorizer", TfidfVectorizer()),
    ('TruncatedSVD', TruncatedSVD(n_components=50, random_state=0)),
])
tfidf_nmf = Pipeline(steps=[
    ("TfidfVectorizer", TfidfVectorizer()),
    ('NMF', NMF(n_components=50, random_state=0)),
])

# Each pipeline is fitted on train and test names together.
svd_features = tfidf_svd.fit_transform(raw_names)
nmf_features = tfidf_nmf.fit_transform(raw_names)

In [24]:
# Name the reduced TF-IDF components with 1-based suffixes and wrap them in DataFrames.
svd_columns = [f'name_svd_{i + 1}' for i in range(svd_features.shape[1])]
nmf_columns = [f'name_nmf_{i + 1}' for i in range(nmf_features.shape[1])]

df_name_svd = pd.DataFrame(data=svd_features, columns=svd_columns)
df_name_nmf = pd.DataFrame(data=nmf_features, columns=nmf_columns)

In [25]:
# Preview the first rows of the NMF features.
df_name_nmf.head()

Unnamed: 0,name_nmf_1,name_nmf_2,name_nmf_3,name_nmf_4,name_nmf_5,name_nmf_6,name_nmf_7,name_nmf_8,name_nmf_9,name_nmf_10,...,name_nmf_41,name_nmf_42,name_nmf_43,name_nmf_44,name_nmf_45,name_nmf_46,name_nmf_47,name_nmf_48,name_nmf_49,name_nmf_50
0,0.0,0.022792,0.0,0.021503,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002185
1,0.0,0.0,0.0,0.0,0.0,0.03606,9.3e-05,0.0,0.000842,0.047238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001609
2,0.0,0.0,0.0,0.0,0.000113,0.033566,0.0,0.0,0.0,0.001728,...,0.0,0.0,0.055095,0.0,0.0,0.004798,0.0,0.0,0.0,0.0
3,0.021749,0.0,0.0,0.0,0.060049,0.034909,0.0,0.0,0.049411,0.0,...,0.0,0.004307,0.052743,0.0,0.0,0.16208,0.0,0.0,0.0,0.0
4,0.017115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002413,0.0,0.0,0.143528,0.0,0.0,0.001721,0.0,0.0


In [26]:
# Join all name-derived feature frames column-wise; they share the same row
# order because each was built from df_all_name.
feature_blocks = [df_name_emb, df_name_sentence_enc, df_name_svd, df_name_nmf]
df_name_features = pd.concat(feature_blocks, axis=1)
df_name_features.shape

(14986, 1380)

In [27]:
# The first len(df_train) rows belong to train and the rest to test, because
# train was placed first when the names were concatenated.
n_train = df_train.shape[0]
df_train_name_features = df_name_features.iloc[:n_train].reset_index(drop=True)
df_test_name_features = df_name_features.iloc[n_train:].reset_index(drop=True)

In [28]:
# Sanity check: print the shape of each feature frame after the split.
print(f'df_train shape: {df_train_name_features.shape}')
print(f'df_test shape: {df_test_name_features.shape}')

df_train shape: (9990, 1380)
df_test shape: (4996, 1380)


In [29]:
# Write the name features to CSV without the index column; paths are relative
# to the current working directory.
df_train_name_features.to_csv('input/train_data_name_features.csv', index=False)
df_test_name_features.to_csv('input/test_data_name_features.csv', index=False)