In [None]:
# Show the GPU(s) visible to this runtime (shell escape; informational only).
!nvidia-smi

In [None]:
# Install Hugging Face transformers; pip's stdout is discarded to keep the cell output short.
# NOTE(review): the version is not pinned, so results may vary between environments.
!pip install transformers > /dev/null

In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import re
from transformers import BertTokenizer
from tqdm import tqdm
# Register tqdm with pandas so that Series/DataFrame gain `progress_apply`,
# which the feature-extraction loop uses to show a progress bar.
tqdm.pandas()

In [None]:
# Alternative locations, disabled (these look like Google Drive mount paths — confirm before re-enabling):
#DATADIR = "drive/MyDrive/atma10/input/"
#OUTPUTDIR = "drive/MyDrive/atma10/feature/"
# Directory that train.csv and test.csv are read from.
DATADIR = "../input/"
# Directory the translated-text CSVs are read from and the BERT feature CSVs are written to.
OUTPUTDIR = "../feature/"

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Return a copy of ``df`` whose numeric columns use the smallest fitting dtype.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    downcast according to their observed minimum and maximum; every other column
    is passed through unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : bool, default True
        When true, print the resulting memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns (same order) and index as ``df``.

    Notes
    -----
    * Float columns may be narrowed to float16, which keeps only about three
      significant decimal digits. This lossy behaviour is inherited from the
      original implementation and is kept on purpose.
    * A column is never dropped and never widened: if no narrower dtype can hold
      its values (or it has no non-missing values) it keeps its current dtype.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered from narrowest to widest; the first that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2

    new_dtypes = {}
    for col in df.columns:  # decide the target dtype column by column
        col_type = df[col].dtype
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no usable range; leave them untouched
        # rather than dropping them or forcing a wider dtype.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if str(col_type).startswith('int'):
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                new_dtypes[col] = candidate
                break
        # No match (e.g. +/-inf in a float column): keep the current dtype.

    # astype returns a new frame; fall back to a plain copy when nothing changes
    # so that frames without any column are handled as well.
    df_out = df.astype(new_dtypes) if new_dtypes else df.copy()

    if verbose:
        end_mem = df_out.memory_usage().sum() / 1024**2
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Mem. usage decreased to {end_mem:.2f}Mb:  {reduction:.1f}% reduction')
    return df_out

In [None]:
def remove_pad_token(s):
    """Strip every ``<pad>`` marker from ``s`` and drop one leading space.

    Parameters
    ----------
    s : str or missing value
        Text to clean. ``None``, float NaN, ``pd.NA`` and similar scalar
        missing markers are accepted.

    Returns
    -------
    str or float
        The cleaned text, or ``np.nan`` when ``s`` is missing.

    Raises
    ------
    TypeError
        If ``s`` is neither a string nor a missing value.
    """
    if not isinstance(s, str):
        # Test for missingness by value rather than by identity with np.nan:
        # a NaN is not guaranteed to be the np.nan singleton.
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return np.nan
        raise TypeError(f"expected str or a missing value, got {type(s).__name__}")

    clean_s = s.replace("<pad>", "")
    # startswith is safe on the empty string, unlike indexing position 0.
    if clean_s.startswith(" "):
        clean_s = clean_s[1:]
    return clean_s

In [None]:
# Read the train and test tables, then stack them row-wise into a single frame
# with a fresh 0..n-1 index so both splits can be processed together.
train_data, test_data = (
    pd.read_csv(DATADIR + file_name) for file_name in ("train.csv", "test.csv")
)

all_df = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
class BertSequenceVectorizer:
    """Turn a sentence into a fixed-size vector using a pretrained BERT model.

    The vector is the final-layer hidden state at position 0 (the [CLS] token).

    Parameters
    ----------
    model_name : str, default 'bert-base-cased'
        Checkpoint name passed to ``from_pretrained`` for both tokenizer and
        model (other options tried previously: uncased,
        "bert-base-multilingual-cased").
    max_len : int, default 128
        Number of token ids fed to the model; longer inputs are truncated and
        shorter ones are padded.
    """

    def __init__(self, model_name='bert-base-cased', max_len=128):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model_name = model_name
        self.tokenizer = BertTokenizer.from_pretrained(self.model_name)
        self.bert_model = transformers.BertModel.from_pretrained(self.model_name)
        self.bert_model = self.bert_model.to(self.device)
        # Inference only. from_pretrained is documented to return the model in
        # eval mode already; this makes the intent explicit.
        self.bert_model.eval()
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.ndarray:
        """Encode ``sentence`` and return its [CLS] embedding.

        Parameters
        ----------
        sentence : str
            Text to embed.

        Returns
        -------
        numpy.ndarray
            1-D array on the CPU holding the last hidden state at position 0
            (768 values for the bert-base checkpoints).
        """
        # Let the tokenizer truncate so the closing special token is kept,
        # instead of slicing it off the end of the id list.
        token_ids = self.tokenizer.encode(
            sentence, max_length=self.max_len, truncation=True
        )
        token_ids = token_ids[:self.max_len]  # defensive: never exceed max_len

        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0  # value the original code hard-coded
        n_pad = self.max_len - len(token_ids)

        inputs = token_ids + [pad_id] * n_pad
        masks = [1] * len(token_ids) + [0] * n_pad  # 1 = real token, 0 = padding

        inputs_tensor = torch.tensor([inputs], dtype=torch.long, device=self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long, device=self.device)

        # No gradients are needed for feature extraction; skipping autograd
        # avoids keeping the computation graph alive for every sentence.
        with torch.no_grad():
            bert_out = self.bert_model(input_ids=inputs_tensor,
                                       attention_mask=masks_tensor)
        seq_out = bert_out['last_hidden_state']

        # Batch item 0, position 0 is the [CLS] token. .cpu() is a no-op when
        # the tensor already lives on the CPU, so one path serves both devices.
        return seq_out[0][0].detach().cpu().numpy()

In [None]:
# Create the shared vectorizer instance; constructing it loads the tokenizer and model.
BSV = BertSequenceVectorizer() # instantiate

In [None]:
# For each translated text column: attach it to all_df, clean it, embed it
# with BERT and write the embedding matrix to its own CSV.
for c in ("transed_long_title", "transed_description"):
    # The per-column CSV is joined on object_id and is expected to supply column `c`.
    _df = pd.read_csv(OUTPUTDIR + f"{c}.csv")
    all_df = all_df.merge(_df, how="left", on="object_id")

    # Drop "<pad>" markers, then replace missing text with a placeholder string
    # so that every row can be passed to the vectorizer.
    all_df[c] = all_df[c].apply(remove_pad_token).fillna("NaN")

    # One embedding vector per row, kept in an object column of all_df.
    all_df[f'{c}_feature'] = all_df[c].progress_apply(BSV.vectorize)

    # Stack the per-row vectors into a 2-D array (rows x embedding dims).
    arr = np.array(all_df[f"{c}_feature"].values.tolist())

    # Column name without its first "_"-separated part, e.g. "long_title".
    suffix = c.split("_", 1)[1]
    feature_cols = pd.DataFrame(arr).add_prefix(f"nl_enBERT_{suffix}")
    df_out = pd.concat([all_df["object_id"], feature_cols], axis=1)

    # Shrink dtypes before writing the features next to their object_id.
    df_out = reduce_mem_usage(df_out)
    df_out.to_csv(OUTPUTDIR+f"nl_en_BERT_{c}.csv", index=False)

In [None]:
#df_out.to_csv(OUTPUTDIR+"BERT_description.csv", index=False)
#df_out.to_csv(OUTPUTDIR+"BERT_multi_description.csv", index=False)