In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset

2022-05-18 07:42:33.609187: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# =========================
# Constant
# =========================
# CSV that is read into the `train` DataFrame; the path is relative to the
# notebook's working directory.
TRAIN_PATH = "../data/train.csv"

In [3]:
# =========================
# Settings
# =========================
# Experiment id; it is interpolated into the output directory ("../output/fe/fe{fe}").
fe = "029"
# Length (in tokens) that every name is padded / truncated to by the tokenizer.
MAX_LEN = 32
# Base batch size; the inference DataLoader is built with BS * 2.
BS = 128
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Checkpoint name passed to both AutoTokenizer and AutoModel.
BERT_MODEL = 'bert-base-multilingual-uncased'

In [4]:
# Create the output tree for this experiment.
# Creating the deepest directory also creates its parents, and `exist_ok=True`
# makes the cell safe to re-run. It also repairs a half-created tree: the old
# `if not exists(parent)` guard skipped creating `model/` whenever the parent
# directory was already present, which made the later np.save calls fail.
os.makedirs(f"../output/fe/fe{fe}/model", exist_ok=True)

In [5]:
# =========================
# Dataset and model classes
# =========================
class BertDataset(Dataset):
    """Map-style dataset that tokenizes one text per index.

    Args:
        text: Indexable, sized collection of texts; each entry is passed
            through ``str`` before tokenization.
        tokenizer: Callable invoked with the text plus ``max_length``,
            ``padding``, ``truncation``, ``return_attention_mask`` and
            ``return_token_type_ids``. Its result is indexed by
            "input_ids", "attention_mask" and "token_type_ids".
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, text, tokenizer, max_len):
        self.text, self.tokenizer, self.max_len = text, tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the text at ``item`` and return its fields as long tensors."""
        encoded = self.tokenizer(
            str(self.text[item]),
            max_length=self.max_len,
            padding="max_length",  # pad every sample to max_length
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        # One int64 tensor per tokenizer field, in a fixed key order.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ("input_ids", "attention_mask", "token_type_ids")
        }
    
class bert_model(nn.Module):
    """Encoder wrapper that returns one L2-normalized vector per sequence.

    The checkpoint named by ``BERT_MODEL`` is loaded with
    ``AutoModel.from_pretrained``.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(BERT_MODEL)

    def forward(self, ids, mask):
        """Pool the encoder output over the sequence axis and normalize it.

        Args:
            ids: Token ids handed to the wrapped model.
            mask: Attention mask; passed to the model and also used to zero
                out masked positions before pooling.

        Returns:
            The result of ``F.normalize`` applied to the mean over axis 1 of
            the masked token states, with position 0 excluded.
        """
        # This is manual pooling over element 0 of the model result, not the
        # model's own pooler output (presumably the last hidden state of shape
        # (batch, seq, hidden) — TODO confirm).
        token_states = self.model(ids, attention_mask=mask)[0]
        # Drop position 0 (presumably the [CLS] token) and zero the positions
        # whose mask value is 0.
        kept = token_states[:, 1:, :] * mask[:, 1:].unsqueeze(-1)
        # The mean divides by the padded length instead of the number of
        # unmasked tokens; that only rescales each row, and F.normalize
        # removes the scale again.
        return F.normalize(kept.mean(dim=1))

In [6]:
# ============================
# Main
# ============================
# Load the training table from TRAIN_PATH; only its "name" column is used
# by the embedding steps further down.
train = pd.read_csv(TRAIN_PATH)

In [7]:
# Tokenizer for the same checkpoint as the encoder (BERT_MODEL); the progress
# bars in the cell output show its files being downloaded.
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

In [8]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0,id,name,latitude,longitude,address,city,state,zip,country,url,phone,categories,point_of_interest
0,E_000001272c6c5d,Café Stad Oudenaarde,50.859975,3.634196,Abdijstraat,Nederename,Oost-Vlaanderen,9700.0,BE,,,Bars,P_677e840bb6fc7e
1,E_000002eae2a589,Carioca Manero,-22.907225,-43.178244,,,,,BR,,,Brazilian Restaurants,P_d82910d8382a83
2,E_000007f24ebc95,ร้านตัดผมการาเกด,13.780813,100.4849,,,,,TH,,,Salons / Barbershops,P_b1066599e78477
3,E_000008a8ba4f48,Turkcell,37.84451,27.844202,Adnan Menderes Bulvarı,,,,TR,,,Mobile Phone Shops,P_b2ed86905a4cd3
4,E_00001d92066153,Restaurante Casa Cofiño,43.338196,-4.326821,,Caviedes,Cantabria,,ES,,,Spanish Restaurants,P_809a884d4407fb


In [9]:
# Normalise the name column in one pass: cast every value to text, then
# lower-case it.
# NOTE(review): astype(str) presumably turns missing names into the literal
# text "nan" — confirm that embedding such rows is intended.
train["name"] = train["name"].astype(str).str.lower()

In [10]:
# Number of distinct lower-cased names, i.e. how many texts get embedded.
train["name"].nunique()

817309

In [11]:
# Distinct names; each one is embedded once instead of once per row.
name = train["name"].unique()

In [12]:
# Persist the names so that row i of the embedding matrix saved at the end of
# the notebook can be matched back to name[i].
# NOTE(review): if `name` is an object-dtype array, loading this file will
# presumably need np.load(..., allow_pickle=True) — confirm.
np.save(f"../output/fe/fe{fe}/model/name.npy",name)

In [13]:
# Wrap the unique names in a dataset and batch them in their original order
# (no shuffling), so the embeddings line up with `name`.
train_ = BertDataset(text=name, tokenizer=tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(train_, batch_size=2 * BS, shuffle=False)

In [14]:
# Embed every unique name with the pretrained encoder and collect the
# per-batch results as float32 numpy arrays in `val_preds`.
model = bert_model().to(device)
model.eval()  # evaluation mode: this cell only runs inference
val_preds = []
with torch.no_grad():
    for batch in tqdm(train_loader, total=len(train_loader)):
        # Only ids and mask are fed to the model; the token_type_ids that the
        # dataset also yields are not used by bert_model.forward.
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        output = model(input_ids, mask)
        # No autograd graph is recorded under no_grad, so the tensor can be
        # moved to host memory without an explicit detach().
        val_preds.append(output.cpu().numpy().astype(np.float32))

Downloading:   0%|          | 0.00/641M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 3193/3193 [03:22<00:00, 15.75it/s]


In [15]:
# Stack the per-batch arrays into one (n_names, hidden) matrix.
# The isinstance guard makes re-running this cell a no-op: calling
# np.concatenate on the already-stacked 2-D array would join its rows into a
# single flat vector and silently corrupt the saved embeddings.
if isinstance(val_preds, list):
    val_preds = np.concatenate(val_preds)
# Every saved name must have exactly one embedding row.
assert val_preds.ndim == 2 and val_preds.shape[0] == len(name), \
    "embedding rows do not line up with the saved names"

In [16]:
# Save the embeddings next to name.npy; rows follow the order of the saved
# names because the DataLoader was built with shuffle=False.
np.save(f"../output/fe/fe{fe}/model/bert_base_multilingual_embedding.npy",val_preds)