In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import logging
import sys
from contextlib import contextmanager
import time
import random
import math
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import gc
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline

2022-06-24 21:14:13.666935: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [2]:
# =========================
# Constant
# =========================
# Path of the training table; it is read with pd.read_csv in the Main section.
TRAIN_PATH = "../data/train.csv"
# Column name of the point-of-interest identifier. The IoU helpers further down
# (get_id2poi / get_poi2ids) use the same column name as a literal.
TARGET = "point_of_interest"

In [3]:
# =========================
# Settings
# =========================
exp = "101"
# Create the experiment output tree. makedirs builds the intermediate
# ex{exp}/ directory itself, and exist_ok=True keeps the cell re-runnable and
# also covers the case where ex{exp}/ already exists but model/ does not
# (the previous exists()-guard skipped creating model/ in that situation).
os.makedirs(f"../output/exp/ex{exp}/model", exist_ok=True)
LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
# Checkpoints are written as MODEL_PATH_BASE + f"_{epoch}.pth" by the training cell.
MODEL_PATH_BASE = f"../output/exp/ex{exp}/model/ex{exp}"
# Pretrained checkpoint name passed to AutoTokenizer / AutoModel.
MODEL_PATH = "xlm-roberta-large"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): val_fold, iters_to_accumulate and clip_grad_norm are not read by
# any cell visible in this notebook (clipping is commented out in the training
# loop) — confirm whether they are still needed.
val_fold = 0 
# config
SEED = 0
BATCH_SIZE = 64
iters_to_accumulate = 1
n_epochs = 5
max_len = 128          # tokenizer max_length for each (text, near_text) pair
weight_decay = 0.1
beta = (0.9, 0.98)     # AdamW betas
lr = 1e-5
num_warmup_steps_rate = 0.1  # fraction of all optimisation steps used for LR warm-up
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
clip_grad_norm = 1

In [4]:
# ===============
# Functions
# ===============
def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """(Re)configure the module-level ``LOGGER`` and return it.

    Relies on the module-level names ``LOGGER`` and ``FORMATTER`` being defined
    before the call.

    Args:
        out_file: Path of a log file; when ``None`` no file handler is added.
        stderr: When true, attach a handler that writes to ``sys.stderr``.
        stderr_level: Level threshold of the stderr handler.
        file_level: Level threshold of the file handler.

    Returns:
        The configured ``LOGGER`` object.
    """
    # Detach AND close any handlers left over from a previous call. Simply
    # overwriting LOGGER.handlers would drop old FileHandlers without closing
    # them, leaking an open file every time the cell is re-run.
    for old_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(old_handler)
        old_handler.close()

    # The logger itself must let through whatever the most verbose handler wants.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER

@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    On normal exit of the ``with`` body an INFO record
    ``[<name>] done in <seconds> s`` is written to the module-level ``LOGGER``.
    If the body raises, nothing is logged (the code after ``yield`` is skipped).
    """
    t0 = time.time()  # wall-clock start
    yield
    message = f'[{name}] done in {time.time() - t0:.0f} s'
    LOGGER.info(message)
    

def set_seed(seed: int = 42):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    CUDA), exports ``PYTHONHASHSEED`` and switches cuDNN to its deterministic,
    non-benchmarking mode.

    Args:
        seed: Value applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The generators are independent of each other, so the order is irrelevant.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [6]:
class FourSquareDataset(Dataset):
    """Dataset of (text, near_text) pairs plus numeric features.

    Each sample is tokenised on access as a sentence pair, padded/truncated to
    ``max_len``, and returned as a dict of tensors.

    Args:
        text: Indexable collection of first-segment strings.
        near_text: Indexable collection of second-segment strings (same length).
        num_features: Indexable collection of numeric feature rows.
        tokenizer: Callable invoked as ``tokenizer(text, near_text, ...)`` that
            returns a mapping with ``input_ids``, ``attention_mask`` and
            ``token_type_ids``.
        max_len: ``max_length`` handed to the tokenizer.
        labels: Optional indexable collection of targets; when given, every
            sample additionally carries a ``label`` entry.
    """

    # Tokenizer outputs that are converted to int64 tensors, in output order.
    _TOKEN_KEYS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, text, near_text,num_features, tokenizer, max_len,labels=None):
        self.text = text
        self.near_text = near_text
        self.num_features = num_features
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        encoded = self.tokenizer(
            self.text[item],
            self.near_text[item],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in self._TOKEN_KEYS
        }
        sample["num_feature"] = torch.tensor(self.num_features[item], dtype=torch.float32)
        # The label is appended last so the key order matches the labelled case.
        if self.labels is not None:
            sample["label"] = torch.tensor(self.labels[item], dtype=torch.float32)
        return sample
    
    
class bert_model(nn.Module):
    """Pair classifier: transformer encoder + small MLP over numeric features.

    The first-token hidden state of the encoder is projected to 128 dims, the
    numeric features are projected to 32 dims, and the concatenation is mapped
    to a single logit (no sigmoid is applied here; the training cell uses
    ``nn.BCEWithLogitsLoss``).

    Args:
        model_path: Checkpoint name/path for ``AutoModel.from_pretrained``.
            Defaults to the module-level ``MODEL_PATH``.
        num_feature_dim: Width of the numeric feature vector (default 2).
    """

    def __init__(self, model_path=None, num_feature_dim=2):
        super().__init__()
        self.model = AutoModel.from_pretrained(MODEL_PATH if model_path is None else model_path)
        # Read the encoder width from its config instead of hard-coding 1024,
        # so the head also fits other checkpoints (1024 for xlm-roberta-large).
        hidden_size = self.model.config.hidden_size

        # Sub-modules are created in the same order as before so that a seeded
        # run initialises the head weights identically.
        self.ln1 = nn.LayerNorm(hidden_size)
        self.linear1 = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2))

        self.linear2 = nn.Sequential(
            nn.Linear(num_feature_dim, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(0.2))

        self.linear3 = nn.Sequential(
            nn.Linear(128 + 32, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Return one logit per row.

        Args:
            ids: Token ids passed to the encoder.
            mask: Attention mask passed to the encoder.
            token_type_ids: Segment ids passed to the encoder.
            num_features: Numeric features; last dimension must equal
                ``num_feature_dim``.

        Returns:
            Tensor whose last dimension has size 1 (raw logits).
        """
        # First-token hidden state of the last layer (not the encoder's pooler output).
        encoded = self.model(ids, attention_mask=mask, token_type_ids=token_type_ids)
        text_repr = encoded['last_hidden_state'][:, 0, :]
        text_repr = self.linear1(self.ln1(text_repr))
        num_repr = self.linear2(num_features)
        fused = torch.cat([text_repr, num_repr], dim=-1)
        return self.linear3(fused)
    
    
def collate(d,train=True):
    """Trim a padded batch down to its longest real sequence.

    The cut-off is the largest per-row sum of ``attention_mask``; the three
    token tensors are sliced to that many columns.

    Args:
        d: Batch dict with ``input_ids``, ``attention_mask``, ``token_type_ids``,
            ``num_feature`` and, when ``train`` is true, ``label``.
        train: Whether to carry ``label`` over into the result.

    Returns:
        New dict with the sliced token tensors plus the untouched
        ``num_feature`` (and ``label`` in training mode).
    """
    # assumes padding sits at the end of each row, so the first `keep` columns
    # contain every attended token — TODO confirm tokenizer padding side
    keep = int(d["attention_mask"].sum(axis=1).max())

    batch = {
        key: d[key][:, :keep]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    if train:
        batch["label"] = d['label']
    batch["num_feature"] = d["num_feature"]
    return batch

In [7]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Build a lookup from each row's ``id`` to its ``point_of_interest``.

    If an id occurs more than once, the last row wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Build a lookup from each ``point_of_interest`` to the set of its ``id`` values."""
    ids_per_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_per_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, id2poi_map=None, poi2ids_map=None):
    """Mean intersection-over-union between predicted and true match sets.

    For every row, the true set is ``poi2ids_map[id2poi_map[id]]`` and the
    predicted set is the whitespace-separated ids in ``matches``; the row score
    is ``|true & pred| / |true | pred|``.

    Args:
        input_df: Frame with an ``id`` column and a ``matches`` column holding
            space-separated id strings.
        id2poi_map: Mapping id -> point of interest (see ``get_id2poi``).
            When omitted, the module-level ``id2poi`` is used, as before.
        poi2ids_map: Mapping point of interest -> set of ids (see
            ``get_poi2ids``). When omitted, the module-level ``poi2ids`` is used.

    Returns:
        The mean of the per-row scores.

    Raises:
        NameError: If a mapping is omitted and the corresponding module-level
            variable has not been defined.
        KeyError: If an id or point of interest is missing from the mappings.
    """
    # Explicit arguments remove the dependency on hidden notebook state; the
    # global fallback keeps existing ``get_score(df)`` calls working.
    if id2poi_map is None:
        id2poi_map = id2poi
    if poi2ids_map is None:
        poi2ids_map = poi2ids

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids_map[id2poi_map[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

In [8]:
# setup_logger() and timer() read the module-level names LOGGER and FORMATTER,
# so this cell has to run before either helper is called.
LOGGER = logging.getLogger()  # no name given -> the root logger
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Attach the default stderr handler plus a file handler writing to LOGGER_PATH.
setup_logger(out_file=LOGGER_PATH)

2022-06-24 21:14:25,698 - INFO - logger set up


<RootLogger root (DEBUG)>

In [9]:
# ============================
# Main
# ============================
# Raw training table; supplies the text and coordinate columns merged in below.
train = pd.read_csv(TRAIN_PATH)
# Candidate-pair tables written by experiments ex062 and ex063. The cells below
# read their `id`, `near_id` and `target` columns.
train_fold0 = pd.read_csv("../output/exp/ex062/ex062_pred.csv")
train_fold1 = pd.read_csv("../output/exp/ex063/ex063_pred.csv")

In [10]:
# Attach the attributes of both sides of every candidate pair: the `id` side
# gets the plain columns, the `near_id` side gets the same columns prefixed
# with "near_".
train_near = train.copy()
train_near.columns = [f"near_{i}" for i in train.columns]
use_cols = ["name","categories",'latitude', 'longitude','address','city','state']
near_use_cols = [f"near_{c}" for c in use_cols]
# NOTE: re-running this cell merges the columns a second time (pandas then
# adds _x/_y suffixes); restart from the loading cell before re-running.
train_fold0 = (
    train_fold0
    .merge(train[["id"] + use_cols], how="left", on="id")
    .merge(train_near[["near_id"] + near_use_cols], how="left", on="near_id")
)

In [11]:
# Same two left-joins as for train_fold0: attributes of the `id` side first,
# then the "near_"-prefixed attributes of the `near_id` side.
train_fold1 = (
    train_fold1
    .merge(train[["id"] + use_cols], how="left", on="id")
    .merge(train_near[["near_id"] + near_use_cols], how="left", on="near_id")
)

In [12]:
# Build the two text segments fed to the tokenizer.
def _join_text_fields(df, prefix=""):
    """Return one lower-cased string per row: name, categories, address, city, state.

    Args:
        df: Frame holding the five columns (optionally prefixed).
        prefix: Column-name prefix, e.g. "near_" for the neighbour side.

    Returns:
        Series of the five fields joined with single spaces.
    """
    fields = ("name", "categories", "address", "city", "state")
    # astype(str) also stringifies missing values.
    # NOTE(review): missing fields therefore appear as the literal text "nan" —
    # confirm this is intended.
    parts = [df[f"{prefix}{field}"].astype(str).str.lower() for field in fields]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + " " + part
    return joined


# Bug fix: the previous expressions had no separator between `categories` and
# `address` (it was lost at the line continuation), which fused the last
# category word with the first address word. Both folds are built by the same
# helper so they can never drift apart.
# NOTE(review): any inference code that feeds models trained on this text must
# build its inputs with the same separator — verify before reusing checkpoints.
train_fold0["text"] = _join_text_fields(train_fold0)
train_fold0["near_text"] = _join_text_fields(train_fold0, prefix="near_")
train_fold1["text"] = _join_text_fields(train_fold1)
train_fold1["near_text"] = _join_text_fields(train_fold1, prefix="near_")

In [14]:
# Standardise the coordinates of the `id` side with the mean/std of the full
# training table; the statistics are printed so they can be reused elsewhere.
for c in ("latitude", "longitude"):
    mean, std = train[c].mean(), train[c].std()
    train_fold0[f"{c}_sc"] = train_fold0[c].sub(mean).div(std)
    train_fold1[f"{c}_sc"] = train_fold1[c].sub(mean).div(std)
    print(c,"mean:",mean,"std",std)

latitude mean: 26.87459868745177 std 23.144740576788625
longitude mean: 20.70497351331466 std 82.6778436146614


In [15]:
# Numeric model inputs: the standardised coordinates created in the previous
# cell. The training cell passes train_all[num_cols] to the dataset as num_features.
num_cols = ['latitude_sc', 'longitude_sc']

In [19]:
# Stack both candidate-pair tables into one training frame with a fresh 0..n-1 index.
train_all = pd.concat([train_fold0, train_fold1], ignore_index=True)

In [20]:
# ================================
# train
# ================================
with timer("roberta"):
    set_seed(SEED)

    # Inputs: the two text segments, the numeric features and the binary target.
    train_text = train_all["text"].values
    train_near_text = train_all["near_text"].values
    train_num_features = train_all[num_cols].values
    train_labels = train_all["target"].values

    train_ = FourSquareDataset(train_text, train_near_text, train_num_features,
                               tokenizer, max_len, train_labels)

    # loader
    train_loader = DataLoader(dataset=train_, batch_size=BATCH_SIZE, shuffle=True,
                              pin_memory=True, num_workers=8)

    # model
    model = bert_model()
    model = model.to(device)

    # optimizer, scheduler
    # Parameters whose name contains one of the `no_decay` fragments are
    # excluded from weight decay.
    # NOTE(review): the LayerNorm layers of the custom head are named
    # ln1.* / linear{1,2,3}.<idx>.*, so only their biases match here and their
    # weights are still decayed — confirm this is intended.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # Both groups set weight_decay explicitly, so no optimizer-level default is needed.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=lr,
                      betas=beta,
                      )
    # One scheduler step per batch over the whole run; the first
    # num_warmup_steps_rate fraction of the steps is linear warm-up.
    num_train_optimization_steps = int(len(train_loader) * n_epochs)
    num_warmup_steps = int(num_train_optimization_steps * num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=num_warmup_steps,
                                                num_training_steps=num_train_optimization_steps)

    criterion = nn.BCEWithLogitsLoss()
    # Create the loss scaler once for the whole run. Re-creating it at the start
    # of every epoch threw away the scale factor it had adapted so far.
    scaler = GradScaler()

    # The loop length is governed by n_epochs alone. The former hard-coded
    # `if epoch == 4: break` was a no-op for n_epochs = 5 and would have cut
    # training short of the scheduler's horizon for any larger value.
    for epoch in range(n_epochs):
        print(f"============start epoch:{epoch}==============")
        model.train()
        for batch in tqdm(train_loader, total=len(train_loader)):
            batch = collate(batch)  # drop columns that are padding in every row
            ids = batch["input_ids"].to(device)
            mask = batch['attention_mask'].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            num_features = batch["num_feature"].to(device)
            # Add a trailing dim so the labels line up with the model's single-logit output.
            labels = batch['label'].to(device).unsqueeze(-1)

            optimizer.zero_grad()
            with autocast():
                output = model(ids, mask, token_type_ids, num_features)
                loss = criterion(output, labels)
            scaler.scale(loss).backward()
            # Gradient clipping is intentionally disabled. If it is enabled,
            # call scaler.unscale_(optimizer) first so the threshold applies to
            # the real (unscaled) gradients:
            #   scaler.unscale_(optimizer)
            #   torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

        # A checkpoint is written after every epoch (there is no validation in
        # this notebook, so no "best" model is selected).
        torch.save(model.state_dict(), MODEL_PATH_BASE + f"_{epoch}.pth")

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




100%|██████████| 66309/66309 [4:04:53<00:00,  4.51it/s]  




100%|██████████| 66309/66309 [4:05:01<00:00,  4.51it/s]  




100%|██████████| 66309/66309 [4:04:57<00:00,  4.51it/s]  




100%|██████████| 66309/66309 [4:03:27<00:00,  4.54it/s]  




100%|██████████| 66309/66309 [4:05:20<00:00,  4.50it/s]  
2022-06-25 17:39:56,771 - INFO - [roberta] done in 73473 s
