In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score,accuracy_score,recall_score,precision_score
import logging
import sys
from contextlib import contextmanager
import time
import random
import math
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import gc
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline

2022-07-07 20:43:12.066053: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [2]:
# =========================
# Constants
# =========================
# Name of the label column in the training table.
TARGET = "point_of_interest"
# Location of the raw training CSV, relative to the notebook directory.
TRAIN_PATH = "../data/train.csv"

In [3]:
# =========================
# Settings
# =========================
exp = "113"
# Create the experiment directory and its model/ sub-directory in one call.
# exist_ok=True keeps this cell re-runnable and also repairs the case where
# the experiment directory exists but model/ does not (the previous
# `if not os.path.exists(...)` guard skipped creating model/ in that case).
os.makedirs(f"../output/exp/ex{exp}/model", exist_ok=True)
LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
MODEL_PATH_BASE = f"../output/exp/ex{exp}/model/ex{exp}"
# Hugging Face model identifier used for both the tokenizer and the backbone.
MODEL_PATH = "xlm-roberta-large"
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

val_fold = 0 
# config
SEED = 0
BATCH_SIZE = 64
iters_to_accumulate = 1
n_epochs = 5
max_len = 128
weight_decay = 0.1
beta = (0.9, 0.98)
lr = 1e-5
num_warmup_steps_rate = 0.1
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
clip_grad_norm = 1

In [4]:
# ===============
# Functions
# ===============
def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """(Re)configure the module-level ``LOGGER`` and return it.

    Any handlers already attached to ``LOGGER`` are detached and closed, then
    new ones are installed using the module-level ``FORMATTER``.

    Args:
        out_file: Path of a log file; when not ``None`` a ``FileHandler`` is
            attached at ``file_level``.
        stderr: When true, attach a ``StreamHandler`` on ``sys.stderr`` at
            ``stderr_level``.
        stderr_level: Level for the stderr handler.
        file_level: Level for the file handler.

    Returns:
        The configured ``LOGGER`` object.
    """
    # Close the handlers we are replacing. Simply rebinding
    # ``LOGGER.handlers = []`` would leave a previous FileHandler's file
    # open, leaking one descriptor every time this function is re-run.
    for old_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(old_handler)
        old_handler.close()

    # The logger itself must let through the most verbose of the two levels;
    # each handler then applies its own threshold.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER

@contextmanager
def timer(name):
    """Context manager that logs the wall-clock duration of the wrapped block.

    On normal exit from the ``with`` body, an INFO message of the form
    ``[<name>] done in <seconds> s`` is written to the module-level ``LOGGER``.
    If the body raises, nothing is logged.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    

def set_seed(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to its deterministic,
    non-benchmarking configuration.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same four generators as before, seeded in the same order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [6]:
class FourSquareDataset(Dataset):
    """Dataset of (text, near_text) pairs with numeric side features.

    Each item is tokenized on access as a two-segment input and returned as a
    dict of tensors. When ``labels`` is provided the dict additionally
    carries a float ``"label"`` entry.
    """

    # Tokenizer outputs that are converted to long tensors, in output order.
    _TOKEN_FIELDS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, text, near_text,num_features, tokenizer, max_len,labels=None):
        """Store the indexable inputs; nothing is tokenized up front.

        Args:
            text: Indexable collection of first-segment strings.
            near_text: Indexable collection of second-segment strings.
            num_features: Indexable collection of numeric feature rows.
            tokenizer: Callable applied to each (text, near_text) pair.
            max_len: Length every encoded pair is padded/truncated to.
            labels: Optional indexable collection of targets.
        """
        self.text = text
        self.near_text = near_text
        self.num_features = num_features
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels

    def __len__(self):
        """Number of pairs, taken from ``text``."""
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize pair ``item`` and return its tensors as a dict."""
        encoded = self.tokenizer(
            self.text[item],
            self.near_text[item],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        sample = {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in self._TOKEN_FIELDS
        }
        sample["num_feature"] = torch.tensor(self.num_features[item], dtype=torch.float32)

        # The label is appended last and only when targets were supplied.
        if self.labels is not None:
            sample["label"] = torch.tensor(self.labels[item], dtype=torch.float32)
        return sample
    
    
class bert_model(nn.Module):
    """Transformer backbone plus a small head over text and numeric features.

    The backbone is loaded from ``MODEL_PATH``. Its hidden state at sequence
    position 0 is layer-normalised and projected to 128 dims, the two numeric
    features are projected to 32 dims, and the concatenation is mapped to a
    single output value per example.
    """

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(MODEL_PATH)

        # Text branch: 1024-d backbone vector -> 128-d.
        self.ln1 = nn.LayerNorm(1024)
        self.linear1 = nn.Sequential(
            nn.Linear(1024, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Numeric branch: 2 input features -> 32-d.
        self.linear2 = nn.Sequential(
            nn.Linear(2, 32),
            nn.LayerNorm(32),
            nn.ReLU(),
            nn.Dropout(0.2),
        )

        # Fusion head over the concatenated (128 + 32)-d vector -> 1 value.
        self.linear3 = nn.Sequential(
            nn.Linear(128 + 32, 64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Return the head output for a batch (no sigmoid is applied here)."""
        backbone_out = self.model(ids, attention_mask=mask, token_type_ids=token_type_ids)
        # Use the hidden state of the first sequence position as the summary.
        first_position = backbone_out['last_hidden_state'][:, 0, :]

        text_feat = self.linear1(self.ln1(first_position))
        num_feat = self.linear2(num_features)

        fused = torch.cat([text_feat, num_feat], dim=-1)
        return self.linear3(fused)
    
    
def collate(d,train=True):
    """Trim a batch's token tensors to the longest attended length in it.

    The cut-off is the largest per-row sum of ``attention_mask``; the three
    token tensors are sliced to that many leading columns. ``num_feature`` is
    passed through unchanged, and ``label`` is included only when ``train``
    is true.
    """
    longest = int(d["attention_mask"].sum(axis=1).max())

    batch = {
        key: d[key][:, :longest]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    if train:
        batch["label"] = d["label"]
    batch["num_feature"] = d["num_feature"]
    return batch

In [7]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every ``id`` to the ``point_of_interest`` found on the same row.

    If an id occurs more than once, the value from its last row is kept.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {place_id: poi for place_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` to the set of ``id`` values sharing it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, id2poi: dict = None, poi2ids: dict = None):
    """Mean intersection-over-union between predicted and true match sets.

    For each row, the true set is ``poi2ids[id2poi[id]]`` and the predicted
    set is the whitespace-separated ids in the ``matches`` column.

    Args:
        input_df: Frame with an ``id`` column and a ``matches`` column of
            space-separated id strings.
        id2poi: Mapping id -> point of interest (see ``get_id2poi``). When
            omitted, the module-level global ``id2poi`` is used, which is
            what this function relied on implicitly before.
        poi2ids: Mapping point of interest -> set of ids (see
            ``get_poi2ids``). When omitted, the module-level global
            ``poi2ids`` is used.

    Returns:
        The mean of the per-row IoU scores.

    Raises:
        NameError: If a mapping is neither passed in nor defined globally.
    """
    # Prefer explicit arguments; fall back to the notebook globals only for
    # backward compatibility with existing zero-extra-argument call sites.
    module_scope = globals()
    if id2poi is None:
        if "id2poi" not in module_scope:
            raise NameError("name 'id2poi' is not defined; pass id2poi=... to get_score")
        id2poi = module_scope["id2poi"]
    if poi2ids is None:
        if "poi2ids" not in module_scope:
            raise NameError("name 'poi2ids' is not defined; pass poi2ids=... to get_score")
        poi2ids = module_scope["poi2ids"]

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

In [8]:
# Root logger and shared formatter; both are read as globals by
# setup_logger() and timer(), so they must exist before those are called.
LOGGER = logging.getLogger()
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Attach the stderr handler and a file handler writing to LOGGER_PATH.
setup_logger(out_file=LOGGER_PATH)

2022-07-07 20:43:23,812 - INFO - logger set up


<RootLogger root (DEBUG)>

In [9]:
# ============================
# Main
# ============================
# Raw training table.
train = pd.read_csv(TRAIN_PATH)
# Pair table for fold 0 (read below with "id", "near_id" and "target" columns).
# NOTE(review): the experiment id "113" is hard-coded here although `exp`
# holds the same value — confirm whether this path should follow `exp`.
train_fold0_pp = pd.read_csv("../output/exp/ex113/ex113_pp_new_pair_fold0.csv")

In [11]:
# Attach place attributes to every pair: once for the row referenced by "id"
# and once, with a "near_" column prefix, for the row referenced by "near_id".
use_cols = ["name","categories",'latitude', 'longitude','address','city','state']
near_use_cols = ["near_" + col for col in use_cols]

# Copy of `train` whose columns all carry the "near_" prefix.
train_near = train.copy()
train_near.columns = ["near_" + str(col) for col in train.columns]

train_fold0_pp = (
    train_fold0_pp
    .merge(train[["id"] + use_cols], how="left", on="id")
    .merge(train_near[["near_id"] + near_use_cols], how="left", on="near_id")
)

In [12]:
# Build the two text inputs fed to the tokenizer.
def _build_pair_text(df, prefix=""):
    """Concatenate the lower-cased place fields of ``df`` into one string column.

    Fields are read from the columns ``<prefix>name``, ``<prefix>categories``,
    ``<prefix>address``, ``<prefix>city`` and ``<prefix>state``; each is cast
    to ``str`` first, so missing values become the text produced by that cast.
    """
    name, categories, address, city, state = (
        df[f"{prefix}{col}"].astype(str).str.lower()
        for col in ("name", "categories", "address", "city", "state")
    )
    # NOTE(review): there is intentionally no separator between categories and
    # address — this reproduces the existing text format exactly. Presumably
    # it has to match the format used when the loaded checkpoint was trained;
    # confirm before "fixing" it.
    return name + " " + categories + address + " " + city + " " + state


train_fold0_pp["text"] = _build_pair_text(train_fold0_pp)
train_fold0_pp["near_text"] = _build_pair_text(train_fold0_pp, prefix="near_")

2022-07-07 20:43:44,572 - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-07-07 20:43:44,572 - INFO - NumExpr defaulting to 8 threads.


In [13]:
# Standard-scale the coordinates: mean/std are computed on the full `train`
# table and applied to the "latitude"/"longitude" columns of the pair table,
# producing "latitude_sc" and "longitude_sc".
for c in ["latitude","longitude"]:
    mean = train[c].mean()
    std = train[c].std()
    train_fold0_pp[f"{c}_sc"] = (train_fold0_pp[c] - mean) / std
    # Print the statistics so the same scaling can be reproduced elsewhere.
    print(c,"mean:",mean,"std",std)

latitude mean: 26.87459868745177 std 23.144740576788625
longitude mean: 20.70497351331466 std 82.6778436146614


In [14]:
# Numeric feature columns passed to the model alongside the text; two
# columns, matching the nn.Linear(2, 32) input of bert_model.linear2.
num_cols = ['latitude_sc', 'longitude_sc']

In [18]:
# ================================
# inference on the fold-0 pairs
# ================================
# No training happens in this cell: a saved checkpoint is loaded and used to
# score every row of train_fold0_pp.
with timer("roberta"):
    set_seed(SEED)
    val_text,val_near_text, val_num_features, val_labels = \
    train_fold0_pp["text"].values, train_fold0_pp["near_text"].values,train_fold0_pp[num_cols].values, train_fold0_pp["target"].values
    val_ = FourSquareDataset(val_text,val_near_text,val_num_features,
                             tokenizer,max_len,val_labels)
        
    # shuffle=False keeps predictions aligned with the rows of train_fold0_pp.
    val_loader = DataLoader(dataset=val_, batch_size=BATCH_SIZE*2, shuffle = False , pin_memory=True,num_workers=8)
        
    # model
    model = bert_model()
    # map_location="cpu": deserialize the weights onto the CPU regardless of
    # the device they were saved from, so loading also works when `device`
    # fell back to "cpu" and so no extra copy of the weights is held on the
    # GPU. The model is moved to `device` right after.
    model.load_state_dict(torch.load("../output/exp/ex070/model/ex070_0.pth", map_location="cpu"))
    model = model.to(device)

    val_preds = []
    model.eval()  # switch model to the evaluation mode
    with torch.no_grad():  
        for d in tqdm(val_loader,total=len(val_loader)):
            # =========================
            # data loader
            # =========================
            # Trim padding to the longest sequence in this batch.
            d = collate(d)
            ids = d["input_ids"].to(device)
            mask = d['attention_mask'].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            num_features = d["num_feature"].to(device)
            with autocast():
                outputs = model(ids,mask,token_type_ids,num_features)
            # The model returns raw scores; sigmoid turns them into probabilities.
            val_preds.append(outputs.sigmoid().detach().cpu().numpy())
    val_preds = np.concatenate(val_preds,axis=0)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2782/2782 [03:20<00:00, 13.89it/s]
2022-07-07 20:47:45,759 - INFO - [roberta] done in 235 s


In [19]:
# Persist the raw prediction array for this experiment.
np.save(f"../output/exp/ex{exp}/oof.npy",val_preds)

In [20]:
# Flatten the predictions to 1-D and store them next to the pairs
# (the loader above was not shuffled, so row order matches).
train_fold0_pp["pred"] = val_preds.reshape(-1)

In [21]:
# Sum of the "target" column divided by the number of pairs.
train_fold0_pp["target"].sum()/len(train_fold0_pp)

0.6090026230502649

In [22]:
# Accuracy when a pair is called positive for pred > 0.02.
# NOTE(review): 0.02 is an unexplained magic threshold — confirm where it
# comes from and consider naming it in the settings cell.
accuracy_score(train_fold0_pp["target"],train_fold0_pp["pred"] > 0.02)

0.7846234209742918