In [1]:
# =========================
# Library
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm
import sys, os
from transformers import DistilBertModel, DistilBertTokenizer,AutoModel,AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup
from torch.cuda.amp import autocast, GradScaler
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import logging
import sys
from contextlib import contextmanager
import time
import random
import math
import cudf, cuml, cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors
import gc
import transformers
transformers.logging.set_verbosity_error()
%env TOKENIZERS_PARALLELISM=true
%matplotlib inline

2022-07-02 11:49:00.595268: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [2]:
from torch_ema import ExponentialMovingAverage

In [3]:
# =========================
# Constant
# =========================
# Training CSV, given relative to the notebook's working directory.
TRAIN_PATH = "../data/train.csv"
# Column passed to GroupKFold (as both y and groups) in get_comp_score.
TARGET = "point_of_interest"

In [4]:
# =========================
# Settings
# =========================
exp = "115"
# Create the experiment folder and its model/ sub-folder idempotently.
# Creating the nested path also creates the parent, and exist_ok=True makes it
# a no-op on re-runs. Previously model/ was only created when the experiment
# folder itself was missing, so a pre-existing folder without model/ made the
# EMA checkpoint save fail late in training.
os.makedirs(f"../output/exp/ex{exp}/model", exist_ok=True)
LOGGER_PATH = f"../output/exp/ex{exp}/ex_{exp}.txt"
# Prefix for the raw checkpoints (saved directly in the experiment folder).
MODEL_PATH_SAVE = f"../output/exp/ex{exp}/ex{exp}"
# Prefix for the EMA checkpoints (saved in the model/ sub-folder).
MODEL_PATH_BASE = f"../output/exp/ex{exp}/model/ex{exp}"
# Hugging Face identifier of the pretrained backbone and tokenizer.
MODEL_PATH = "microsoft/mdeberta-v3-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fold index that get_comp_score treats as the validation fold.
val_fold = 0
# config
debug = False
SEED = 0
BATCH_SIZE = 64
if debug:
    BATCH_SIZE = 16
# NOTE(review): iters_to_accumulate is not referenced by the training loop
# visible in this notebook.
iters_to_accumulate = 1
n_epochs = 6
# Maximum tokenized length of a (text, near_text) pair.
max_len = 128
weight_decay = 0.1
# AdamW betas.
beta = (0.9, 0.98)
# "cosine" or "linear"; selects the warmup schedule in the training cell.
scheduler_type = "cosine"
# Learning rate of the pretrained backbone.
lr = 1e-5
# "transformer" enables TransformerHead; any other value uses the first-token
# embedding of the backbone.
head_type = "linear"
# Learning rate of the freshly initialised head layers.
lr_head = 1e-4
# Fraction of the total optimisation steps used for warmup.
num_warmup_steps_rate = 0.1
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE(review): only used by a commented-out clip_grad_norm_ call in the
# training loop.
clip_grad_norm = 1
# First epoch (0-based) in which the FGM adversarial step is applied.
fgm_start_epoch = 2
# Decay passed to ExponentialMovingAverage.
ema_decay = 0.999

In [6]:
# ===============
# Functions
# ===============
def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """Reset the module-level LOGGER and attach fresh handlers to it.

    Args:
        out_file: Path of a log file; no file handler is added when None.
        stderr: Whether to attach a handler writing to ``sys.stderr``.
        stderr_level: Level of the stderr handler.
        file_level: Level of the file handler.

    Returns:
        The configured module-level ``LOGGER``.

    Relies on the module-level ``LOGGER`` and ``FORMATTER`` objects.
    """
    def _attach(handler, level):
        # Same formatter for every handler, then register it on LOGGER.
        handler.setFormatter(FORMATTER)
        handler.setLevel(level)
        LOGGER.addHandler(handler)

    # Drop whatever handlers were registered before, so re-running the cell
    # does not duplicate log lines.
    LOGGER.handlers = []
    # The logger itself must let through the most verbose of the two levels.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        _attach(logging.StreamHandler(sys.stderr), stderr_level)

    if out_file is not None:
        _attach(logging.FileHandler(out_file), file_level)

    LOGGER.info("logger set up")
    return LOGGER

@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    Args:
        name: Label placed in square brackets in the log message.

    The message is emitted through the module-level ``LOGGER`` after the block
    finishes; nothing is logged if the block raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    

def set_seed(seed: int = 42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Value used for every random number generator.
    """
    # Same seeding order as before: random, numpy, hash seed, torch, torch.cuda.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [7]:
class FourSquareDataset(Dataset):
    """Dataset of (text, near_text) pairs with per-pair numeric features.

    Each item is tokenized on the fly as a sentence pair, padded or truncated
    to ``max_len``, and returned as a dict of tensors.
    """

    def __init__(self, text, near_text,num_features, tokenizer, max_len,labels=None):
        """Store the parallel, indexable inputs.

        Args:
            text: Indexable of strings for the first element of each pair.
            near_text: Indexable of strings for the second element.
            num_features: Indexable of numeric feature rows.
            tokenizer: Callable used to encode ``(text, near_text)``.
            max_len: Length every encoded pair is padded/truncated to.
            labels: Optional indexable of targets; when None, items carry no
                "label" entry.
        """
        self.text = text
        self.near_text = near_text
        self.num_features = num_features
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = labels

    def __len__(self):
        """Number of pairs (length of ``text``)."""
        return len(self.text)

    def __getitem__(self, item):
        """Encode pair ``item`` and return its tensors.

        Returns:
            Dict with "input_ids", "attention_mask", "token_type_ids" (long),
            "num_feature" (float32) and, when labels were given, "label"
            (float32).
        """
        first = self.text[item]
        second = self.near_text[item]
        encoded = self.tokenizer(
            first, second,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        sample["num_feature"] = torch.tensor(self.num_features[item], dtype=torch.float32)

        # The label is appended last so the key order matches the labelled layout.
        if self.labels is not None:
            sample["label"] = torch.tensor(self.labels[item], dtype=torch.float32)
        return sample
    
    
class TransformerHead(nn.Module):
    """Transformer-encoder head that reduces each token to one scalar.

    The input is the backbone's ``last_hidden_state``, laid out
    ``(batch, seq, in_features)``. The output is ``(batch, max_length)``: one
    value per token position, right-padded with zeros when the sequence is
    shorter than ``max_length``.

    Args:
        in_features: Hidden size of the incoming token embeddings.
        max_length: Width the per-token scores are padded to; exposed as
            ``out_features``.
        num_layers: Number of encoder layers.
        nhead: Number of attention heads per layer.
    """

    def __init__(self, in_features, max_length=max_len, num_layers=1, nhead=8):
        super().__init__()

        self.transformer = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(d_model=in_features, nhead=nhead),
            num_layers=num_layers,
        )
        self.row_fc = nn.Linear(in_features, 1)
        self.out_features = max_length

    def forward(self, x):
        """Score every token of ``x`` and pad the scores to ``out_features``.

        Args:
            x: Tensor laid out ``(batch, seq, in_features)``.

        Returns:
            Tensor of shape ``(batch, out_features)``.
        """
        # The encoder layer is built with the default sequence-first layout,
        # i.e. it expects (seq, batch, feature). Feeding the batch-first
        # backbone output directly made self-attention run across the samples
        # of the batch instead of across tokens. Transpose in and out so that
        # attention is computed along the sequence axis.
        out = self.transformer(x.transpose(0, 1)).transpose(0, 1)
        out = self.row_fc(out).squeeze(-1)
        # Right-pad the sequence axis with zeros up to out_features.
        p1d = (0, self.out_features - out.shape[-1])
        out = F.pad(out, p1d, "constant", 0)
        return out

    
class FGM():
    """Fast Gradient Method adversarial perturbation of embedding weights.

    Usage per training step: run backward on the clean loss, call ``attack()``
    to shift the embedding weights along their gradient, run forward/backward
    on the perturbed model, then call ``restore()`` before the optimizer step.
    """

    def __init__(self, model):
        self.model = model
        # name -> copy of the unperturbed parameter data, filled by attack().
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        """Back up and perturb every trainable parameter whose name contains ``emb_name``.

        Args:
            epsilon: L2 norm of the perturbation added to each parameter.
            emb_name: Substring identifying the embedding parameters.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                # Always back up, so restore() finds an entry for every
                # parameter it visits, even when no perturbation is applied.
                self.backup[name] = param.data.clone()
                if param.grad is None:
                    # No backward pass has reached this parameter yet.
                    continue
                norm = torch.norm(param.grad)
                # Skip zero and NaN norms; dividing by them would corrupt the
                # weights.
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        """Put back the weights saved by the last ``attack()`` call.

        Args:
            emb_name: Same substring that was passed to ``attack()``.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        # Clear the backup only after every parameter has been restored. This
        # used to run inside the loop, wiping the backup after the first
        # parameter, so restoring only worked when the embedding happened to
        # be the first parameter of the model.
        self.backup = {}
        
    
class bert_model(nn.Module):
    """Pretrained text encoder combined with an MLP over numeric pair features.

    The text branch encodes the tokenized pair with the backbone loaded from
    ``MODEL_PATH``. The numeric branch projects the hand-crafted features. Both
    128-d projections are concatenated and mapped to a single logit.

    Args:
        num_feature_dim: Width of the numeric feature vector. Defaults to 90,
            the value that was previously hard-coded.

    Reads the module-level ``MODEL_PATH``, ``head_type`` and ``max_len``.
    """

    def __init__(self, num_feature_dim=90):
        super(bert_model, self).__init__()
        self.model = AutoModel.from_pretrained(MODEL_PATH)
        self.head_type = head_type
        # Take the hidden size from the loaded backbone instead of assuming
        # 768, so that changing MODEL_PATH does not silently break the head.
        hidden_size = self.model.config.hidden_size
        encoder_feature_size = hidden_size
        if self.head_type == "transformer":
            self.transformer_head = TransformerHead(
                    in_features=hidden_size,
                    max_length=max_len,
                    num_layers=1,
                    nhead=8,
                )
            # The transformer head emits one score per token position.
            encoder_feature_size = self.transformer_head.out_features
        self.ln1 = nn.LayerNorm(encoder_feature_size)
        # Text branch projection.
        self.linear1 = nn.Sequential(
            nn.Linear(encoder_feature_size,128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2))

        # Numeric-feature branch projection.
        self.linear2 = nn.Sequential(
            nn.Linear(num_feature_dim,128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.2))

        # Fusion of both branches down to one logit.
        self.linear3 = nn.Sequential(
            nn.Linear(128 + 128,64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64,1),
           )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Compute one match logit per pair.

        Args:
            ids: Token ids passed to the backbone.
            mask: Attention mask passed to the backbone.
            token_type_ids: Segment ids passed to the backbone.
            num_features: Numeric features, last dimension ``num_feature_dim``.

        Returns:
            Tensor whose last dimension is 1 (raw logits, no sigmoid applied).
        """
        hidden = self.model(ids, attention_mask=mask,token_type_ids=token_type_ids)['last_hidden_state']
        if self.head_type == "transformer":
            out = self.transformer_head(hidden)
        else:
            # Use the embedding of the first token as the pair representation.
            out = hidden[:,0,:]
        out =  self.ln1(out)
        out = self.linear1(out)
        out2 = self.linear2(num_features)
        out = torch.cat([out,out2],axis=-1)
        out = self.linear3(out)
        return out
    
    
def collate(d,train=True):
    """Trim a padded batch to the longest real sequence it contains.

    Args:
        d: Batch dict with "input_ids", "attention_mask", "token_type_ids",
            "num_feature" and, when ``train`` is true, "label".
        train: Whether to carry the "label" entry over to the result.

    Returns:
        New dict in which the three token tensors are cut along dim 1 to the
        largest attention-mask sum in the batch; other entries pass through.
    """
    # Longest unpadded sequence in this batch.
    mask_len = int(d["attention_mask"].sum(axis=1).max())

    batch = {
        key: d[key][:, :mask_len]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    if train:
        batch["label"] = d['label']
    batch["num_feature"] = d["num_feature"]
    return batch

In [8]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map every ``id`` to its ``point_of_interest``.

    When an id occurs more than once, the value of its last row wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map every ``point_of_interest`` to the set of ``id`` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame, id2poi_map=None, poi2ids_map=None):
    """Mean intersection-over-union between predicted and true match sets.

    Args:
        input_df: Frame with an ``id`` column and a ``matches`` column holding
            whitespace-separated predicted ids.
        id2poi_map: Optional mapping id -> point of interest (see
            ``get_id2poi``).
        poi2ids_map: Optional mapping point of interest -> set of ids (see
            ``get_poi2ids``).

    Returns:
        Mean over rows of ``|targets & preds| / |targets | preds|``.

    The function used to read ``id2poi`` and ``poi2ids`` as globals, which this
    notebook never defines at module level. The mappings can now be passed
    explicitly; when omitted, the old global lookup is kept so existing
    callers behave exactly as before.
    """
    if id2poi_map is None:
        id2poi_map = id2poi  # legacy fallback: module-level global
    if poi2ids_map is None:
        poi2ids_map = poi2ids  # legacy fallback: module-level global

    scores = []
    for id_str, matches in zip(input_df['id'].to_numpy(), input_df['matches'].to_numpy()):
        targets = poi2ids_map[id2poi_map[id_str]]
        preds = set(matches.split())
        scores.append(len(targets & preds) / len(targets | preds))
    return np.array(scores).mean()

def join(df):
    """Return the elements of ``df`` as strings joined by single spaces."""
    return " ".join(map(str, df))
    
def get_comp_score(val,pred):
    """Mean IoU of the predicted match sets on the validation fold.

    Args:
        val: Candidate-pair frame; its ``id`` and ``near_id`` columns are used.
        pred: Per-pair scores, assigned to ``val`` as a ``pred`` column
            (presumably one value per row of ``val`` -- confirm at call site).

    Returns:
        Mean over ids of ``|targets & preds| / |targets | preds|``.

    Reads the module-level ``TRAIN_PATH``, ``TARGET`` and ``val_fold``; the
    training CSV is re-read on every call.
    """
    # Rebuild the 2-fold GroupKFold split (grouped by TARGET) and record each
    # row's fold index in a "set" column.
    train_raw = pd.read_csv(TRAIN_PATH)
    kf = GroupKFold(n_splits=2)
    for i, (trn_idx, val_idx) in enumerate(kf.split(train_raw, train_raw[TARGET],
                                                    train_raw[TARGET])):
        train_raw.loc[val_idx, "set"] = i
    # Keep only the pairs predicted as matches (score >= 0.5).
    val_ = val.copy()
    val_["pred"] = pred
    val_tr_ = val_[val_["pred"] >= 0.5].reset_index(drop=True)
    #del val_tr
    gc.collect()
    # Ids belonging to the fold selected by the global val_fold.
    val_id = train_raw[train_raw["set"] == val_fold]["id"].unique()
    #del val_
    gc.collect()
    # Every validation id matches itself.
    val_id_match = pd.DataFrame()
    val_id_match["id"] = val_id
    val_id_match["near_id"] = val_id
    val_all = pd.concat([val_id_match,val_tr_[["id","near_id"]]]).reset_index(drop=True)
    #val_all = val_all[["id","near_id"]].reset_index(drop=True)
    # Make the relation symmetric by appending the reversed pairs, then
    # de-duplicate.
    val_all_ = val_all.copy()
    val_all_.columns = ["near_id","id"]
    val_all = pd.concat([val_all,val_all_]).reset_index(drop=True)
    val_all = val_all.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
    del val_all_
    gc.collect()
    # Attach the ground-truth POI of each id. The lookup tables are built from
    # val_all, so target sets only contain ids that appear in val_all.
    val_all = val_all.merge(train_raw[["id","point_of_interest"]],how="left",on="id").reset_index(drop=True)
    id2poi = get_id2poi(val_all)
    poi2ids = get_poi2ids(val_all)
    # One row per id with its predicted matches as a space-separated string.
    docs = val_all.groupby("id")["near_id"].apply(join)
    docs = docs.reset_index()
    docs.columns = ["id","matches"]
    # Per-id intersection over union between true and predicted match sets.
    scores = []
    for id_str, matches in zip(docs['id'].to_numpy(), docs['matches'].to_numpy()):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        score = len((targets & preds)) / len((targets | preds))
        scores.append(score)
    scores = np.array(scores)
    return scores.mean()

In [9]:
# getLogger() without a name returns the root logger; setup_logger and timer
# use it through the module-level LOGGER name.
LOGGER = logging.getLogger()
# Shared format for the stderr and file handlers: timestamp, level, message.
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Log to stderr and to the experiment's log file.
setup_logger(out_file=LOGGER_PATH)

2022-07-02 11:49:07,000 - INFO - logger set up


<RootLogger root (DEBUG)>

In [10]:
# ============================
# Main
# ============================
# Raw training table; used below for the latitude/longitude statistics.
train = pd.read_csv(TRAIN_PATH)
# Candidate-pair tables written by experiments ex062 and ex063, one per fold
# (presumably with pre-computed similarity features -- confirm against those
# experiments).
train_fold0 = pd.read_csv("../output/exp/ex062/ex062_fe.csv")
train_fold1 = pd.read_csv("../output/exp/ex063/ex063_fe.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
# Debug mode: keep only the first 1000 pairs of each fold so the notebook runs
# end-to-end quickly.
if debug:
    train_fold0 = train_fold0[:1000]
    train_fold1 = train_fold1[:1000]

In [12]:
# ===============================================================================
# Spatial location features
# ===============================================================================

# ===============================================================================
# Get manhattan distance
# ===============================================================================
def manhattan(lat1, long1, lat2, long2):
    """L1 distance between two coordinate pairs, in the units of the inputs.

    Works element-wise on scalars, arrays or Series.
    """
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long2 - long1)
    return lat_gap + long_gap

# ===============================================================================
# Get haversine distance
# ===============================================================================
def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Haversine great-circle distance on a unit sphere (result in radians).

    Note the argument order: both latitudes first, then both longitudes.
    Inputs are in degrees and may be scalars, arrays or Series.
    """
    # Unit sphere; the original code notes 6371 as the alternative radius.
    radius = 1

    sin_half_dlat = np.sin(np.radians(lats2 - lats1) / 2)
    sin_half_dlon = np.sin(np.radians(longs2 - longs1) / 2)
    cos_lat1 = np.cos(np.radians(lats1))
    cos_lat2 = np.cos(np.radians(lats2))

    # Haversine term; operand order kept so floating-point results match.
    a = sin_half_dlat * sin_half_dlat + cos_lat1 * cos_lat2 * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * central_angle

# ===============================================================================
# Compute distances + Euclidean
# ===============================================================================
def add_lat_lon_distance_features(df):
    """Add coordinate-difference and distance features to ``df`` in place.

    Expects ``latitude``, ``longitude``, ``near_latitude`` and
    ``near_longitude`` columns. Adds ``latdiff``, ``londiff``, ``manhattan``,
    ``euclidean``, ``haversine``, unit-vector components ``x``/``y``/``z`` and
    ``near_x``/``near_y``/``near_z``, and their dot product ``dot``. Finally
    every float64 column of the frame is downcast to float32.

    Returns:
        The same ``df`` object, modified.

    NOTE(review): the vector uses y = sin(lat)*cos(lon) and z = sin(lon),
    which is not the usual (cos lat cos lon, cos lat sin lon, sin lat)
    convention. The result is still a unit vector, but ``dot`` is then not the
    cosine of the great-circle angle. Confirm this is intended before changing
    it, since models trained on these features depend on it.
    """
    lat_a, lon_a = df['latitude'], df['longitude']
    lat_b, lon_b = df['near_latitude'], df['near_longitude']

    df['latdiff'] = lat_a - lat_b
    df['londiff'] = lon_a - lon_b
    df['manhattan'] = manhattan(lat_a, lon_a, lat_b, lon_b)
    df['euclidean'] = (df['latdiff'] ** 2 + df['londiff'] ** 2) ** 0.5
    df['haversine'] = vectorized_haversine(lat_a, lat_b, lon_a, lon_b)

    # Same three components for the point itself ("") and for its neighbour
    # ("near_"), added in the order x, y, z, near_x, near_y, near_z.
    for prefix in ("", "near_"):
        lat_rad = np.radians(df[prefix + "latitude"])
        lon_rad = np.radians(df[prefix + "longitude"])
        df[prefix + "x"] = np.cos(lat_rad) * np.cos(lon_rad)
        df[prefix + "y"] = np.sin(lat_rad) * np.cos(lon_rad)
        df[prefix + "z"] = np.sin(lon_rad)
    df["dot"] = df["x"] * df["near_x"] + df["y"] * df["near_y"] + df["z"] * df["near_z"]

    # Halve the memory footprint of all double-precision columns.
    float64_cols = list(df.dtypes[df.dtypes == np.float64].index)
    for name in float64_cols:
        df[name] = df[name].astype(np.float32)
    return df

In [13]:
# Add the coordinate/distance features to both folds. The function modifies
# the frame it receives and returns that same frame.
train_fold0 = add_lat_lon_distance_features(train_fold0)
train_fold1 = add_lat_lon_distance_features(train_fold1)

2022-07-02 11:50:53,615 - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-07-02 11:50:53,616 - INFO - NumExpr defaulting to 8 threads.


In [14]:
# Build the text inputs: lower-cased name, categories, address, city and state
# for the point ("text") and for its candidate neighbour ("near_text").
# astype(str) turns missing values into the literal string "nan".
# NOTE(review): there is no " " separator between categories and address, so
# those two fields are glued together. Confirm whether this is intended before
# changing it, because inference code must build the text the same way.
train_fold0["text"] = train_fold0['name'].astype(str).str.lower() + " " + train_fold0['categories'].astype(str).str.lower()+\
                      train_fold0['address'].astype(str).str.lower() + " " + train_fold0['city'].astype(str).str.lower() + " " + train_fold0['state'].astype(str).str.lower() 
train_fold0["near_text"] = train_fold0['near_name'].astype(str).str.lower() + " " + train_fold0['near_categories'].astype(str).str.lower()+\
                      train_fold0['near_address'].astype(str).str.lower() + " " + train_fold0['near_city'].astype(str).str.lower() + " " + train_fold0['near_state'].astype(str).str.lower() 

In [15]:
# Build the text inputs for the second fold, using the same recipe as for the
# first fold: lower-cased name, categories, address, city and state.
# NOTE(review): as for the first fold, categories and address are concatenated
# without a separating space; confirm whether this is intended.
train_fold1["text"] = train_fold1['name'].astype(str).str.lower() + " " + train_fold1['categories'].astype(str).str.lower()+\
                      train_fold1['address'].astype(str).str.lower() + " " + train_fold1['city'].astype(str).str.lower() + " " + train_fold1['state'].astype(str).str.lower() 
train_fold1["near_text"] = train_fold1['near_name'].astype(str).str.lower() + " " + train_fold1['near_categories'].astype(str).str.lower()+\
                      train_fold1['near_address'].astype(str).str.lower() + " " + train_fold1['near_city'].astype(str).str.lower() + " " + train_fold1['near_state'].astype(str).str.lower()

In [16]:
# Numeric feature columns fed to the model's numeric branch, in the order they
# appear in the feature matrix. The list has 90 entries (the normalisation
# loop reports 90 iterations), which must equal the input width of
# bert_model.linear2.
# The first two rows are coordinates and the features added by
# add_lat_lon_distance_features; the remaining columns are read from the
# pre-computed pair tables.
num_cols = ["latitude", "longitude", "near_latitude", "near_longitude",
           "latdiff", "londiff", "manhattan", "euclidean", "haversine",
           "x", "y", "z", "near_x", "near_y", "near_z", "dot",
           'name_gesh', 'name_leven', 'name_jaro',
           'address_gesh', 'address_leven', 'address_jaro', 'city_gesh',
           'city_leven', 'city_jaro', 'state_gesh', 'state_leven', 'state_jaro',
           'zip_gesh', 'zip_leven', 'zip_jaro', 'url_gesh', 'url_leven',
           'url_jaro', 'phone_gesh', 'phone_leven','phone_jaro', 'categories_gesh', 'categories_leven', 'categories_jaro',
           'distance', 'distance_rank', 'name_gesh_mean', 'name_gesh_max',
           'near_name_gesh_mean', 'near_name_gesh_max', 'name_gesh_mean_rate',
           'name_gesh_max_rate', 'near_name_gesh_mean_rate',
           'near_name_gesh_max_rate', 'name_leven_mean', 'name_leven_min',
           'near_name_leven_mean', 'near_name_leven_min', 'name_leven_mean_rate',
           'name_leven_min_rate', 'near_name_leven_mean_rate',
           'near_name_leven_min_rate', 'name_jaro_mean', 'name_jaro_max',
           'near_name_jaro_mean', 'near_name_jaro_max', 'name_jaro_mean_rate',
           'name_jaro_max_rate', 'near_name_jaro_mean_rate',
           'near_name_jaro_max_rate', 'categories_gesh_mean',
           'categories_gesh_max', 'near_categories_gesh_mean',
           'near_categories_gesh_max', 'categories_gesh_mean_rate',
           'categories_gesh_max_rate', 'near_categories_gesh_mean_rate',
           'near_categories_gesh_max_rate', 'categories_leven_mean',
           'categories_leven_min', 'near_categories_leven_mean',
           'near_categories_leven_min', 'categories_leven_mean_rate',
           'categories_leven_min_rate', 'near_categories_leven_mean_rate',
           'near_categories_leven_min_rate', 'categories_jaro_mean',
           'categories_jaro_max', 'near_categories_jaro_mean',
           'near_categories_jaro_max','categories_jaro_mean_rate', 'categories_jaro_max_rate',
           'near_categories_jaro_mean_rate', 'near_categories_jaro_max_rate']

In [17]:
# Dense float32 feature matrices, one row per pair, with columns in num_cols
# order.
num_features_fold0 = train_fold0[num_cols].values.astype(np.float32)
num_features_fold1 = train_fold1[num_cols].values.astype(np.float32)

In [18]:
# Compute the [mean, std] used to standardise each numeric column.
# The statistics come from both folds stacked together; np.concatenate returns
# a new array, so the inf -> NaN replacement below does not touch the per-fold
# matrices.
num_features_concat = np.concatenate([num_features_fold0,num_features_fold1],axis=0)
mean_std_dict = {}   # column name -> [mean, std]
cols2num_dict = {}   # column name -> column index in the feature matrices
for n,c in tqdm(enumerate(num_cols)):
    if c in ["latitude","longitude"]:
        # Coordinates are standardised with the statistics of the raw
        # training table rather than of the pair table.
        mean = train[c].mean()
        std = train[c].std()
        mean_std_dict[c] = [mean,std]
    elif c in ["x", "y", "z", "near_x", "near_y", "near_z", "dot"]:
        # Unit-vector components and their dot product are left unscaled.
        mean = 0
        std = 1
        mean_std_dict[c] = [mean,std]
    else:
        # All remaining columns (including near_latitude / near_longitude):
        # ignore +/-inf and NaN when computing the statistics.
        num_features_concat[num_features_concat[:,n] == np.inf,n] = np.nan
        num_features_concat[num_features_concat[:,n] == -np.inf,n] = np.nan
        mean = np.nanmean(num_features_concat[:,n])
        std = np.nanstd(num_features_concat[:,n])
        mean_std_dict[c] = [mean,std]
    cols2num_dict[c] = n

90it [00:04, 18.06it/s]


In [19]:
# Display the per-column [mean, std] pairs for a visual sanity check.
mean_std_dict

{'latitude': [26.87459868745177, 23.144740576788625],
 'longitude': [20.70497351331466, 82.6778436146614],
 'near_latitude': [22.377329, 23.80125],
 'near_longitude': [47.604324, 72.81156],
 'latdiff': [0.0026245795, 0.60088676],
 'londiff': [-0.004257245, 1.7946571],
 'manhattan': [0.21769507, 2.1573222],
 'euclidean': [0.17776342, 1.8842192],
 'haversine': [0.002634386, 0.023196151],
 'x': [0, 1],
 'y': [0, 1],
 'z': [0, 1],
 'near_x': [0, 1],
 'near_y': [0, 1],
 'near_z': [0, 1],
 'dot': [0, 1],
 'name_gesh': [0.535247, 0.28312334],
 'name_leven': [12.289453, 8.717725],
 'name_jaro': [0.6814999, 0.278118],
 'address_gesh': [0.55750847, 0.32904968],
 'address_leven': [11.519652, 11.330601],
 'address_jaro': [0.68748957, 0.28175715],
 'city_gesh': [0.78845024, 0.33762273],
 'city_leven': [2.639476, 4.342476],
 'city_jaro': [0.8420279, 0.29122102],
 'state_gesh': [0.7989922, 0.3356451],
 'state_leven': [2.618497, 4.6914506],
 'state_jaro': [0.8407704, 0.2916656],
 'zip_gesh': [0.904034

In [20]:
# Standardise every column of both fold matrices in place with the statistics
# computed above. +/-inf is first turned into NaN so it can be zero-filled
# after scaling.
for n,c in tqdm(enumerate(num_cols)):
    num_features_fold0[num_features_fold0[:,n] == np.inf,n] = np.nan
    num_features_fold0[num_features_fold0[:,n] == -np.inf,n] = np.nan
    num_features_fold0[:,n] = (num_features_fold0[:,n] - mean_std_dict[c][0]) / (mean_std_dict[c][1]) 
    
    num_features_fold1[num_features_fold1[:,n] == np.inf,n] = np.nan
    num_features_fold1[num_features_fold1[:,n] == -np.inf,n] = np.nan
    num_features_fold1[:,n] = (num_features_fold1[:,n] - mean_std_dict[c][0]) / (mean_std_dict[c][1]) 
# Remaining NaNs become 0, i.e. the column mean after standardisation.
# NOTE(review): a column with std == 0 would yield inf/NaN in the division
# above; nan_to_num maps those to large finite values / 0.
num_features_fold0 = np.nan_to_num(num_features_fold0)
num_features_fold1 = np.nan_to_num(num_features_fold1)

90it [00:00, 96.93it/s] 


In [21]:
# Stack both folds (frames and feature matrices in the same order, so rows stay
# aligned); the model below is trained on all pairs.
train_all = pd.concat([train_fold0,train_fold1]).reset_index(drop=True)
num_features_all = np.concatenate([num_features_fold0,num_features_fold1],axis=0)

In [None]:
# ================================
# train
# ================================
# Train one model on all pairs (both folds). There is no validation inside this
# loop: checkpoints are written at the end of the later epochs.
with timer("mdeberta_base"):
    set_seed(SEED)
    train_text,train_near_text, train_num_features, train_labels = \
    train_all["text"].values, train_all["near_text"].values,num_features_all, train_all["target"].values
    train_ = FourSquareDataset(train_text,train_near_text,train_num_features,
                               tokenizer,max_len,train_labels)
        
    # loader
    train_loader = DataLoader(dataset=train_, batch_size=BATCH_SIZE, shuffle = True ,pin_memory=True,num_workers=8)
        
    # model
    model = bert_model()
    model = model.to(device)
    # model2 is never trained; it only receives the EMA weights so they can be
    # saved as a regular state_dict.
    model2 = bert_model()
    model2 = model2.to(device)
    ema = ExponentialMovingAverage(model.parameters(), decay=ema_decay)
    fgm = FGM(model)

    # optimizer, scheduler
    # NOTE(review): param_optimizer is not used below.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Three groups: backbone weights with decay, backbone bias/LayerNorm
    # without decay, and the head layers (every parameter whose name does not
    # contain "model") with their own learning rate.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay, "lr": lr},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0, "lr": lr},
        {'params': [p for n, p in model.named_parameters() if "model" not in n], 'weight_decay': weight_decay, "lr": lr_head}, # head params
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=lr,
                      betas=beta,
                      weight_decay=weight_decay,
                      )
    # One scheduler step per batch; warmup covers a fixed fraction of all steps.
    num_train_optimization_steps = int(len(train_loader) * n_epochs)
    num_warmup_steps = int(num_train_optimization_steps * num_warmup_steps_rate)
    if scheduler_type == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
                    optimizer,
                    num_warmup_steps=num_warmup_steps,
                    num_training_steps=num_train_optimization_steps,
                    num_cycles=0.5,
                )
    elif scheduler_type == "linear":
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=num_warmup_steps,
                                                    num_training_steps=num_train_optimization_steps)

    # The model outputs raw logits; the sigmoid is part of the loss.
    criterion = nn.BCEWithLogitsLoss()
    for epoch in range(n_epochs):
        print(f"============start epoch:{epoch}==============")
        model.train() 
        # NOTE(review): train_losses_batch is never appended to.
        train_losses_batch = []
        for i, d in tqdm(enumerate(train_loader),total=len(train_loader)):
            # Trim padding to the longest sequence in the batch.
            d = collate(d)
            ids = d["input_ids"].to(device)
            mask = d['attention_mask'].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            labels = d['label'].to(device)
            num_features = d["num_feature"].to(device)
            # Add a trailing dimension so labels match the model output.
            labels = labels.unsqueeze(-1)
            optimizer.zero_grad()
            output = model(ids,mask,token_type_ids,num_features)
            loss = criterion(output, labels)
            loss.backward()
            #torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            if epoch >= fgm_start_epoch:
                # Adversarial step: perturb the word embeddings along the
                # clean-loss gradient, accumulate the gradients of the
                # perturbed loss on top of the clean ones, then restore the
                # original embedding weights before the optimizer step.
                fgm.attack()
                output_adv = model(ids,mask,token_type_ids,num_features)
                loss_adv = criterion(output_adv, labels)
                loss_adv.backward() 
                fgm.restore() 
            optimizer.step()
            scheduler.step()
            # Update the moving average after every optimizer step.
            ema.update()
        # Checkpoints are written from epoch index 3 onwards (no model
        # selection takes place).
        if epoch >= 3:
            torch.save(model.state_dict(), MODEL_PATH_SAVE + f"_{epoch}.pth") # raw weights at the end of this epoch
            ema.copy_to(model2.parameters())
            torch.save(model2.state_dict(), MODEL_PATH_BASE + f"_{epoch}_ema.pth") # EMA weights at the end of this epoch



100%|██████████| 66309/66309 [2:49:38<00:00,  6.51it/s]  




100%|██████████| 66309/66309 [2:49:52<00:00,  6.51it/s]  




100%|██████████| 66309/66309 [5:08:24<00:00,  3.58it/s]  




100%|██████████| 66309/66309 [5:06:31<00:00,  3.61it/s]  




100%|██████████| 66309/66309 [5:07:16<00:00,  3.60it/s]  




 72%|███████▏  | 47542/66309 [3:39:03<1:28:38,  3.53it/s]