In [1]:
import shutil
from pathlib import Path

# Overwrite files inside the installed `transformers` package with the patched
# copies shipped in `input_dir` (fast tokenizer support for DeBERTa v2/v3).
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")
input_dir = Path("../deberta-v2-3-fast-tokenizer")

# 1) The slow->fast conversion module lives at the package root.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name
if conversion_path.exists():
    conversion_path.unlink()
shutil.copy(convert_file, transformers_path)

# 2) The tokenizer modules live in models/deberta_v2.
deberta_v2_path = transformers_path / "models" / "deberta_v2"
for filename in (
    "tokenization_deberta_v2.py",
    "tokenization_deberta_v2_fast.py",
    "deberta__init__.py",
):
    # "deberta__init__.py" is installed as "__init__.py"; other names are kept.
    target_name = filename.replace("deberta", "") if filename.startswith("deberta") else filename
    filepath = deberta_v2_path / target_name
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [2]:
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup,AdamW,AutoModel,AutoConfig
%env TOKENIZERS_PARALLELISM=true

from transformers.models.deberta_v2 import DebertaV2TokenizerFast

tokenizers.__version__: 0.11.0
transformers.__version__: 4.16.2


2022-04-28 12:09:48.815774: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


env: TOKENIZERS_PARALLELISM=true


In [3]:
# ========================================
# library
# ========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.metrics import mean_squared_error
%matplotlib inline
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler
import logging
from ast import literal_eval
import sys
from contextlib import contextmanager
import time
import random
from tqdm import tqdm
import os
import ast
import itertools
from sklearn.metrics import f1_score,recall_score
from torch.nn import Parameter
import torch.nn.functional as F
import pickle

In [4]:
# ==================
# Constant
# ==================

# Input CSV locations, relative to the notebook's working directory.
FEATURES_PATH = "../data/features.csv"
PATIENT_NOTES_PATH = "../data/patient_notes.csv"
TRAIN_PATH = "../data/train.csv"
TEST_PATH = "../data/test.csv"
#PRETRAIN_MODEL_PATH = "../output/pretrained/microsoft-deberta-v2-xlarge/microsoft-deberta-v2-xlarge-mlm-epoch-10.bin"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
# ===============
# Configs
# ===============
# Experiment id; every output path below is derived from it.
ex = "051"
# Create the experiment tree. makedirs creates the parent as well, and
# exist_ok=True makes the cell safe to re-run and also repairs a tree in which
# only the parent directory exists (the old guard skipped the model directory
# in that case).
os.makedirs(f"../output/exp/ex{ex}/ex{ex}_model", exist_ok=True)

OUTPUT_DIR = f"../output/exp/ex{ex}"
MODEL_PATH_BASE = f"../output/exp/ex{ex}/ex{ex}_model/ex{ex}"
LOGGER_PATH = f"../output/exp/ex{ex}/ex{ex}.txt"

# Hyper-parameters of experiment ex051.
SEED = 0
N_SPLITS = 5
SHUFFLE = True
num_workers = 4
BATCH_SIZE = 4
iters_to_accumulate = 1
n_epochs = 6
es_patience = 10
max_len = 512
weight_decay = 0.1
beta = (0.9, 0.98)
lr = 1e-5
num_warmup_steps_rate = 0.1
clip_grad_norm = 1.0

# Backbone checkpoint name and its (patched) fast tokenizer.
MODEL_PATH = "microsoft/deberta-v2-xlarge"
tokenizer = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# ===============
# Settings
# ===============
# Saved out-of-fold prediction arrays of three experiments (read with np.load).
oof_path1 = "../output/exp/ex038/ex038_oof.npy"
oof_path2 = "../output/exp/ex041/ex041_oof.npy"
oof_path3 = "../output/exp/ex051/ex051_oof.npy"
# CSV with a fourth model's predictions (read with pd.read_csv).
nakama_path = '../output/nakama/ex141/ex141.csv'

In [7]:
# ===============
# Functions
# ===============
def set_seed(seed: int = 42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Value applied to ``random``, NumPy, PyTorch (CPU and the current
            CUDA device) and exported as ``PYTHONHASHSEED``.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def setup_logger(out_file=None, stderr=True, stderr_level=logging.INFO, file_level=logging.DEBUG):
    """Reset the module-level ``LOGGER`` and attach fresh handlers.

    Args:
        out_file: Path of a log file; no file handler is added when ``None``.
        stderr: Whether to also log to ``sys.stderr``.
        stderr_level: Minimum level emitted on stderr.
        file_level: Minimum level written to ``out_file``.

    Returns:
        The configured ``LOGGER`` object.
    """
    # Detach AND close the previous handlers. Merely clearing the list (as the
    # old code did) leaves the log file of an earlier call open, so re-running
    # the cell leaked one file handle per run.
    for stale_handler in list(LOGGER.handlers):
        LOGGER.removeHandler(stale_handler)
        stale_handler.close()

    # The logger itself must let through whatever the most verbose handler wants.
    LOGGER.setLevel(min(stderr_level, file_level))

    if stderr:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(FORMATTER)
        handler.setLevel(stderr_level)
        LOGGER.addHandler(handler)

    if out_file is not None:
        handler = logging.FileHandler(out_file)
        handler.setFormatter(FORMATTER)
        handler.setLevel(file_level)
        LOGGER.addHandler(handler)

    LOGGER.info("logger set up")
    return LOGGER


@contextmanager
def timer(name):
    """Context manager that logs how long the wrapped block took.

    Args:
        name: Label placed in square brackets at the start of the log line.

    The message is logged at INFO level after the block finishes normally.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    LOGGER.info(f'[{name}] done in {elapsed:.0f} s')
    
    
# getLogger() without a name returns the root logger; it is shared by
# setup_logger() and timer() above.
LOGGER = logging.getLogger()
FORMATTER = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# Log to stderr and to the experiment's log file.
setup_logger(out_file=LOGGER_PATH)

2022-04-28 12:09:52,614 - INFO - logger set up


<RootLogger root (DEBUG)>

In [8]:
# ====================================================
# Main
# ====================================================
train = pd.read_csv(TRAIN_PATH)
# 'annotation' and 'location' are stored as Python-literal strings; parse them
# into real Python objects.
train['annotation'] = train['annotation'].apply(ast.literal_eval)
train['location'] = train['location'].apply(ast.literal_eval)
features = pd.read_csv(FEATURES_PATH )
def preprocess_features(features):
    """Overwrite the feature text of row label 27 and return the same frame.

    The frame is modified in place; the return value is the object passed in.
    """
    features.loc[27, 'feature_text'] = "Last-Pap-smear-1-year-ago"
    return features
features = preprocess_features(features)
patient_notes = pd.read_csv(PATIENT_NOTES_PATH)

In [9]:
# Attach the feature description (keyed by feature_num + case_num) and the
# patient note (keyed by pn_num + case_num); left joins keep every train row.
train = train.merge(features, on=['feature_num', 'case_num'], how='left')
train = train.merge(patient_notes, on=['pn_num', 'case_num'], how='left')
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [10]:
# Manual corrections for train rows whose annotation / location are incorrect.
# Rows are addressed by index label. A location string is "start end"; several
# "start end" pairs joined by ';' describe one annotation made of separate
# character ranges.
# NOTE(review): each right-hand side is a list of one-element lists assigned to
# a single cell through .loc; what ends up stored depends on how pandas handles
# list-like values there - confirm the resulting cell contents on the pandas
# version in use.
train.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
train.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

train.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
train.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

train.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
train.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

train.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
train.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

train.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
train.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

train.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
train.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

train.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
train.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

train.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
train.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

train.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
train.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

train.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
train.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

train.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
train.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

train.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
train.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

train.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
train.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

train.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
train.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

train.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
train.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

train.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
train.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

train.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
train.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

train.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
train.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

train.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
train.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

train.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
train.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

train.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
train.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

train.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
train.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

train.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
train.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

train.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
train.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

train.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
train.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

train.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
train.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

train.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
train.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

train.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
train.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

train.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
train.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

train.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
train.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

train.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
train.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

train.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
train.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

train.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
train.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

train.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
train.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

train.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
train.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

train.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
train.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

train.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
train.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

train.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
train.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

train.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
train.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

train.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
train.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

train.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
train.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

train.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
train.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

In [11]:
# Number of entries in each row's annotation list.
train['annotation_length'] = train['annotation'].apply(len)

In [12]:
# ====================================================
# CV split
# ====================================================
# Grouped K-fold: rows sharing a pn_num (same patient note) stay in one fold.
# SEED and SHUFFLE from the config cell are not passed to the splitter here.
Fold = GroupKFold(n_splits=N_SPLITS)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    # Label every validation row with the number of the fold that holds it out.
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    2860
1    2860
2    2860
3    2860
4    2860
dtype: int64

In [14]:
def create_labels_for_scoring(df):
    """Build the ground-truth character spans for every row of ``df``.

    Each cell of ``df['location']`` is a list of strings. A string holds one or
    more ``"start end"`` pairs separated by ``';'``, e.g.
    ``['0 1', '3 4']`` or ``['5 7;9 12']``.

    Side effect:
        Adds (or overwrites) the column ``'location_for_create_labels'``. It is
        ``[]`` for rows without a location and ``['0 1;3 4']`` (all pairs
        joined by ``';'``) otherwise.

    Args:
        df: DataFrame with a ``'location'`` column. Any index is accepted; the
            rows are processed in their stored order.

    Returns:
        list: one entry per row, each a list of ``[start, end]`` int pairs
        (empty when the row has no location).
    """
    # Join by iterating over the column values rather than df.loc[i, ...], so
    # the function no longer requires a default 0..n-1 index and no longer
    # builds Python literals through ast.literal_eval.
    joined_locations = [';'.join(loc) if loc else '' for loc in df['location']]
    df['location_for_create_labels'] = [[joined] if joined else [] for joined in joined_locations]

    truths = []
    for joined in joined_locations:
        truth = []
        if joined:
            for segment in joined.split(';'):
                bounds = segment.split()
                truth.append([int(bounds[0]), int(bounds[1])])
        truths.append(truth)
    return truths


def get_char_probs(texts, predictions, tokenizer):
    """Spread per-token predictions over the characters each token covers.

    Args:
        texts: Sequence of strings.
        predictions: For each text, a sequence of per-token values in tokenizer
            order.
        tokenizer: Callable returning a mapping with an ``'offset_mapping'``
            entry when called with ``return_offsets_mapping=True``.

    Returns:
        list of 1-D float arrays, one per text, each ``len(text)`` long.
        Characters not covered by any token stay at 0.
    """
    results = [np.zeros(len(text)) for text in texts]
    for char_prob, text, token_preds in zip(results, texts, predictions):
        encoding = tokenizer(text, add_special_tokens=True, return_offsets_mapping=True)
        # zip stops at the shorter sequence, so surplus predictions are ignored.
        for offsets, token_pred in zip(encoding['offset_mapping'], token_preds):
            char_prob[offsets[0]:offsets[1]] = token_pred
    return results

def get_results_raw(char_probs, th=0.5):
    """Turn per-character probabilities into ``"start end;start end"`` strings.

    Characters with probability >= ``th`` are selected, their indices are
    shifted by +1, and every run of consecutive indices becomes one
    ``"first last"`` pair.

    Args:
        char_probs: Iterable of 1-D probability arrays.
        th: Selection threshold.

    Returns:
        list of str, one per input array ("" when nothing is selected).
    """
    results = []
    for char_prob in char_probs:
        positions = np.nonzero(char_prob >= th)[0] + 1
        if positions.size:
            # Cut wherever two neighbouring indices are not consecutive.
            cut_points = np.nonzero(np.diff(positions) != 1)[0] + 1
            runs = np.split(positions, cut_points)
        else:
            runs = []
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results


def get_results(char_probs, th=0.5):
    """Like ``get_results_raw`` but lets a span that starts at 1 start at 0.

    Args:
        char_probs: Iterable of 1-D probability arrays.
        th: Selection threshold (characters with probability >= ``th``).

    Returns:
        list of ``"start end;start end"`` strings, one per input array.
    """
    results = []
    for char_prob in char_probs:
        positions = np.nonzero(char_prob >= th)[0] + 1
        # The +1 shift would drop index 0; put it back when the first selected
        # position is 1.
        if positions.size and positions[0] == 1:
            positions = np.concatenate([np.array([0]), positions], axis=0)
        if positions.size:
            cut_points = np.nonzero(np.diff(positions) != 1)[0] + 1
            runs = np.split(positions, cut_points)
        else:
            runs = []
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results

def get_results_pp(char_probs,txts,th=0.5,case=0):
    """``get_results`` plus a one-character start extension.

    Unless ``case == 9``, a span whose start is >= 1 is extended one character
    to the left when the character just before its start is not a space.

    Args:
        char_probs: Iterable of 1-D probability arrays.
        txts: Texts aligned with ``char_probs``.
        th: Selection threshold (characters with probability >= ``th``).
        case: Case number; the value 9 disables the start extension.

    Returns:
        list of ``"start end;start end"`` strings, one per input array.
    """
    extend_start = case != 9
    results = []
    for char_prob, txt in zip(char_probs, txts):
        positions = np.nonzero(char_prob >= th)[0] + 1
        if positions.size and positions[0] == 1:
            positions = np.concatenate([np.array([0]), positions], axis=0)
        if positions.size:
            cut_points = np.nonzero(np.diff(positions) != 1)[0] + 1
            runs = np.split(positions, cut_points)
        else:
            runs = []
        pieces = []
        for run in runs:
            first, last = run[0], run[-1]
            if extend_start and first >= 1 and txt[first - 1] != ' ':
                first = first - 1
            pieces.append(f"{first} {last}")
        results.append(";".join(pieces))
    return results


def get_predictions(results):
    """Parse ``"start end;start end"`` strings into lists of int pairs.

    Args:
        results: Iterable of span strings; "" means no span.

    Returns:
        list with one entry per input string, each a list of ``[start, end]``.
    """
    predictions = []
    for result in results:
        if result == "":
            predictions.append([])
            continue
        spans = []
        for chunk in result.split(';'):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        predictions.append(spans)
    return predictions

def get_score(y_true, y_pred):
    """Score predicted spans against ground-truth spans.

    Returns:
        tuple: the two values produced by ``span_micro_f1(y_true, y_pred)``.
    """
    f1_value, recall_value = span_micro_f1(y_true, y_pred)
    return f1_value, recall_value

def span_micro_f1(truths,preds):
    """
    Micro-averaged scores on character spans.

    Rows in which both the prediction and the truth are empty are skipped.

    Args:
        truths (list of lists of two ints): Ground truth spans, one list per row.
        preds (list of lists of two ints): Prediction spans, one list per row.

    Returns:
        tuple: whatever ``micro_f1`` returns for the binarized rows.
    """
    binarized_preds, binarized_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        # Both rows are binarized to the largest coordinate found in either.
        pred_extent = np.max(pred) if len(pred) else 0
        truth_extent = np.max(truth) if len(truth) else 0
        length = max(pred_extent, truth_extent)
        binarized_preds.append(spans_to_binary(pred, length))
        binarized_truths.append(spans_to_binary(truth, length))
    return micro_f1(binarized_preds, binarized_truths)

def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans as [start, end); ``end`` is exclusive.
        length (int, optional): Size of the output array. Defaults to the
            largest coordinate found in ``spans`` (0 when ``spans`` is empty).

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        # np.max raises on an empty sequence, so an empty span list with no
        # explicit length yields an empty array instead of an error.
        length = np.max(spans) if len(spans) else 0
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary

def micro_f1(preds, truths):
    """
    Micro-averaged F1 and recall on binary arrays.

    Args:
        preds (list of arrays): Binarized predictions, one array per row.
        truths (list of arrays): Binarized ground truths, one array per row.

    Returns:
        tuple: (f1 score, recall score).
    """
    # Micro: all rows are flattened into one long vector before scoring.
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    f1_value = f1_score(flat_truths, flat_preds)
    recall_value = recall_score(flat_truths, flat_preds)
    return f1_value, recall_value

In [17]:
# Load the saved out-of-fold predictions of experiments ex038 / ex041 / ex051.
# They are later passed as `predictions` to get_char_probs, i.e. presumably one
# row of per-token values per train row - TODO confirm shapes.
oof38 = np.load(oof_path1)
oof41 = np.load(oof_path2)
oof51 = np.load(oof_path3)

In [18]:
# Predictions of the fourth ensemble member, stored as CSV.
nakama = pd.read_csv(nakama_path) 

In [19]:
# One fast tokenizer per backbone family; they supply the token -> character
# offsets used by get_char_probs.
MODEL_PATH1 = "microsoft/deberta-v3-large"
MODEL_PATH2 = "microsoft/deberta-v2-xlarge"
#MODEL_PATH3 = "microsoft/deberta-v2-xlarge"
tokenizer1 = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH1)
tokenizer2 = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH2)
#tokenizer3 = AutoTokenizer.from_pretrained(MODEL_PATH3,trim_offsets=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# cv
valid_texts = train['pn_history'].values
valid_labels = create_labels_for_scoring(train)
char_probs1 = get_char_probs(valid_texts, oof38,  tokenizer1)
char_probs2 = get_char_probs(valid_texts, oof41,  tokenizer2)
char_probs3 = get_char_probs(valid_texts, oof51,  tokenizer2)

In [21]:
# Lower-case both text columns in place (valid_texts was captured earlier and
# keeps the original casing).
train['feature_text'] = train['feature_text'].str.lower()
train['pn_history'] = train['pn_history'].str.lower()

In [22]:
nakama = nakama.sort_values(by="id").reset_index(drop=True)
# The ensemble adds the per-row arrays of `train` and `nakama` position by
# position, so both frames must list the same ids in the same order. Enforce
# it instead of only displaying the number of matches.
assert len(train) == len(nakama), "train and nakama have different row counts"
assert (train["id"].values == nakama["id"].values).all(), "train and nakama ids are not aligned"
sum(train["id"] == nakama["id"])

14300

In [23]:
class CFG:
    """Settings of the external deberta-v3-large experiment (exp141).

    Only ``max_len`` is read in the code visible here: it is the number of
    per-token prediction columns taken from the ``nakama`` frame.
    """
    num_workers = 4
    path = "../input/nbme-debertav3large-exp141/"
    config_path = path + 'config.pth'
    model = "microsoft/deberta-v3-large"
    batch_size = 32
    fc_dropout = 0.0
    max_len = 315
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    losses = ['bce'] * 4
    target_sizes = [1] * 4
    
# Token-level predictions are stored in the columns named "0" .. "314"
# (CFG.max_len columns); project them onto characters of nakama's own texts.
char_probs_n = get_char_probs(nakama['pn_history'].values,
                            nakama[[str(i) for i in range(CFG.max_len)]].values, 
                            tokenizer1)

In [24]:
# Blend weights for char_probs1, char_probs2, char_probs3 and char_probs_n,
# in that order.
w1 = 0.25
w2 = 0.1
w3 = 0.35
w4 = 0.3

In [25]:
# Normalize the weights so that they sum to 1.
w_sum = w1 + w2 + w3 + w4
w1 /= w_sum
w2 /= w_sum
w3 /= w_sum
w4 /= w_sum
print("w1",w1)
print("w2",w2)
print("w3",w3)
print("w4",w4)

w1 0.25
w2 0.1
w3 0.35
w4 0.3


In [33]:
def postprocess(texts, preds):
    """Apply rule-based boundary fixes (pp1-pp8) to predicted spans.

    Args:
        texts: Indexable collection of note texts, aligned with ``preds``.
        preds: Per-text list of ``[start, end]`` spans.

    Returns:
        The corrected predictions (``preds_pp``).

    NOTE(review): ``preds.copy()`` is a shallow copy when ``preds`` is a list of
    lists (as returned by get_predictions). In that case the inner span lists
    are shared, so writes to ``preds_pp`` also modify the caller's ``preds`` and
    are visible through ``pred`` inside the loop - confirm this is intended
    before changing the copy.
    """
    from nltk.tokenize import word_tokenize
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        pred = preds[raw_idx]
        text = texts[raw_idx]
        if len(pred) != 0:
            # pp1: a prediction whose first span starts at index 1 is changed to start at 0 ## +0.00123
            if pred[0][0] == 1:
                preds_pp[raw_idx][0][0] = 0
            for p_index, pp in enumerate(pred):
                start, end = pred[p_index]
                # A span starting at 0 stops the processing of this text's remaining spans.
                if start == 0:
                    break
                # pp2: when start equals end, move start back by one ## +0.00012
                if start == end:
                    preds_pp[raw_idx][p_index][0] = start - 1
                    break
                # pp3: if the start character is a newline, move start forward by one ## +0.00032
                if text[start] == '\n':
                    preds_pp[raw_idx][p_index][0] = start + 1
                    start = start + 1
                # pp4: ranges such as "1-2" are sometimes predicted as "-2"; pull start back onto the leading digit ## +0.00001
                if text[start-1].isdigit() and text[start] == '-' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-1].isdigit() and text[start] == '/' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp5: numbers such as "67" are sometimes predicted as "7"; pull start back onto the preceding digit ## +0.00001
                if text[start-1].isdigit() and text[start].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp6: text that begins with a capital letter is sometimes predicted without that capital; include it ## +0.00013
                # (applied when the capital directly follows '.', ',', ':' or '-')
                if text[start-2] == '.' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ',' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ':' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == '-' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp7: generic fix for a word split like "heart" -> "h" + "eart" ## +0.00050
                try:
                    # Tokenize from one character before start; if the span's first
                    # word is only the tail of a token, extend start to that token's beginning.
                    text_token = word_tokenize(text[start-1:end])
                    first = text[start:end].split()[0]
                    if first not in text_token:
                        for t in text_token:
                            if first == t[-len(first):]:
                                sub = len(t) - len(first)
                                preds_pp[raw_idx][p_index][0] = start - sub
                                start = start - sub
                                break
                # NOTE(review): the bare except silently skips pp7 on any error.
                except:
                    None
                # pp8: spans that end with '.' are shortened by one character ## 0.00001
                if text[end-1:end] == '.':
                    preds_pp[raw_idx][p_index][1] = end - 1
                    end = end - 1
    return preds_pp

In [34]:
# Weighted blend of the four models' character probabilities, row by row.
char_probs = [
    char_probs1[i] * w1 + char_probs2[i] * w2 + char_probs3[i] * w3 + char_probs_n[i] * w4
    for i in range(len(char_probs1))
]
# Single global threshold -> span strings -> int spans -> rule-based fixes.
results = get_results_raw(char_probs, th=0.47)
preds = get_predictions(results)
preds_pp = postprocess(valid_texts, preds)
# get_score returns a tuple; it is logged as-is.
score = get_score(valid_labels, preds_pp)
LOGGER.info(f'cv:{score}')

100%|██████████| 14300/14300 [00:01<00:00, 12804.33it/s]
2022-04-28 12:11:18,511 - INFO - cv:(0.894178945426973, 0.9077353554581247)


In [35]:
import warnings
warnings.simplefilter('ignore')

# Candidate thresholds, tried in this order for every case.
thresholds = [0.47,0.475,0.48,0.485,0.49,0.495,0.50,0.505,0.51,0.515,0.52,0.525,0.53]

# best_dict[case_num] = [best threshold, score reached with it]
best_dict = {}
for c in range(10):
    # The rows of one case do not depend on the threshold, so select them once
    # per case. Plain list filtering is used for the labels and probabilities:
    # they are ragged, and np.array() on ragged input raises on recent NumPy.
    case_mask = (train["case_num"] == c).values
    valid_labels_ = [labels for labels, keep in zip(valid_labels, case_mask) if keep]
    char_probs_ = [probs for probs, keep in zip(char_probs, case_mask) if keep]
    valid_texts_ = valid_texts[case_mask]
    for th in thresholds:
        results = get_results_raw(char_probs_, th=th)
        preds = get_predictions(results)
        preds_pp = postprocess(valid_texts_, preds)
        score, re_score = get_score(valid_labels_, preds_pp)
        # Keep the first threshold seen, then replace it only on a strictly
        # better score (ties keep the earlier, lower threshold).
        if c not in best_dict or best_dict[c][1] < score:
            best_dict[c] = [th, score]

2022-04-28 12:11:18,537 - INFO - Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-04-28 12:11:18,538 - INFO - NumExpr defaulting to 8 threads.
100%|██████████| 1300/1300 [00:00<00:00, 12238.12it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12518.33it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12319.60it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12184.76it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12275.67it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12251.10it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12296.88it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12360.97it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12374.01it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12353.74it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12350.69it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12376.57it/s]
100%|██████████| 1300/1300 [00:00<00:00, 12497.41it/s]
100%|██████████| 1300/1300 [00:00<00:00, 13428.35it/s]
100%|██████████| 1300/1300 [00:00<

In [36]:
# Display the best [threshold, score] pair found for each case_num.
best_dict

{0: [0.49, 0.8976772190005388],
 1: [0.525, 0.9064990886585599],
 2: [0.475, 0.8517623923219974],
 3: [0.485, 0.9260359498514676],
 4: [0.53, 0.9248769561757685],
 5: [0.47, 0.8364175195561528],
 6: [0.47, 0.906836587356394],
 7: [0.53, 0.8801618303571429],
 8: [0.485, 0.9256547241005716],
 9: [0.53, 0.9290001463914508]}

In [37]:
def get_results_raw_pp(char_probs, case_nums,th_dict):
    """``get_results_raw`` with a separate threshold per case.

    Args:
        char_probs: Iterable of 1-D probability arrays.
        case_nums: Case number of each array, aligned with ``char_probs``.
        th_dict: Mapping ``case_num -> sequence`` whose element 0 is the
            threshold for that case.

    Returns:
        list of ``"start end;start end"`` strings, one per input array.
    """
    results = []
    for char_prob, case_num in zip(char_probs, case_nums):
        threshold = th_dict[case_num][0]
        positions = np.nonzero(char_prob >= threshold)[0] + 1
        if positions.size:
            # Cut wherever two neighbouring indices are not consecutive.
            cut_points = np.nonzero(np.diff(positions) != 1)[0] + 1
            runs = np.split(positions, cut_points)
        else:
            runs = []
        results.append(";".join(f"{run[0]} {run[-1]}" for run in runs))
    return results

In [38]:
# Weighted blend of the four models' character probabilities, row by row.
char_probs = [
    char_probs1[i] * w1 + char_probs2[i] * w2 + char_probs3[i] * w3 + char_probs_n[i] * w4
    for i in range(len(char_probs1))
]
# This time every row is thresholded with the best value found for its case.
case_nums = train["case_num"].values
results = get_results_raw_pp(char_probs, case_nums,best_dict)
preds = get_predictions(results)
preds_pp = postprocess(valid_texts, preds)
# get_score returns a tuple; it is logged as-is.
score = get_score(valid_labels, preds_pp)
LOGGER.info(f'cv:{score}')

100%|██████████| 14300/14300 [00:01<00:00, 12912.26it/s]
2022-04-28 12:12:21,010 - INFO - cv:(0.8946205091478651, 0.9030701138485383)
