In [1]:
import pickle
import random
from pathlib import Path
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from tqdm.auto import tqdm

import torch
from sklearn.metrics import accuracy_score
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_scheduler,
    BertTokenizerFast,
)

from dataset import BERTDataset
from utils import (
    generate_evidence_to_wiki_pages_mapping,
    jsonl_dir_to_df,
    load_json,
    load_model,
    save_checkpoint,
    set_lr_scheduler,
)

# Initialise pandarallel (nb_workers=4, progress bars on) so the
# `parallel_map` calls made in `join_with_topk_evidence` are available.
pandarallel.initialize(progress_bar=True, verbose=0, nb_workers=4)

In [2]:
# Label string -> class index used as the classification target.
LABEL2ID: Dict[str, int] = dict(
    zip(("supports", "refutes", "NOT ENOUGH INFO"), range(3))
)
# Inverse lookup: class index -> label string.
ID2LABEL: Dict[int, str] = dict((idx, name) for name, idx in LABEL2ID.items())

In [3]:

# Raw train / dev records read from the stage-2 JSONL files (no-BM25 variant).
# NOTE(review): the exact return type of `load_json` is defined in utils — the
# caching cell wraps these objects in `pd.DataFrame(...)`.
TRAIN_DATA = load_json("data/Stage2/PublicTrain/WithoutBM25/UnBM25_train_doc10sent5.jsonl")
DEV_DATA = load_json("data/Stage2/PublicTrain/WithoutBM25/UnBM25_dev_doc10sent5.jsonl")

# Pickle caches of the dataframes after the evidence text has been joined in;
# the caching cell loads them when present and writes them otherwise.
TRAIN_PKL_FILE = Path("data/Stage2/PublicTrain/WithoutBM25/UnBM25_train_doc10sent5.pkl")
DEV_PKL_FILE = Path("data/Stage2/PublicTrain/WithoutBM25/UnBM25_dev_doc10sent5.pkl")


In [4]:
# Load the wiki dump and build the evidence lookup. Later cells index the
# result as mapping[page_id][str(sentence_index)] to obtain sentence text.
wiki_pages = jsonl_dir_to_df("data/wiki-pages")
mapping = generate_evidence_to_wiki_pages_mapping(wiki_pages,)


Reading and concatenating jsonl files in data/wiki-pages
Generate parse mapping


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=296938), Label(value='0 / 296938')…

Transform to id to evidence_map mapping


In [5]:

class AicupTopkEvidenceBERTDataset(BERTDataset):
    """AICUP training dataset yielding one (claim, evidence) pair per row.

    Every row of ``self.data`` must provide a ``claim`` value and a
    ``SingleEvidence`` value; a ``label`` entry is optional.
    NOTE(review): ``self.data``, ``self.tokenizer`` and ``self.max_length``
    are presumably set by ``BERTDataset.__init__`` — confirm in dataset.py.
    """

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """Tokenize row ``idx`` as a sentence pair.

        Args:
            idx: Positional row index into ``self.data`` (used with ``iloc``).
            **kwargs: Accepted for signature compatibility; not used.

        Returns:
            A dict with the tokenizer outputs converted to tensors (padded or
            truncated to ``self.max_length``). When the row has a ``label``
            entry, the dict also contains ``labels`` holding the class index
            from ``LABEL2ID``.

        Raises:
            KeyError: If the row's label is not a key of ``LABEL2ID``.
        """
        item = self.data.iloc[idx]

        encoded = self.tokenizer(
            item["claim"],
            item["SingleEvidence"],
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )
        features = {name: torch.tensor(values) for name, values in encoded.items()}

        # Only labelled rows carry a target; unlabelled rows return inputs only.
        if "label" in item:
            features["labels"] = torch.tensor(LABEL2ID[item["label"]])

        return features

In [6]:
class AicupTopkEvidenceBERT_Val_Dataset(torch.utils.data.Dataset):
    """Validation dataset that pairs one claim with each of its top-k evidences.

    Every row of ``data`` must provide a ``claim`` value and an
    ``evidence_list`` iterable of evidence strings; ``label`` is optional.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: "AutoTokenizer",
        max_length: int = 32,
        topk: int = 5,
    ):
        """Store the frame and tokenization settings.

        Args:
            data: Frame with ``claim`` and ``evidence_list`` columns.
            tokenizer: Callable used as ``tokenizer(claim, evidence, ...)``.
            max_length: Padding / truncation length passed to the tokenizer.
            topk: Maximum number of evidences encoded per claim.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.topk = topk

    def __len__(self):
        """Return the number of rows (claims)."""
        return len(self.data)

    def __getitem__(
        self,
        idx: int,
        **kwargs,
    ):
        """Encode the claim of row ``idx`` against its first ``topk`` evidences.

        Returns:
            A dict with one entry ``ClaimEvidence<n>`` (n starting at 1) per
            encoded evidence, each a dict of tensors built from the tokenizer
            output, plus ``labels``: the class index from ``LABEL2ID`` or
            ``-1`` when the row has no ``label`` entry. Rows with fewer than
            ``topk`` evidences yield correspondingly fewer entries.
        """
        item = self.data.iloc[idx]
        claim = item["claim"]
        label = LABEL2ID[item["label"]] if "label" in item else -1

        sample = {}
        for rank, evidence in enumerate(item["evidence_list"], start=1):
            # Honour the configured top-k instead of encoding every evidence.
            if rank > self.topk:
                break
            encoded = self.tokenizer(
                claim,
                evidence,
                padding="max_length",
                max_length=self.max_length,
                truncation=True,
            )
            sample[f"ClaimEvidence{rank}"] = {
                name: torch.tensor(values) for name, values in encoded.items()
            }

        sample["labels"] = torch.tensor(label)
        return sample

In [7]:
def RuleBasedAccuracy(y_pred,y_pred2,y_pred3,y_pred4,y_pred5):
    """Merge five per-evidence prediction lists into one prediction per sample.

    For each position: the result is 0 if any list predicts 0, otherwise 1 if
    any list predicts 1, otherwise 2 (class indices as in ``LABEL2ID``).

    Returns:
        list: One merged class index per element of ``y_pred``.
    """
    remaining = (y_pred2, y_pred3, y_pred4, y_pred5)
    merged = []

    for pos, first in enumerate(y_pred):
        if first == 0 or any(preds[pos] == 0 for preds in remaining):
            verdict = 0
        elif first == 1 or any(preds[pos] == 1 for preds in remaining):
            verdict = 1
        else:
            verdict = 2
        merged.append(verdict)

    return merged
def run_evaluation(model: torch.nn.Module, dataloader: DataLoader, device):
    """Evaluate ``model`` on claim / top-5-evidence batches.

    Each batch must contain ``labels`` and the entries ``ClaimEvidence1`` ..
    ``ClaimEvidence5`` (dicts of tensors). The model is run once per evidence;
    the five argmax predictions are merged with ``RuleBasedAccuracy``.

    Args:
        model: Sequence classifier returning ``.loss`` and ``.logits``.
        dataloader: Loader yielding the batches described above.
        device: Device the inputs are moved to before each forward pass.

    Returns:
        dict: ``val_loss`` — mean loss of the first evidence pass over the
        batches (only that pass receives labels) — and ``val_acc`` — accuracy
        of the merged predictions.
    """
    evidence_keys = [f"ClaimEvidence{rank}" for rank in range(1, 6)]

    # Remember the caller's mode so a mid-epoch validation does not leave the
    # model with dropout disabled for the rest of training.
    was_training = model.training
    model.eval()

    loss = 0
    y_true = []
    per_evidence_preds = [[] for _ in evidence_keys]
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                labels = batch["labels"]
                y_true.extend(labels.tolist())

                for slot, key in enumerate(evidence_keys):
                    # Build a fresh dict so the collated batch is not mutated.
                    inputs = {k: v.to(device) for k, v in batch[key].items()}
                    if slot == 0:
                        # Labels go to the first pass only; it supplies the loss.
                        inputs["labels"] = labels.to(device)

                    outputs = model(**inputs)

                    if slot == 0:
                        loss += outputs.loss.item()
                    per_evidence_preds[slot].extend(
                        torch.argmax(outputs.logits, dim=1).tolist()
                    )
    finally:
        model.train(was_training)

    final_pred = RuleBasedAccuracy(*per_evidence_preds)
    acc = accuracy_score(y_true, final_pred)

    return {"val_loss": loss / len(dataloader), "val_acc": acc}

In [8]:
def join_with_topk_evidence(
    df: pd.DataFrame,
    mapping: dict,
    mode: str = "train",
    topk: int = 5,
    train_evidence_split: bool = True
) -> pd.DataFrame:
    """Add an ``evidence_list`` column of evidence sentences looked up in ``mapping``.

    Columns are assigned directly on ``df`` (the frame is modified in place)
    and the same object is returned.

    Behaviour per mode:
        * ``mode == "eval"``: reads ``df["predicted_evidence"]``. Each entry
          is iterated as ``(evi_id, evi_idx)`` pairs, every pair is resolved
          with ``mapping.get(evi_id, {}).get(str(evi_idx), "")`` and the
          resulting list is cut to its first ``topk`` sentences. Entries that
          are not lists produce ``[]``. The first five results are printed.
        * any other mode: reads ``df["evidence"]``. Each inner evidence list
          (items unpacked as ``(_, _, evi_id, evi_idx)``) is resolved the same
          way and its sentences are joined with a single space, giving one
          string per inner list. ``topk`` is not applied on this path.

    Args:
        df (pd.DataFrame): Dataset to extend. Needs ``predicted_evidence`` in
            eval mode and ``evidence`` otherwise.
        mapping (dict): Nested lookup read as
            ``mapping[evi_id][str(evi_idx)]``; missing keys resolve to ``""``.
        mode (str, optional): ``"eval"`` selects the predicted-evidence path,
            anything else the gold-evidence path. Defaults to "train".
        topk (int, optional): Number of sentences kept in eval mode.
            Defaults to 5.
        train_evidence_split (bool, optional): Not read by the function body.

    Returns:
        pd.DataFrame: ``df`` with the additional ``evidence_list`` column
            (a list of strings per row).
    """

    # format evidence column to List[List[Tuple[str, str, str, str]]]
    # (a flat record becomes [[x]], a single list of records becomes [x],
    # an already doubly nested value is kept as is)
    if "evidence" in df.columns:
        df["evidence"] = df["evidence"].parallel_map(
            lambda x: [[x]] if not isinstance(x[0], list) else [x]
            if not isinstance(x[0][0], list) else x)

    print(f"Extracting evidence_list for the {mode} mode ...")
    if mode == "eval":
        # extract evidence: one sentence per predicted (page id, sentence index) pair
        df["evidence_list"] = df["predicted_evidence"].parallel_map(lambda x: [
            mapping.get(evi_id, {}).get(str(evi_idx), "")
            for evi_id, evi_idx in x  # for each evidence list
        ][:topk] if isinstance(x, list) else [])
        print(df["evidence_list"][:5])
    else:



        # extract evidence: one space-joined string per gold evidence list
        # NOTE(review): `[:len(x)]` never shortens the result because the
        # comprehension already yields exactly one entry per element of x.
        df["evidence_list"] = df["evidence"].parallel_map(lambda x: [
            " ".join([  # join evidence
                mapping.get(evi_id, {}).get(str(evi_idx), "")
                for _, _, evi_id, evi_idx in evi_list
            ]) if isinstance(evi_list, list) else ""
            for evi_list in x  # for each evidence list
            ][:len(x)] if isinstance(x, list) else [])

    return df

In [22]:
#@title  { display-mode: "form" }

# Pretrained checkpoint that is fine-tuned below. Alternatives that were
# tried: "bert-base-chinese" and "ckiplab/albert-base-chinese".
MODEL_NAME = "ckiplab/bert-base-chinese"
TRAIN_BATCH_SIZE = 32  #@param {type:"integer"}
TEST_BATCH_SIZE = 32  #@param {type:"integer"}
SEED = 42  #@param {type:"integer"}
LR = 7e-5  #@param {type:"number"}
NUM_EPOCHS = 20  #@param {type:"integer"}
MAX_SEQ_LEN = 256  #@param {type:"integer"}
# Number of evidence sentences considered per claim.
EVIDENCE_TOPK = 5
# Run validation and save a checkpoint every VALIDATION_STEP optimizer steps.
VALIDATION_STEP = 1000  #@param {type:"integer"}

# Apply SEED to every RNG this notebook relies on (random.sample for the
# NOT ENOUGH INFO evidence, DataLoader shuffling and classifier-head
# initialisation via torch) so that Restart & Run All is reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


In [10]:
OUTPUT_FILENAME = "submission.jsonl"

# Experiment-specific sub-path shared by the log and checkpoint directories.
# NOTE(review): EXP_DIR begins with "/", so the concatenations below contain
# a double slash ("logs//..."); the strings are kept unchanged so existing
# logs and checkpoints keep their location.
EXP_DIR = f"/第二階段資料/無BM25/claim_verification/e{NUM_EPOCHS}_bs{TRAIN_BATCH_SIZE}_" + f"{LR}_top{EVIDENCE_TOPK}"
LOG_DIR = "logs/" + EXP_DIR
CKPT_DIR = "checkpoints/" + EXP_DIR

# exist_ok=True replaces the separate exists() check: the cell stays safe to
# re-run and there is no gap between checking and creating.
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
Path(CKPT_DIR).mkdir(parents=True, exist_ok=True)

In [11]:
# NOTE: pickle.load can execute arbitrary code — only load cache files that
# this notebook wrote itself.

# Training split: reuse the cached frame when present, else build and cache it.
if TRAIN_PKL_FILE.exists():
    with open(TRAIN_PKL_FILE, "rb") as cache_file:
        train_df = pickle.load(cache_file)
else:
    train_df = join_with_topk_evidence(
        pd.DataFrame(TRAIN_DATA), mapping, topk=EVIDENCE_TOPK
    )
    train_df.to_pickle(TRAIN_PKL_FILE, protocol=4)

# Dev split: same caching scheme, built in "eval" mode (predicted evidence).
if DEV_PKL_FILE.exists():
    with open(DEV_PKL_FILE, "rb") as cache_file:
        dev_df = pickle.load(cache_file)
else:
    dev_df = join_with_topk_evidence(
        pd.DataFrame(DEV_DATA), mapping, mode="eval", topk=EVIDENCE_TOPK
    )
    dev_df.to_pickle(DEV_PKL_FILE, protocol=4)

In [12]:
# Dev rows with fewer than 5 predicted evidences, kept aside for inspection.
dev_short = dev_df.loc[dev_df["predicted_evidence"].map(len) < 5]

In [13]:
# Remove the dev rows that have fewer than 5 predicted evidences.
dev_df = dev_df.drop(index=dev_df[dev_df["predicted_evidence"].map(len) < 5].index)

In [14]:
# Training rows whose predicted_evidence list is empty.
NOT_ENOUGH_INFO_ROW = train_df.loc[train_df["predicted_evidence"].map(len) == 0]

In [15]:
# Remove the training rows that have no predicted evidence at all.
train_df = train_df.drop(index=train_df[train_df["predicted_evidence"].map(len) == 0].index)

In [16]:

def RandomPredictedEvidence(df, evidence_map=None):
    """Pick random predicted-evidence sentences for one row.

    Intended for ``DataFrame.apply(..., axis=1)``: ``df`` is a single row
    (anything indexable by ``"predicted_evidence"``), whose value is a
    sequence of ``(page_id, sentence_index)`` pairs.

    Args:
        df: The row to sample from.
        evidence_map: Lookup read as ``evidence_map[page_id][str(index)]``.
            Defaults to the notebook-level ``mapping``.

    Returns:
        ``[]`` when the row has no predicted evidence; the single sentence as
        a plain string when it has exactly one; otherwise a list of
        ``min(n, 3)`` distinct randomly chosen sentences (both of them, in
        random order, when n == 2).

    Raises:
        KeyError: If a sampled pair is missing from the lookup.
    """
    lookup = mapping if evidence_map is None else evidence_map
    candidates = df["predicted_evidence"]
    count = len(candidates)

    if count == 0:
        return []

    def sentence_of(candidate):
        # candidate is (page_id, sentence_index); sentence keys are strings.
        return lookup[candidate[0]][str(candidate[1])]

    if count == 1:
        # A bare string (not a list) is kept for backward compatibility.
        return sentence_of(candidates[0])

    picked = random.sample(range(count), min(count, 3))
    return [sentence_of(candidates[position]) for position in picked]

In [17]:
# Toggle: drop every NOT ENOUGH INFO sample, or keep them and give the
# training ones randomly sampled predicted evidence instead.
DropNotInfo = False
if not DropNotInfo:
    nei_rows = train_df[train_df["label"] == "NOT ENOUGH INFO"]
    nei_labels = list(nei_rows.index)
    train_df.loc[nei_labels, "evidence_list"] = nei_rows.apply(RandomPredictedEvidence, axis=1)
else:
    train_df = train_df.drop(train_df[train_df["label"] == "NOT ENOUGH INFO"].index)
    dev_df = dev_df.drop(dev_df[dev_df["label"] == "NOT ENOUGH INFO"].index)


In [18]:
# One training row per evidence sentence; the exploded column is renamed to
# the name the training dataset reads.
train_df = (
    train_df
    .explode("evidence_list")
    .rename(columns={"evidence_list": "SingleEvidence"})
)

In [19]:
# The tokenizer is loaded from "bert-base-chinese" with BertTokenizerFast
# rather than from MODEL_NAME.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-chinese")

train_dataset = AicupTopkEvidenceBERTDataset(
    train_df, tokenizer=tokenizer, max_length=MAX_SEQ_LEN, topk=EVIDENCE_TOPK
)
val_dataset = AicupTopkEvidenceBERT_Val_Dataset(
    dev_df, tokenizer=tokenizer, max_length=MAX_SEQ_LEN, topk=EVIDENCE_TOPK
)

# Only the training loader shuffles; evaluation keeps the frame order.
train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
eval_dataloader = DataLoader(val_dataset, batch_size=TEST_BATCH_SIZE)

In [20]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sequence classifier with one output per entry of LABEL2ID.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(LABEL2ID)
)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)
# The scheduler is stepped once per batch, so it spans all batches of all epochs.
num_training_steps = len(train_dataloader) * NUM_EPOCHS
lr_scheduler = set_lr_scheduler(optimizer, num_training_steps)

writer = SummaryWriter(log_dir=LOG_DIR)

Some weights of the model checkpoint at ckiplab/bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ckiplab/bert-base-c

In [None]:
progress_bar = tqdm(range(num_training_steps))
current_steps = 0

for epoch in range(NUM_EPOCHS):
    model.train()
    y_true_list = []
    y_pred_list = []
    Trainloss = 0.0
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a Python float: adding the tensor itself would keep every
        # step's autograd graph referenced until the end of the epoch.
        loss_value = loss.item()
        Trainloss += loss_value
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        writer.add_scalar("training_loss", loss_value, current_steps)

        y_pred_list.extend(torch.argmax(outputs.logits, dim=1).tolist())
        y_true_list.extend(batch["labels"].tolist())

        current_steps += 1

        if current_steps % VALIDATION_STEP == 0:
            print("Start validation")
            val_results = run_evaluation(model, eval_dataloader, device)

            # log each metric separately to TensorBoard
            for metric_name, metric_value in val_results.items():
                print(f"{metric_name}: {metric_value}")
                writer.add_scalar(f"{metric_name}", metric_value, current_steps)

            save_checkpoint(
                model,
                CKPT_DIR,
                current_steps,
                mark=f"val_acc={val_results['val_acc']:.4f}",
            )

            # Evaluation switches the model to eval mode; switch back so the
            # rest of the epoch trains with dropout enabled.
            model.train()

    train_acc = accuracy_score(y_true_list, y_pred_list)
    print(f"train_loss: {Trainloss / len(train_dataloader)}, train_acc:{train_acc}")

print("Finished training!")

### Test Not Enough Information

In [None]:
# Dev subset labelled NOT ENOUGH INFO.
NotEnoughInfoDev = dev_df.loc[dev_df["label"] == "NOT ENOUGH INFO"]

In [None]:
# Evaluate on the NOT ENOUGH INFO dev rows only.
valid_dataset_not_info = AicupTopkEvidenceBERT_Val_Dataset(
    NotEnoughInfoDev, tokenizer=tokenizer, max_length=MAX_SEQ_LEN, topk=EVIDENCE_TOPK
)
valid_dataloader_not_info = DataLoader(dataset=valid_dataset_not_info, batch_size=5)
run_evaluation(model, valid_dataloader_not_info, device)


### Test Enough Information

In [None]:
# Evaluate on the dev rows whose label is anything but NOT ENOUGH INFO.
EnoughInfoDev = dev_df.loc[dev_df["label"] != "NOT ENOUGH INFO"]
valid_dataset_info = AicupTopkEvidenceBERT_Val_Dataset(
    EnoughInfoDev, tokenizer=tokenizer, max_length=MAX_SEQ_LEN, topk=EVIDENCE_TOPK
)
valid_dataloader_info = DataLoader(dataset=valid_dataset_info, batch_size=5)
run_evaluation(model, valid_dataloader_info, device)


### Test Full Data

In [None]:

# Evaluate on the complete (filtered) dev frame.
valid_dataset_full = AicupTopkEvidenceBERT_Val_Dataset(
    dev_df, tokenizer=tokenizer, max_length=MAX_SEQ_LEN, topk=EVIDENCE_TOPK
)
valid_dataloader_full = DataLoader(dataset=valid_dataset_full, batch_size=5)
run_evaluation(model, valid_dataloader_full, device)
