In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator
from transformers import DataCollatorForLanguageModeling
from datasets import load_from_disk
import config
import string
import re


tokenizer = AutoTokenizer.from_pretrained(config.model_checkpoint)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)


#example = "(ii)   If so, whether it was reasonable for P to continue its action against D1 after receiving the Joint Report and/or after the withdrawal of contribution notice against D1 by D2?"



def clean_data(example):
    tmp = re.sub(r'^(\d+).', '', example)
    tmp = re.sub(r'^\((ix|iv|v?i{0,3})\)', '', tmp)
    tmp = re.sub(r'^\((\d+)\)', '', tmp)
    new_str = tmp.translate(str.maketrans('','',string.punctuation)).replace("’s",'').replace("s’",'s')
    return new_str

# date prepared for training
def tokenize_function(examples):
    lower = [clean_data(x.lower()) for x in examples["paragraphs"]]
    result = tokenizer(lower)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}


def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // config.chunk_size) * config.chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + config.chunk_size] for i in range(0, total_length, config.chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


## data loader
def getDataloader(hklii_dataset,eval_dataset):
    train_dataloader = DataLoader(
        hklii_dataset["train"],
        shuffle=True,
        batch_size=config.batch_size,
        collate_fn=data_collator,
    )
    eval_dataloader = DataLoader(
        eval_dataset, batch_size=config.batch_size, collate_fn=default_data_collator
    )
    return train_dataloader,eval_dataloader

def getDataset():
    tokenized_datasets=load_from_disk("/home/huijie/legal/huggface/data_prepare/HKLII_all_cail/" )

    # tokenized_datasets = hklii_dataset.map(
    #     tokenize_function, batched=True, remove_columns=["ID","topic","paragraphs"],num_proc =16
    # )
    hklii_dataset = tokenized_datasets.map(group_texts, batched=True,num_proc = 16)

    # train_size = 10_000
    # test_size = int(0.1 * train_size)

    # hklii_dataset = hklii_dataset["train"].train_test_split(
    #     train_size=train_size, test_size=test_size, seed=42
    # )


    hklii_dataset = hklii_dataset.remove_columns(["word_ids"])

    eval_dataset = hklii_dataset["test"].map(
        insert_random_mask,
        batched=True,
        remove_columns=hklii_dataset["test"].column_names,
    )
    eval_dataset = eval_dataset.rename_columns(
        {
            "masked_input_ids": "input_ids",
            "masked_attention_mask": "attention_mask",
            "masked_labels": "labels",
        }
    )
    eval_dataset =  eval_dataset.remove_columns(["masked_token_type_ids"])
    return hklii_dataset,eval_dataset




In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from datasets import load_from_disk

In [2]:
from datasets import load_dataset
dataset = load_dataset('json',data_files='/home/huijie/legal/cail_scm_2/data/raw/CAIL2019-SCM-big/SCM_5k.json')


Using custom data configuration default-a7bb499a86560565
Reusing dataset json (/home/huijie/.cache/huggingface/datasets/json/default-a7bb499a86560565/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['B', 'A', 'label', 'C'],
        num_rows: 5102
    })
})

In [None]:
def augment_scm(example):
    outputs = []
    df_cp1 = example.copy()
    df_cp1["B"] = example["C"]
    df_cp1["C"] = example["B"]
    df_cp1["label"] = "C" if example['label'] == "B" else "B"

    # 自反性增广
    df_cp2 = example.copy()
    df_cp2["A"] = example["C"]
    df_cp2["B"] = example["C"]
    df_cp2["C"] = example["A"]
    df_cp2["label"] = "B"

    # 自反性+反对称增广
    df_cp3 = example.copy()
    df_cp3["A"] = example["C"]
    df_cp3["B"] = example["A"]
    df_cp3["C"] = example["C"]
    df_cp3["label"] = "C"

    # 启发式增广
    df_cp4 = example.copy()
    if example["label"] == "B":
        df_cp4["A"] = example["B"]
        df_cp4["B"] = example["A"]
        df_cp4["C"] = example["C"]
        df_cp4["label"] = "B"
    else:
        df_cp4["A"] = example["C"]
        df_cp4["B"] = example["B"]
        df_cp4["C"] = example["A"]
        df_cp4["label"] = "C"

    # 启发式+反对称增广
    df_cp5 = example.copy()
    if example["label"] == "B":
        df_cp5["A"] = example["B"]
        df_cp5["B"] = example["C"]
        df_cp5["C"] = example["A"]
        df_cp5["label"] = "C"
    else:
        df_cp5["A"] = example["C"]
        df_cp5["B"] = example["A"]
        df_cp5["C"] = example["B"]
        df_cp5["label"] = "B"
    outputs+= [example, df_cp1, df_cp2, df_cp3, df_cp4, df_cp5]
    return {'data': outputs}

augment_scm(dataset['train'][1])

: 

In [10]:
augmented_dataset = dataset['train'].map(augment_scm,num_proc=4)
augmented_dataset[:9]['data']

[1, 2]

In [18]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
model = BertModel.from_pretrained("bert-base-multilingual-uncased")



Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.

In [None]:
inputs = tokenizer(["Hello, my dog is cute","No, it is not."], return_tensors="pt")
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state

In [16]:
outputs.pooler_output.shape

torch.Size([1, 768])

In [17]:
inputs

{'input_ids': tensor([[  101, 29155,   117, 11153, 14791, 10127, 18233, 10111,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}