In [1]:
from datasets import load_dataset
from tqdm.notebook import tqdm
import torch
import pandas as pd
from glob import glob
import numpy as np
import os

os.environ["TRANSFORMERS_OFFLINE"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
!pip install /kaggle/input/sci-llm-pip-v2/sentence-transformers-2.2.2.tar.gz


Processing /kaggle/input/sci-llm-pip-v2/sentence-transformers-2.2.2.tar.gz
  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125924 sha256=94288921c916a0f14d1fe4b4ede52212da526c93ed2ea1a0450c4341e75ceaaf
  Stored in directory: /root/.cache/pip/wheels/c6/26/e2/13ef17530724efc5be0bf3d290de1ecaa6c0ff0225fd548014
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [3]:
from pathlib import Path

data_path = Path("/kaggle/input/kaggle-llm-science-exam")

test = pd.read_csv(data_path / "test.csv")
CALC_SCORE = False

test.head()
test.to_parquet("test_raw.pq")

In [4]:
%%writefile get_topk.py

import pandas as pd
from glob import glob
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from pathlib import Path
from joblib import Parallel, delayed
import argparse


def cos_similarity_matrix(a: torch.Tensor, b: torch.Tensor):
    """Calculates cosine similarities between tensor a and b."""

    sim_mt = torch.mm(a, b.transpose(0, 1))
    return sim_mt


def get_topk(embeddings_from, embeddings_to, topk=1000, bs=512):
    chunk = bs
    embeddings_chunks = embeddings_from.split(chunk)

    vals = []
    inds = []
    for idx in range(len(embeddings_chunks)):
        cos_sim_chunk = cos_similarity_matrix(
            embeddings_chunks[idx].to(embeddings_to.device).half(), embeddings_to
        ).float()

        cos_sim_chunk = torch.nan_to_num(cos_sim_chunk, nan=0.0)

        topk = min(topk, cos_sim_chunk.size(1))
        vals_chunk, inds_chunk = torch.topk(cos_sim_chunk, k=topk, dim=1)
        vals.append(vals_chunk[:, :].detach().cpu())
        inds.append(inds_chunk[:, :].detach().cpu())

        del vals_chunk
        del inds_chunk
        del cos_sim_chunk

    vals = torch.cat(vals).detach().cpu()
    inds = torch.cat(inds).detach().cpu()

    return inds, vals


def insert_value_at(tensor, value, position):
    # Ensure the position is valid
    if position < 0 or position >= len(tensor):
        raise ValueError("Position should be between 0 and tensor length - 1.")

    # Slice the tensor into two parts
    left = tensor[:position]
    right = tensor[position:]

    # Create a tensor for the value to be inserted
    value_tensor = torch.tensor([value], dtype=tensor.dtype)

    # Concatenate the tensors together and slice to the original length
    result = torch.cat([left, value_tensor, right])[:-1]

    return result


def insert_value_at_list(lst, value, position):
    # Ensure the position is valid
    if position < 0 or position >= len(lst):
        raise ValueError("Position should be between 0 and list length - 1.")

    # Insert value at the specified position
    lst.insert(position, value)

    # Remove the last value to maintain original length
    lst.pop()

    return lst


def remove_consecutive_duplicates(input_list):
    if not input_list:
        return [" "] * args.topk

    new_list = [input_list[0]]
    for i in range(1, len(input_list)):
        if input_list[i] != input_list[i - 1]:
            new_list.append(input_list[i])

    # Append empty strings if new_list length is less than 5
    while len(new_list) < args.topk:
        new_list.append(" ")

    return new_list


if __name__ == "__main__":

    ap = argparse.ArgumentParser()
    ap.add_argument("--wiki", type=str, required=True)
    ap.add_argument("--model_name", type=str, required=True)
    ap.add_argument("--test_file", type=str, required=True)
    ap.add_argument("--topk", type=int, required=True)
    ap.add_argument("--ind", type=int, required=True)
    args = ap.parse_args()

    if args.topk == 10:
        TOP_K = 20
    else:
        TOP_K = 10

    data_path = Path("/kaggle/input/kaggle-llm-science-exam")

    if args.wiki == "cirrus":
        files_all = sorted(list(glob("/kaggle/input/cirruswiki-titles/*.parquet")))
    elif args.wiki == "new":
        files_all = sorted(list(glob("/kaggle/input/newwiki-titles/*.parquet")))

    if "e5-large" in args.model_name:
        files_np = sorted(
            list(glob("/kaggle/input/enwiki-cirrus-20230701-e5-large-part*/*.npy"))
        )
    elif "gte-large" in args.model_name:
        files_np = sorted(
            list(glob("/kaggle/input/wiki31m-gte-large-title-p*/*.npy"))
        )

    files_all = [(x, y) for x, y in zip(files_all, files_np)]
    files = [files_all[: len(files_all) // 2], files_all[len(files_all) // 2 :]]

    if "e5-large" in args.model_name:
        model = SentenceTransformer("/kaggle/input/intfloat-e5-large-v2").to("cuda:0")
    elif "gte-large" in args.model_name:
        model = SentenceTransformer("/kaggle/input/thenlper-gte-large").to("cuda:0")

    test = pd.read_parquet("test_raw.pq")

    embs = []
    for idx, row in test.iterrows():
        if "e5" in args.model_name:
            sentences = [
                "query: "
                + row.prompt
                + " "
                + row.A
                + " "
                + row.B
                + " "
                + row.C
                + " "
                + row.D
                + " "
                + row.E
            ]
        elif "gte" in args.model_name:
            sentences = [
                row.prompt
                + " "
                + row.A
                + " "
                + row.B
                + " "
                + row.C
                + " "
                + row.D
                + " "
                + row.E
            ]

        embeddings = torch.Tensor(
            model.encode(sentences, show_progress_bar=False, normalize_embeddings=True)
        )
        embs.append(torch.nn.functional.normalize(embeddings, dim=1))

    query_embeddings = torch.Tensor(np.stack(embs)).squeeze(1)
    print(f'query_embeddings.shape: {query_embeddings.shape}')

    # Create placeholders for top-k matches
    all_vals_gpu_0 = torch.full((len(test), TOP_K), -float("inf"), dtype=torch.float16)
    all_texts_gpu_0 = [[None] * TOP_K for _ in range(len(all_vals_gpu_0))]

    all_vals_gpu_1 = torch.full((len(test), TOP_K), -float("inf"), dtype=torch.float16)
    all_texts_gpu_1 = [[None] * TOP_K for _ in range(len(all_vals_gpu_1))]

    def load_data(files, device):
        for file, file_np in files:
            df = pd.read_parquet(file, engine="pyarrow", use_threads=True)
            file_embeddings = np.load(file_np)

            data_embeddings = torch.Tensor(file_embeddings).to(device).half()
            data_embeddings = torch.nn.functional.normalize(data_embeddings, dim=1)

            max_inds, max_vals = get_topk(
                query_embeddings, data_embeddings, topk=TOP_K, bs=8
            )

            # loop through all queries (test)
            for i in range(len(test)):
                # start with highest new val (pos 0) vs worst value already in the toplist (pos topk - 1)
                for new in range(TOP_K):
                    if device == "cuda:0":
                        if max_vals[i][new].item() < all_vals_gpu_0[i][TOP_K - 1]:
                            break
                        for old in range(TOP_K):
                            if max_vals[i][new].item() > all_vals_gpu_0[i][old]:
                                all_vals_gpu_0[i] = insert_value_at(
                                    all_vals_gpu_0[i],
                                    value=max_vals[i][new].item(),
                                    position=old,
                                )
                                all_texts_gpu_0[i] = insert_value_at_list(
                                    all_texts_gpu_0[i],
                                    value=df.iloc[max_inds[i][new].item()].text,
                                    position=old,
                                )
                                break
                    else:
                        if max_vals[i][new].item() < all_vals_gpu_1[i][TOP_K - 1]:
                            break
                        for old in range(TOP_K):
                            if max_vals[i][new].item() > all_vals_gpu_1[i][old]:
                                all_vals_gpu_1[i] = insert_value_at(
                                    all_vals_gpu_1[i],
                                    value=max_vals[i][new].item(),
                                    position=old,
                                )
                                all_texts_gpu_1[i] = insert_value_at_list(
                                    all_texts_gpu_1[i],
                                    value=df.iloc[max_inds[i][new].item()].text,
                                    position=old,
                                )
                                break

    Parallel(n_jobs=2, backend="threading")(
        delayed(load_data)(files[i], f"cuda:{i}") for i in range(2)
    )
    all_vals = torch.hstack([all_vals_gpu_0, all_vals_gpu_1])
    val, inds = torch.topk(all_vals.float(), axis=1, k=TOP_K)
    all_texts = [
        [(t0 + t1)[inner_idx.item()] for inner_idx in idx]
        for t0, t1, idx in zip(all_texts_gpu_0, all_texts_gpu_1, inds)
    ]

    all_texts = [remove_consecutive_duplicates(lst) for lst in all_texts]

    test["context"] = [
        "\n###\n".join([x[i] for i in list(range(args.topk))[::-1]]) for x in all_texts
    ]

    test["context_v2"] = [
        "Context 4: "
        + x[4]
        + "\n###\n"
        + "Context 3: "
        + x[3]
        + "\n###\n"
        + "Context 2: "
        + x[2]
        + "\n###\n"
        + "Context 1: "
        + x[1]
        + "\n###\n"
        + "Context 0: "
        + x[0]
        for x in all_texts
    ]

    print(test["context"].values[0])

    test.to_parquet(f"{args.test_file}.pq", index=False)

Writing get_topk.py


In [5]:
%%writefile run.sh

python get_topk.py --wiki "cirrus" --model_name "e5-large" --test_file "test" --topk 5 --ind 0  &&
python get_topk.py --wiki "new" --model_name "gte-large" --test_file "test2" --topk 5 --ind 0  &&

wait 
echo "All done"

Writing run.sh


In [6]:
!sh run.sh

query_embeddings.shape: torch.Size([200, 1024])
Physical Review Letters. 99 (14): 141302. arXiv:0704.1932. Bibcode:2007PhRvL..99n1302Z. doi:10.1103/PhysRevLett.99.141302. PMID 17930657. S2CID 119672184. Alzain, Mohammed (2017). "Modified Newtonian Dynamics (MOND) as a Modification of Newtonian Inertia". Journal of Astrophysics and Astronomy. 38 (4): 59. arXiv:1708.05385. Bibcode:2017JApA...38...59A. doi:10.1007/s12036-017-9479-0. S2CID 119245210. S. McGaugh, The EFE in MOND Archived 2017-07-16 at the Wayback Machine Milgrom, Mordehai (2008).
###
The homogeneously distributed mass of the universe would result in a roughly scalar field that permeated the universe and would serve as a source for Newton's gravitational constant; creating a theory of quantum gravity. Modified Newtonian Dynamics (MOND) is a relatively modern proposal to explain the galaxy rotation problem based on a variation of Newton's Second Law of Dynamics at low accelerations.
###
Modified Newtonian dynamics (MOND)

In [7]:
!pip install /kaggle/input/sci-llm-pip-v2/bitsandbytes-0.41.0-py3-none-any.whl
!pip install --no-index --find-links="/kaggle/input/transformers-main" /kaggle/input/transformers-main/transformers-4.46.3-py3-none-any.whl


Processing /kaggle/input/sci-llm-pip-v2/bitsandbytes-0.41.0-py3-none-any.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.41.0
Looking in links: /kaggle/input/transformers-main
Processing /kaggle/input/transformers-main/transformers-4.46.3-py3-none-any.whl
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.45.1
    Uninstalling transformers-4.45.1:
      Successfully uninstalled transformers-4.45.1
Successfully installed transformers-4.46.3


In [8]:
%%writefile inference.py

# zero-shot: preprompt + instructions
import sys

from transformers import AutoModelForCausalLM, AutoTokenizer
import argparse
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import gc
import itertools

def longest_common_prefix(strs):
    if not strs:
        return ""

    shortest = min(strs, key=len)

    for i, char in enumerate(shortest):
        for other in strs:
            if other[i] != char:
                return shortest[:i]

    return shortest


if __name__ == "__main__":

    ap = argparse.ArgumentParser()
    ap.add_argument("--device", type=str, required=True)
    ap.add_argument("--model_name", type=str, required=True)
    ap.add_argument("--quantization", type=int, required=True)
    ap.add_argument("--model_type", type=int, required=True)
    ap.add_argument("--test_file", type=str, required=True)
    args = ap.parse_args()

    if args.device == "auto":
        DEVICE_MAP = "auto"
        DEVICE = "cuda:0"
    else:
        DEVICE_MAP = {"": args.device}
        DEVICE = args.device

    llm_backbone = args.model_name

    test = pd.read_parquet(args.test_file).reset_index(drop=True)

    new_obs = []
    for idx, row in test.iterrows():
        for opt in "ABCDE":
            new_obs.append(
                (row["id"], row["context"], row["context_v2"], row["prompt"], row[opt])
            )
    df = pd.DataFrame(
        new_obs, columns=["id", "context", "context_v2", "question", "answer"]
    )

    tokenizer = AutoTokenizer.from_pretrained(
        llm_backbone,
        use_fast=True,
        trust_remote_code=True,
        padding_side="right",
        truncation_side="left",
    )

    if tokenizer.pad_token is None:
        if tokenizer.unk_token is not None:
            tokenizer.pad_token = tokenizer.unk_token
        else:
            tokenizer.pad_token = tokenizer.eos_token

    from transformers import BitsAndBytesConfig

    if args.quantization == 0:
        quantization_config = None
    elif args.quantization == 1:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True, llm_int8_threshold=0.0
        )

    model = AutoModelForCausalLM.from_pretrained(
        llm_backbone,
        torch_dtype=torch.float16,
        quantization_config=quantization_config,
        device_map=DEVICE_MAP,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    ).eval()

    # head_weights = torch.load(llm_backbone + "/head.pth", map_location="cpu")
    # hidden_size = head_weights.shape[1] # 64004

    # print('hidden_size: ', hidden_size)
    # head = torch.nn.Linear(hidden_size, 1, bias=False)

    # head.weight.data = head_weights

    hidden_size = 64004
    head = torch.nn.Linear(hidden_size, 1, bias=False)

    head.to(DEVICE).eval()

    model.config.pad_token_id = tokenizer.pad_token_id

    gc.collect()

    progress_bar = tqdm(df.iterrows(), total=len(df))

    preds = []
    instructions = []
    pooled = []
    past_key_values = None
    for idx, row in progress_bar:
        inst = f"Answer: {row['answer']}\n###\nIs this answer correct? "
        instructions.append(inst)

        if idx % 5 == 0:
            if past_key_values is not None:
                del past_key_values

            preprompt = f"{row['context_v2']}\n###\nQuestion: {row['question']}\n###\n"
            inputs = tokenizer(
                preprompt,
                return_tensors="pt",
                add_special_tokens=False,
                truncation=True,
                padding="longest",
                max_length=1024,
            )

            tok_length = (
                inputs["input_ids"].shape[1]
                + tokenizer(
                    instructions,
                    return_tensors="pt",
                    add_special_tokens=False,
                    truncation=True,
                    padding="longest",
                    max_length=1024,
                )["input_ids"].shape[1]
            )

            BATCH_SIZE = 5 # ABCDE

            with torch.no_grad():
                past_key_values = list(
                    model(input_ids=inputs["input_ids"].to(DEVICE)).past_key_values
                )

                for idx0 in range(len(past_key_values)):
                    past_key_values[idx0] = list(past_key_values[idx0])
                    for idx1 in range(len(past_key_values[idx0])):
                        past_key_values[idx0][idx1] = past_key_values[idx0][
                            idx1
                        ].expand(BATCH_SIZE, -1, -1, -1)
            del inputs

        if (idx + 1) % BATCH_SIZE == 0 or idx == len(df) - 1:
            inputs = tokenizer(
                instructions,
                return_tensors="pt",
                add_special_tokens=False,
                truncation=True,
                padding="longest",
            )

            with torch.no_grad():
                out = model(
                    input_ids=inputs["input_ids"].to(DEVICE),
                    past_key_values=past_key_values,
                ).logits #[5,57,32002]


                # ## ----------------display
                # att_idx = inputs["attention_mask"].sum(dim=1)[0] - 1
                # logit = out[3,att_idx,:]
                # soft_logit = torch.softmax(logit, dim=-1)
                # max_index = torch.argmax(soft_logit).item()
                # vocab = tokenizer.get_vocab() #[token:id]
                # id_to_token = {v:k for k,v in vocab.items()}
                # print('max_value:', id_to_token[max_index])

                # with open('id_to_token.txt', 'w') as f:
                #     for k, v in id_to_token.items():
                #         f.write(f'id: {k}, token: {v}\n')

                # with open('logit.txt', 'w') as f:
                #     for i, v in enumerate(soft_logit):
                #         f.write(f'id: {i}, prob: {v}\n')
                # sys.exit()
                # # --------------------
                
                for jjj in range(len(out)):
                    att_idx = inputs["attention_mask"].sum(dim=1)[jjj] - 1
                    pooled.append(out[jjj, att_idx, :].float().unsqueeze(0)) 

                # pooled:batch_size个[1, 32002]

            instructions = []
            del out
            del inputs

        if (idx + 1) % 5 == 0:
            with torch.no_grad():
                pooled = torch.cat(pooled) # [5, 32002]

                new_poolings = []

                # 当前答案嵌入与其余答案嵌入均值拼接
                indexes = np.arange(0, 5)
                for jj in indexes:
                    other_embeddings = pooled[
                        [jjj for jjj in indexes if jjj != jj]
                    ]
                    new_poolings.append(
                        torch.cat([pooled[jj], torch.mean(other_embeddings, dim=0)])
                    ) # 5个[64004]

                new_poolings = torch.stack(new_poolings) # [5, 64004]

                logits = head(new_poolings) # [5,1] logits=new_poolings×W⊤
                logits = logits[:, 0] # [5] 
                logits = logits.detach().cpu().numpy()

                for lg in logits:
                    preds.append(lg)

            del logits
            pooled = []

    np.save(f"scores_{llm_backbone.split('/')[-1]}_{args.test_file.split('/')[-1][:-3]}", preds)


# 最后一个token的每个vocab概率 -> 答案正确率


Writing inference.py


In [9]:
%%writefile run.sh

# python inference.py --device "auto" --model_name "/kaggle/input/openorca-mistral-7b-openorca" --quantization 0 --model_type 1 --test_file "/kaggle/input/llmse-zeroshot-gettopk/test.pq"  &&
# python inference.py --device "auto" --model_name "/kaggle/input/openorca-mistral-7b-openorca" --quantization 0 --model_type 1 --test_file "/kaggle/input/llmse-zeroshot-gettopk/test2.pq"  &&

python inference.py --device "auto" --model_name "/kaggle/input/openorca-mistral-7b-openorca" --quantization 0 --model_type 1 --test_file "test.pq"  &&
python inference.py --device "auto" --model_name "/kaggle/input/openorca-mistral-7b-openorca" --quantization 0 --model_type 1 --test_file "test2.pq"  &&

wait 
echo "All done"

# /kaggle/input/teamhydrogen-white-malamute-prompt-openorca-v2 head预训练过    

Overwriting run.sh


In [10]:
!sh run.sh

Loading checkpoint shards: 100%|██████████████████| 2/2 [01:45<00:00, 52.60s/it]
  0%|                                          | 1/1000 [00:00<15:00,  1.11it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
100%|███████████████████████████████████████| 1000/1000 [02:32<00:00,  6.56it/s]
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:16<00:00,  8.33s/it]
  0%|                                          | 1/1000 [00:00<13:34,  1.23it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
We detected that you are passing `past_key_values` as

In [11]:
import glob
from scipy.special import softmax

# test = pd.read_parquet("/kaggle/input/llmse-zeroshot-gettopk/test.pq")
test = pd.read_parquet("test.pq")
curr_scores = []

for f in glob.glob("scores_*.npy"):
    print(f)
    a = np.array(np.load(f))
    a = softmax(a.reshape(-1,5), axis=1)
    a = a.flatten()
    print(a.mean(axis=0))
    curr_scores.append(a)
    # os.remove(f)
curr_scores = np.array(curr_scores)
preds = np.nanmean(curr_scores, axis=0)

scores_openorca-mistral-7b-openorca_test.npy
0.2
scores_openorca-mistral-7b-openorca_test2.npy
0.2


In [12]:
options = "ABCDE"
indices = list(range(5))

option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}
post_proc_preds = []

for gr in range(0, len(preds), 5):
    pr = preds[gr : gr + 5]
    pr = np.argsort(-pr)[:3]
    post_proc_preds.append(" ".join([index_to_option[x] for x in pr]))
len(post_proc_preds)

200

In [13]:
test["prediction"] = post_proc_preds


In [14]:
test[["id", "prediction"]].to_csv("submission.csv", index=False)
pd.read_csv("submission.csv")

Unnamed: 0,id,prediction
0,0,D E A
1,1,E A B
2,2,E C A
3,3,A C E
4,4,A D B
...,...,...
195,195,B D E
196,196,C B A
197,197,B D C
198,198,E C B
