In [1]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold,StratifiedGroupKFold
import numpy as np
import random
import pandas as pd
import os
from __future__ import annotations
from datasets import Dataset
from transformers import AutoTokenizer, LlamaForSequenceClassification, MistralForSequenceClassification
from transformers import DataCollatorWithPadding
from sklearn.metrics import log_loss
from transformers import TrainingArguments, Trainer
from shutil import rmtree
from scipy.special import softmax
import gc
from pathlib import Path

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ['TRANSFORMERS_CACHE'] = '///mnt/c/Personal/Competitions/HFCache/'
os.environ['HF_HOME'] = '///mnt/c/Personal/Competitions/HFCache/'

In [3]:
class Config:
    # General settings
    EXP_NAME = 'nb006'
    competition_name = 'h2O_llm'
    seed = 2022 #42
    debug = False
    train = True
    n_fold = 5
    TARGET_MODEL = 'HuggingFaceH4/zephyr-7b-beta' #"mistralai/Mistral-7B-Instruct-v0.1" #"mistralai/Mistral-7B-v0.1"
    DEBUG = False
    max_len = 2048
    
CFG = Config()

In [4]:
def seed_everything(seed, use_cuda = True):
    np.random.seed(seed) # cpu vars
    torch.manual_seed(seed) # cpu  vars
    random.seed(seed) # Python
    os.environ['PYTHONHASHSEED'] = str(seed) # Python hash building
    if use_cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_everything(CFG.seed)

In [5]:
data_dir = "///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/"
OUTPUT_DIR = f'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/' + CFG.EXP_NAME + "/"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

train = pd.read_csv(data_dir + "train.csv").rename(columns={'target': 'label'})
test = pd.read_csv(data_dir + "test.csv")
sample_submission = pd.read_csv(data_dir + "sample_submission.csv")

In [6]:
train.head()

Unnamed: 0,Question,Response,label
0,Explain the concept of coevolution.,Coevolution is a biological process that occur...,3
1,Is it possible that recurring fever and chills...,"Yes, recurring fever and chills can be a sympt...",4
2,Evaluate the expression 3!,The expression 3! represents the factorial of ...,1
3,What are the roles of different types of RNA i...,1. Messenger RNA (mRNA): mRNA carries genetic ...,3
4,What is the role of gene flow in population ge...,Gene flow refers to the movement of individual...,3


In [None]:
# train['Question'] = train['Question'].str.replace('\n', '')
# train['Response'] = train['Response'].str.replace('\n', '')
# test['Question'] = test['Question'].str.replace('\n', '')
# test['Response'] = test['Response'].str.replace('\n', '')


In [7]:
gkf = StratifiedGroupKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)

for i, (_, val_index) in enumerate(gkf.split(train, train["label"],groups=train['Question'])):
    train.loc[val_index, "fold"] = i

train = train.fillna("NA")
test = test.fillna("NA")

In [8]:
train['all_text'] = 'Question: ' + train.Question + '; Answer: ' + train.Response
test['all_text'] = 'Question: ' + test.Question + '; Answer: ' + test.Response

In [9]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=[
        "q_proj",
        "v_proj"
    ],
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = softmax(predictions,axis=1)
    print(predictions)
    logloss_val = log_loss(labels, predictions)
    
    return {
        "logloss": logloss_val,
    }

In [11]:
def run(fld):
    
    tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    # LlamaForSequenceClassification(
    base_model = MistralForSequenceClassification.from_pretrained(
        CFG.TARGET_MODEL,
        num_labels=7,
        cache_dir='///mnt/c/Personal/Competitions/HFCache/',
        quantization_config=bnb_config,
        device_map={"":0})
    
    # base_model.config.pretraining_tp = 1 # 1 is 7b
    base_model.config.pad_token_id = tokenizer.pad_token_id
    model = get_peft_model(base_model, peft_config)
    model.print_trainable_parameters()

    train_df = train[train['fold']!=fld].reset_index(drop=True)
    valid_df = train[train['fold']==fld].reset_index(drop=True)

    print('train shape:',train_df.shape, 'valid shape:',valid_df.shape)
    # from pandas
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)

    def preprocess_function(examples, max_length=CFG.max_len):
        return tokenizer(examples["all_text"], 
                         truncation=True, 
                         max_length=max_length, 
                         padding=True)

    train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")    
    model_fold_dir = os.path.join(OUTPUT_DIR,str(fld)) 
    
    training_args = TrainingArguments(
        output_dir=model_fold_dir,
        learning_rate=3e-4,#5e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=32,
        max_grad_norm=0.3,
        optim='paged_adamw_32bit',
        lr_scheduler_type="cosine",
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        push_to_hub=False,
        warmup_steps=100,
        eval_steps=50,
        save_steps = 50,
        logging_steps=50,
        report_to='none' # if DEBUG else 'wandb',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized_ds,
        eval_dataset=valid_tokenized_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()

    trainer.save_model(output_dir=str(model_fold_dir))
    
    for path in Path(training_args.output_dir).glob("checkpoint-*"):
        if path.is_dir():
            rmtree(path)
            
    del trainer, model, base_model

    for i in range(5):
        torch.cuda.empty_cache()
        gc.collect()

In [12]:
for f in range(5):
    run(f)

Downloading (…)lve/main/config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,7.6189,2.658203,2.619318
100,2.0546,1.399414,1.399243
150,1.2903,1.291992,1.28916
200,1.1346,0.98584,0.986463
250,0.7783,0.95752,0.959075
300,0.6975,0.941406,0.941624
350,0.615,3.322266,1.897734
400,0.3069,1.293945,1.208485
450,0.0986,2.351562,1.629028


[[0.1106   0.005875 0.0353   ... 0.3271   0.1847   0.1193  ]
 [0.0879   0.011696 0.00208  ... 0.629    0.02641  0.10565 ]
 [0.08856  0.01159  0.02502  ... 0.3208   0.2471   0.1234  ]
 ...
 [0.4905   0.004883 0.02036  ... 0.0214   0.04913  0.263   ]
 [0.06824  0.008026 0.02342  ... 0.489    0.1583   0.1656  ]
 [0.0802   0.001928 0.00964  ... 0.2678   0.306    0.2607  ]]
[[0.1076   0.261    0.2505   ... 0.0826   0.02719  0.03152 ]
 [0.0277   0.013084 0.007633 ... 0.03165  0.862    0.02043 ]
 [0.1521   0.0769   0.1174   ... 0.5913   0.01301  0.02979 ]
 ...
 [0.0875   0.2566   0.03708  ... 0.06445  0.209    0.2644  ]
 [0.0953   0.0745   0.1704   ... 0.5977   0.015144 0.04053 ]
 [0.002062 0.012985 0.008194 ... 0.007145 0.958    0.004505]]
[[2.2266e-01 1.1774e-01 3.5278e-01 ... 1.1584e-01 4.3564e-03 2.2369e-02]
 [1.8356e-02 1.5076e-01 3.4904e-03 ... 1.0544e-02 2.4707e-01 1.8265e-02]
 [6.5039e-01 2.2545e-03 1.1023e-01 ... 2.2363e-01 8.2636e-04 1.0506e-02]
 ...
 [1.2577e-05 2.7039e-02 5.3644e-



[[0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 5.960e-08]
 [7.337e-05 2.139e-01 2.260e-04 ... 1.009e-03 7.397e-01 4.923e-05]
 [8.027e-01 5.364e-07 1.484e-01 ... 4.895e-02 5.960e-08 5.007e-06]
 ...
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [7.935e-04 0.000e+00 5.817e-03 ... 9.932e-01 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]]




[[0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 5.299e-03 5.960e-08 ... 0.000e+00 9.941e-01 0.000e+00]
 [9.922e-01 0.000e+00 8.133e-03 ... 3.093e-05 0.000e+00 0.000e+00]
 ...
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [2.412e-04 0.000e+00 1.264e-03 ... 9.980e-01 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]]




Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,6.278,1.9375,1.937405
100,2.1677,2.285156,2.213167
150,1.9429,1.824219,1.812596
200,1.8373,1.694336,1.693922
250,1.7157,1.640625,1.640927
300,1.5881,1.526367,1.526646
350,1.4591,1.474609,1.472436
400,1.367,1.420898,1.42064
450,1.208,1.396484,1.395767


[[0.1918  0.0425  0.2103  ... 0.1833  0.1136  0.1726 ]
 [0.0892  0.07043 0.2267  ... 0.1769  0.0738  0.1249 ]
 [0.0865  0.1365  0.1533  ... 0.2128  0.1355  0.0871 ]
 ...
 [0.11206 0.065   0.198   ... 0.1558  0.1442  0.1141 ]
 [0.1353  0.05103 0.1395  ... 0.4758  0.04382 0.1328 ]
 [0.10443 0.0809  0.142   ... 0.1592  0.1187  0.10724]]
[[7.0679e-02 1.3892e-01 1.5588e-01 ... 1.2396e-01 3.0469e-01 1.3123e-01]
 [6.0608e-02 1.4026e-01 1.3782e-01 ... 9.8816e-02 3.4399e-01 9.3689e-02]
 [1.8402e-02 1.5308e-01 8.2458e-02 ... 3.5339e-02 3.8062e-01 4.2206e-02]
 ...
 [1.2970e-02 1.5540e-01 7.9163e-02 ... 2.9678e-02 4.0088e-01 3.6133e-02]
 [1.8677e-01 7.6914e-04 1.4294e-01 ... 7.6180e-03 1.0908e-05 6.5186e-01]
 [9.4376e-03 1.4905e-01 5.8563e-02 ... 2.4384e-02 3.8428e-01 3.3051e-02]]
[[0.0777  0.18    0.2532  ... 0.1566  0.1519  0.1482 ]
 [0.0824  0.2194  0.3003  ... 0.1394  0.149   0.0752 ]
 [0.04868 0.2485  0.339   ... 0.03152 0.1509  0.0594 ]
 ...
 [0.06033 0.2595  0.2773  ... 0.0516  0.1729  0.08

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,6.0943,2.132812,2.01254
100,2.1119,1.994141,1.932809
150,1.2847,1.088867,1.088517
200,1.0941,1.103516,1.102558
250,0.7972,1.132812,1.134392
300,0.7591,1.033203,1.033628
350,0.3298,1.112305,1.09575
400,0.2848,0.974609,0.970695
450,0.0387,1.445312,1.341999


[[8.3313e-02 8.5526e-03 2.0623e-05 ... 3.7785e-03 2.5094e-05 9.0283e-01]
 [2.3666e-02 3.6646e-01 1.1139e-03 ... 1.0658e-02 2.3758e-02 3.4912e-02]
 [1.6220e-02 2.5757e-01 3.5205e-01 ... 9.9258e-03 2.6880e-01 8.4290e-02]
 ...
 [1.2671e-01 3.5840e-01 1.6724e-01 ... 1.0223e-01 8.9355e-02 1.2169e-02]
 [8.2324e-01 1.7761e-02 1.5221e-03 ... 8.7097e-02 1.4900e-02 3.8872e-03]
 [9.4238e-02 0.0000e+00 1.4305e-06 ... 9.0137e-01 3.7861e-03 3.8981e-04]]
[[1.2934e-04 1.2338e-04 1.6093e-06 ... 2.3365e-05 0.0000e+00 1.0000e+00]
 [3.6682e-02 7.7942e-02 5.1384e-03 ... 2.8820e-03 5.3644e-05 8.5742e-01]
 [1.6260e-03 6.4812e-03 4.7159e-04 ... 2.1782e-03 9.2480e-01 6.3965e-02]
 ...
 [2.3169e-01 1.6357e-01 1.9141e-01 ... 1.0986e-01 1.7452e-04 3.0103e-01]
 [9.3750e-01 5.9462e-04 6.4039e-04 ... 1.5640e-02 3.0014e-02 1.5701e-02]
 [1.4648e-02 5.9605e-08 1.5497e-06 ... 9.7998e-01 2.2590e-04 5.2872e-03]]
[[1.915e-01 1.702e-01 3.311e-02 ... 2.805e-02 7.957e-03 5.122e-01]
 [9.880e-04 1.641e-03 8.321e-04 ... 1.247e-04

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3185, 5) valid shape: (791, 5)


Map:   0%|          | 0/3185 [00:00<?, ? examples/s]

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,6.2076,2.121094,2.010738
100,2.0898,1.689453,1.658255
150,1.2524,1.188477,1.17613
200,1.101,1.111328,1.109068
250,0.7998,1.117188,1.111799
300,0.7283,0.944824,0.937459
350,0.3331,1.271484,1.218016
400,0.3134,1.174805,1.113628
450,0.0737,1.545898,1.354563


[[2.4918e-02 1.1841e-01 5.9662e-03 ... 1.6724e-02 2.8458e-02 5.4626e-02]
 [2.6749e-02 4.3915e-02 8.3374e-02 ... 2.8687e-02 1.1909e-02 1.7505e-01]
 [8.4543e-04 4.2877e-02 3.6216e-04 ... 3.4332e-05 5.2986e-03 1.1644e-03]
 ...
 [3.6694e-01 1.6320e-04 5.5075e-05 ... 2.1912e-01 3.6987e-01 4.4006e-02]
 [2.0248e-02 7.6416e-02 2.5589e-02 ... 4.8950e-02 2.4475e-02 7.9932e-01]
 [6.6772e-02 1.4038e-01 7.2899e-03 ... 6.5137e-01 9.7046e-03 1.0345e-01]]




[[3.3508e-02 1.0452e-02 1.2012e-01 ... 1.9646e-03 4.1351e-02 1.0400e-01]
 [1.0729e-06 3.5882e-05 1.4186e-05 ... 1.2517e-06 9.9805e-01 7.3552e-05]
 [1.1903e-04 1.2481e-04 2.1720e-04 ... 3.5167e-06 2.3174e-03 1.8339e-03]
 ...
 [6.9385e-01 6.3896e-03 1.2061e-01 ... 2.4261e-02 5.3894e-02 6.7139e-02]
 [1.0107e-01 1.2436e-03 2.7075e-01 ... 1.6467e-01 5.6152e-03 4.4946e-01]
 [1.9873e-01 2.1286e-02 3.0786e-01 ... 2.2430e-02 1.2581e-02 2.6123e-01]]
[[2.472e-03 3.088e-01 4.917e-03 ... 1.176e-03 1.487e-03 9.076e-02]
 [3.239e-03 5.539e-02 1.723e-02 ... 8.698e-03 5.195e-01 9.625e-02]
 [2.840e-04 1.357e-02 7.081e-04 ... 1.982e-04 7.701e-05 3.681e-03]
 ...
 [9.351e-02 2.676e-02 6.415e-02 ... 2.507e-01 3.635e-01 1.259e-01]
 [9.271e-02 1.181e-02 3.149e-01 ... 3.591e-01 2.419e-03 2.107e-01]
 [1.892e-01 4.758e-02 1.920e-01 ... 2.524e-01 9.636e-03 1.772e-01]]
[[4.3945e-03 5.2441e-01 5.1613e-03 ... 3.0823e-03 1.3969e-02 5.0812e-02]
 [4.3511e-06 1.4484e-05 5.3644e-07 ... 9.5367e-07 1.0000e+00 5.8413e-06]
 [



[[0.0000e+00 7.2539e-05 0.0000e+00 ... 0.0000e+00 0.0000e+00 1.4091e-04]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [3.1853e-03 4.9472e-06 2.5988e-05 ... 1.1122e-04 9.9707e-01 1.0967e-05]
 [1.2726e-02 0.0000e+00 3.3975e-06 ... 9.8730e-01 0.0000e+00 5.9605e-07]
 [7.3853e-02 0.0000e+00 1.9424e-02 ... 9.0723e-01 0.0000e+00 2.3842e-05]]


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3185, 5) valid shape: (791, 5)


Map:   0%|          | 0/3185 [00:00<?, ? examples/s]

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.9406,2.626953,2.444725
100,2.1209,1.535156,1.519947
150,1.17,1.206055,1.202628
200,1.09,1.224609,1.212254
250,0.7254,1.016602,1.014004
300,0.725,0.900391,0.889808
350,0.3557,1.144531,1.087595
400,0.2968,1.145508,1.074275
450,0.0697,1.741211,1.373249


[[0.2896   0.6274   0.009834 ... 0.0279   0.01455  0.001156]
 [0.4375   0.00735  0.05228  ... 0.2893   0.006145 0.198   ]
 [0.654    0.08026  0.00823  ... 0.05472  0.1084   0.02074 ]
 ...
 [0.82     0.01587  0.01     ... 0.03867  0.07745  0.004765]
 [0.03464  0.904    0.003551 ... 0.003782 0.0202   0.00935 ]
 [0.1819   0.132    0.007713 ... 0.002184 0.01087  0.0478  ]]
[[0.0656   0.6836   0.05624  ... 0.0758   0.004147 0.00615 ]
 [0.371    0.1476   0.06033  ... 0.1936   0.05667  0.10736 ]
 [0.1848   0.216    0.1532   ... 0.07794  0.01245  0.04733 ]
 ...
 [0.29     0.1377   0.1661   ... 0.1935   0.009155 0.0554  ]
 [0.1376   0.521    0.0631   ... 0.04535  0.011375 0.08075 ]
 [0.1146   0.2236   0.04385  ... 0.01557  0.004375 0.1884  ]]
[[3.458e-02 8.203e-02 3.047e-01 ... 5.483e-01 1.884e-04 5.272e-03]
 [3.032e-01 7.256e-03 1.771e-01 ... 4.612e-01 1.638e-03 3.726e-02]
 [1.644e-01 2.229e-02 2.098e-01 ... 4.966e-01 9.737e-04 5.316e-02]
 ...
 [1.020e-01 9.979e-03 2.004e-01 ... 6.396e-01 5.97



[[4.2038e-03 2.2054e-06 9.7803e-01 ... 1.7639e-02 0.0000e+00 0.0000e+00]
 [6.8018e-01 3.9935e-06 1.0638e-01 ... 2.1167e-01 7.0751e-05 1.4648e-03]
 [8.0322e-02 1.4305e-05 7.6172e-01 ... 1.5784e-01 6.5565e-07 7.1526e-05]
 ...
 [1.1993e-01 2.6345e-04 1.8372e-01 ... 6.8750e-01 5.9605e-08 6.9580e-03]
 [8.8215e-04 9.9805e-01 1.8346e-04 ... 9.5367e-07 0.0000e+00 7.4005e-04]
 [3.4571e-06 6.5029e-05 5.3644e-07 ... 0.0000e+00 0.0000e+00 1.0000e+00]]




[[2.646e-05 0.000e+00 1.000e+00 ... 9.763e-05 0.000e+00 0.000e+00]
 [9.316e-01 0.000e+00 1.019e-02 ... 5.817e-02 3.576e-07 4.172e-07]
 [1.634e-02 0.000e+00 9.282e-01 ... 5.533e-02 0.000e+00 0.000e+00]
 ...
 [5.127e-02 1.669e-06 2.791e-01 ... 6.699e-01 0.000e+00 1.192e-07]
 [7.153e-06 1.000e+00 9.537e-07 ... 0.000e+00 0.000e+00 2.503e-06]
 [0.000e+00 3.022e-05 0.000e+00 ... 0.000e+00 0.000e+00 1.000e+00]]




In [13]:
gc.collect()

0

In [14]:
torch.cuda.empty_cache()

In [15]:
def preprocess_function(examples, max_length=CFG.max_len):
    return tokenizer(examples["all_text"], 
                     truncation=True, 
                     max_length=max_length, 
                     padding=True)
tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

test_ds = Dataset.from_pandas(test)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [16]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [17]:
base_model = MistralForSequenceClassification.from_pretrained(
    CFG.TARGET_MODEL,
    num_labels=7,
    cache_dir='///mnt/c/Personal/Competitions/HFCache/',
    quantization_config=bnb_config,
    device_map={"":0})

base_model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import scipy
out = np.zeros((1001, 7))
for f in [0,2,3,4]:#(range(5)): fold 1 was really bad
    print(f'----------- Fold: {f} ----------')
    model = PeftModel.from_pretrained(base_model, f'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/nb006/{f}/')

    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    probits = scipy.special.softmax(logits,1)

    out += probits/4
    del trainer, model
    torch.cuda.empty_cache()    
    gc.collect()

----------- Fold: 0 ----------


----------- Fold: 2 ----------


----------- Fold: 3 ----------


----------- Fold: 4 ----------


In [19]:
out

array([[2.42165565e-01, 9.00506973e-03, 6.56509399e-02, ...,
        5.36315918e-01, 1.26384676e-01, 1.90911293e-02],
       [1.37627125e-04, 4.68301773e-03, 5.76376915e-05, ...,
        2.79188156e-04, 9.94506836e-01, 4.95254993e-04],
       [1.19209290e-07, 5.96046448e-07, 5.96046448e-08, ...,
        4.76837158e-07, 1.00000000e+00, 5.96046448e-08],
       ...,
       [4.92553711e-02, 1.31215096e-01, 3.27293396e-01, ...,
        2.95013428e-01, 1.61668360e-01, 3.06643844e-02],
       [3.54125977e-01, 1.47848129e-02, 3.08013916e-01, ...,
        3.07586670e-01, 2.63869762e-04, 1.32680535e-02],
       [2.23785639e-03, 9.34197903e-02, 1.60312653e-03, ...,
        1.09428167e-03, 5.06639481e-05, 9.01489258e-01]])

In [20]:
sample_submission.iloc[:,1:] = out

In [21]:
OUTPUT_DIR

'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/nb006/'

In [22]:
sample_submission.to_csv(f'{OUTPUT_DIR}/submit.csv',index=False)

### Smaller max len inference

In [30]:
def preprocess_function(examples, max_length=1536):
    return tokenizer(examples["all_text"], 
                     truncation=True, 
                     max_length=max_length, 
                     padding=True)
tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

test_ds = Dataset.from_pandas(test)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [31]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [32]:
base_model = MistralForSequenceClassification.from_pretrained(
    CFG.TARGET_MODEL,
    num_labels=7,
    cache_dir='///mnt/c/Personal/Competitions/HFCache/',
    quantization_config=bnb_config,
    device_map={"":0})

base_model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at HuggingFaceH4/zephyr-7b-beta and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
import scipy
out = np.zeros((1001, 7))
for f in [0,2,3,4]:#(range(5)): fold 1 was really bad
    print(f'----------- Fold: {f} ----------')
    model = PeftModel.from_pretrained(base_model, f'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/nb006/{f}/')

    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    probits = scipy.special.softmax(logits,1)

    out += probits/4
    del trainer, model
    torch.cuda.empty_cache()    
    gc.collect()

----------- Fold: 0 ----------


----------- Fold: 2 ----------


----------- Fold: 3 ----------


----------- Fold: 4 ----------


In [34]:
sample_submission.iloc[:,1:] = out

In [35]:
# sample_submission.to_csv(f'{OUTPUT_DIR}/submit_maxlenX2.csv',index=False)
sample_submission.to_csv(f'{OUTPUT_DIR}/submit_maxlen_1536.csv',index=False)