In [1]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType # type: ignore
from transformers import BitsAndBytesConfig
import torch
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold,StratifiedGroupKFold
import numpy as np
import random
import pandas as pd
import os
from __future__ import annotations
from datasets import Dataset
from transformers import AutoTokenizer, LlamaForSequenceClassification, MistralForSequenceClassification
from transformers import DataCollatorWithPadding
from sklearn.metrics import log_loss
from transformers import TrainingArguments, Trainer
from shutil import rmtree
from scipy.special import softmax
import gc
from pathlib import Path

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ['TRANSFORMERS_CACHE'] = '///mnt/c/Personal/Competitions/HFCache/'
os.environ['HF_HOME'] = '///mnt/c/Personal/Competitions/HFCache/'

In [3]:
class Config:
    # General settings
    EXP_NAME = 'nb005'
    competition_name = 'h2O_llm'
    seed = 2022 #42
    debug = False
    train = True
    n_fold = 5
    TARGET_MODEL = "mistralai/Mistral-7B-Instruct-v0.1" #"mistralai/Mistral-7B-v0.1"
    DEBUG = False
    max_len = 2048
    
CFG = Config()

In [4]:
def seed_everything(seed, use_cuda = True):
    np.random.seed(seed) # cpu vars
    torch.manual_seed(seed) # cpu  vars
    random.seed(seed) # Python
    os.environ['PYTHONHASHSEED'] = str(seed) # Python hash building
    if use_cuda:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed) # gpu vars
        torch.backends.cudnn.deterministic = True  #needed
        torch.backends.cudnn.benchmark = False

seed_everything(CFG.seed)

In [5]:
data_dir = "///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/"
OUTPUT_DIR = f'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/' + CFG.EXP_NAME + "/"
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

train = pd.read_csv(data_dir + "train.csv").rename(columns={'target': 'label'})
test = pd.read_csv(data_dir + "test.csv")
sample_submission = pd.read_csv(data_dir + "sample_submission.csv")

In [6]:
train.head()

Unnamed: 0,Question,Response,label
0,Explain the concept of coevolution.,Coevolution is a biological process that occur...,3
1,Is it possible that recurring fever and chills...,"Yes, recurring fever and chills can be a sympt...",4
2,Evaluate the expression 3!,The expression 3! represents the factorial of ...,1
3,What are the roles of different types of RNA i...,1. Messenger RNA (mRNA): mRNA carries genetic ...,3
4,What is the role of gene flow in population ge...,Gene flow refers to the movement of individual...,3


In [7]:
# train['Question'] = train['Question'].str.replace('\n', '')
# train['Response'] = train['Response'].str.replace('\n', '')
# test['Question'] = test['Question'].str.replace('\n', '')
# test['Response'] = test['Response'].str.replace('\n', '')


In [8]:
gkf = StratifiedGroupKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)

for i, (_, val_index) in enumerate(gkf.split(train, train["label"],groups=train['Question'])):
    train.loc[val_index, "fold"] = i

train = train.fillna("NA")
test = test.fillna("NA")

In [9]:
train['all_text'] = 'Question: ' + train.Question + '; Answer: ' + train.Response
test['all_text'] = 'Question: ' + test.Question + '; Answer: ' + test.Response

In [10]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    target_modules=[
        "q_proj",
        "v_proj"
    ],
)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = softmax(predictions,axis=1)
    print(predictions)
    logloss_val = log_loss(labels, predictions)
    
    return {
        "logloss": logloss_val,
    }

In [12]:
def run(fld):
    
    tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)
    tokenizer.pad_token = tokenizer.eos_token
    # LlamaForSequenceClassification(
    base_model = MistralForSequenceClassification.from_pretrained(
        CFG.TARGET_MODEL,
        num_labels=7,
        cache_dir='///mnt/c/Personal/Competitions/HFCache/',
        quantization_config=bnb_config,
        device_map={"":0})
    
    # base_model.config.pretraining_tp = 1 # 1 is 7b
    base_model.config.pad_token_id = tokenizer.pad_token_id
    model = get_peft_model(base_model, peft_config)
    model.print_trainable_parameters()

    train_df = train[train['fold']!=fld].reset_index(drop=True)
    valid_df = train[train['fold']==fld].reset_index(drop=True)

    print('train shape:',train_df.shape, 'valid shape:',valid_df.shape)
    # from pandas
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)

    def preprocess_function(examples, max_length=CFG.max_len):
        return tokenizer(examples["all_text"], 
                         truncation=True, 
                         max_length=max_length, 
                         padding=True)

    train_tokenized_ds = train_ds.map(preprocess_function, batched=True)
    valid_tokenized_ds = valid_ds.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")    
    model_fold_dir = os.path.join(OUTPUT_DIR,str(fld)) 
    
    training_args = TrainingArguments(
        output_dir=model_fold_dir,
        learning_rate=3e-4,#5e-5,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=32,
        max_grad_norm=0.3,
        optim='paged_adamw_32bit',
        lr_scheduler_type="cosine",
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="steps",
        save_strategy="steps",
        load_best_model_at_end=True,
        push_to_hub=False,
        warmup_steps=100,
        eval_steps=50,
        save_steps = 50,
        logging_steps=50,
        report_to='none' # if DEBUG else 'wandb',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized_ds,
        eval_dataset=valid_tokenized_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()

    trainer.save_model(output_dir=str(model_fold_dir))
    
    for path in Path(training_args.output_dir).glob("checkpoint-*"):
        if path.is_dir():
            rmtree(path)
            
    del trainer, model, base_model

    for i in range(5):
        torch.cuda.empty_cache()
        gc.collect()

In [13]:
for f in range(5):
    run(f)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.5229,1.916992,1.849874
100,1.509,1.147461,1.146088
150,0.9607,1.045898,1.037506
200,0.9624,1.024414,1.025505
250,0.5267,0.999023,0.99266
300,0.4911,1.101562,1.074148
350,0.1582,1.369141,1.262734
400,0.1257,1.530273,1.358043
450,0.0148,2.164062,1.625505


[[9.4452e-03 2.9126e-01 2.1362e-01 ... 5.4207e-03 1.3535e-02 1.5649e-01]
 [3.3295e-02 4.1847e-03 1.2268e-01 ... 1.0996e-03 8.2910e-01 7.5150e-03]
 [4.3921e-01 4.0680e-02 2.0178e-01 ... 2.7783e-01 1.5808e-02 2.2064e-02]
 ...
 [8.0615e-01 2.6684e-03 2.9588e-04 ... 1.4305e-06 6.2561e-02 1.2805e-01]
 [1.6064e-01 1.1511e-01 6.0547e-01 ... 9.4666e-02 6.8893e-03 1.5312e-02]
 [8.7357e-03 4.9171e-03 6.3538e-02 ... 2.1000e-03 9.0137e-01 1.6388e-02]]
[[3.630e-05 1.018e-02 2.537e-04 ... 5.442e-05 2.164e-05 1.062e-02]
 [2.399e-02 2.859e-01 1.635e-03 ... 3.304e-03 6.699e-01 5.890e-03]
 [1.285e-01 2.983e-03 1.022e-01 ... 7.563e-01 2.243e-03 7.652e-03]
 ...
 [2.532e-01 2.258e-02 1.840e-03 ... 3.010e-03 6.250e-01 9.308e-02]
 [7.373e-02 2.684e-03 9.747e-02 ... 8.237e-01 4.230e-04 1.788e-03]
 [9.280e-05 2.171e-03 1.249e-04 ... 2.902e-04 9.961e-01 1.166e-03]]
[[4.719e-03 2.448e-02 1.529e-02 ... 6.622e-05 1.192e-07 3.583e-02]
 [6.706e-05 4.599e-04 2.754e-05 ... 5.388e-05 9.990e-01 4.690e-04]
 [6.011e-01 2.



[[4.351e-06 4.941e-05 7.749e-07 ... 0.000e+00 0.000e+00 6.285e-04]
 [6.557e-07 2.980e-05 5.960e-08 ... 5.960e-08 1.000e+00 3.576e-07]
 [4.126e-02 2.384e-07 3.403e-01 ... 6.182e-01 5.960e-08 5.186e-06]
 ...
 [1.754e-04 7.808e-06 5.960e-08 ... 5.960e-08 1.000e+00 1.925e-05]
 [9.895e-03 2.980e-07 5.068e-01 ... 4.834e-01 1.192e-07 2.742e-06]
 [2.384e-07 1.195e-02 4.691e-05 ... 0.000e+00 9.863e-01 1.415e-03]]
[[0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 2.325e-06]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.330e-03 5.960e-08 5.845e-01 ... 4.143e-01 0.000e+00 5.960e-08]
 ...
 [5.960e-08 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [8.833e-05 5.960e-08 9.048e-01 ... 9.576e-02 0.000e+00 2.980e-07]
 [0.000e+00 5.960e-08 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]]




[[0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [7.7332e-02 0.0000e+00 4.8120e-01 ... 4.4141e-01 0.0000e+00 0.0000e+00]
 ...
 [1.1921e-07 0.0000e+00 0.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [3.9601e-04 0.0000e+00 8.9111e-01 ... 1.0815e-01 0.0000e+00 0.0000e+00]
 [0.0000e+00 5.9605e-08 0.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]]




[[0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [3.080e-03 0.000e+00 9.023e-01 ... 9.509e-02 0.000e+00 0.000e+00]
 ...
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [1.490e-06 0.000e+00 9.912e-01 ... 8.713e-03 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]]




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.162,1.914062,1.808452
100,1.5003,1.230469,1.225307
150,1.0398,1.182617,1.183256
200,0.9682,0.89502,0.896527
250,0.5784,0.95166,0.953653
300,0.581,0.956055,0.941454
350,0.1964,1.228516,1.16093
400,0.137,1.518555,1.322069
450,0.0134,2.052734,1.578867


[[1.3684e-01 1.1835e-03 7.3059e-02 ... 6.8115e-01 1.0300e-02 9.3811e-02]
 [1.7258e-02 4.1962e-03 6.8542e-02 ... 7.0020e-01 2.4281e-03 3.5492e-02]
 [1.0864e-02 1.4087e-01 5.3772e-02 ... 9.8938e-02 2.5220e-01 7.3059e-02]
 ...
 [1.3504e-03 6.3658e-05 1.8716e-04 ... 3.5214e-04 9.9414e-01 3.9124e-04]
 [2.2839e-01 2.4887e-02 1.4514e-01 ... 4.0527e-02 3.3813e-01 2.2180e-01]
 [7.2876e-02 1.1768e-01 2.2864e-01 ... 7.8796e-02 7.8613e-02 1.5027e-01]]
[[9.8511e-02 1.2726e-02 4.4312e-02 ... 8.9722e-02 5.3062e-03 7.4268e-01]
 [6.9458e-02 1.9324e-01 1.5955e-01 ... 1.2079e-01 2.3056e-02 5.1544e-02]
 [3.5038e-03 8.7354e-01 1.5762e-02 ... 3.7003e-03 2.7809e-03 2.5482e-02]
 ...
 [1.6809e-05 1.1963e-04 1.0073e-05 ... 2.2054e-06 1.0000e+00 6.3002e-05]
 [2.4817e-01 1.5039e-01 1.0669e-01 ... 1.4343e-01 7.4341e-02 2.6099e-01]
 [4.8518e-04 9.2334e-01 6.7291e-03 ... 1.0815e-03 2.0957e-04 2.2751e-02]]
[[2.3877e-01 1.1147e-02 3.1567e-01 ... 1.7773e-01 2.8858e-03 2.5122e-01]
 [1.3824e-02 2.4280e-01 4.5068e-01 ... 



[[9.961e-01 0.000e+00 2.113e-03 ... 2.016e-03 0.000e+00 1.079e-05]
 [0.000e+00 1.013e-06 0.000e+00 ... 0.000e+00 3.850e-05 0.000e+00]
 [1.297e-03 2.560e-03 9.854e-01 ... 1.597e-05 9.888e-03 8.696e-05]
 ...
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [9.873e-01 8.941e-07 6.676e-06 ... 1.243e-02 2.384e-07 7.749e-07]
 [7.927e-06 9.595e-01 2.071e-02 ... 5.364e-06 2.861e-06 1.361e-03]]




[[1.000e+00 0.000e+00 2.980e-07 ... 1.431e-06 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [2.592e-04 5.144e-05 1.000e+00 ... 5.960e-08 4.172e-06 1.550e-06]
 ...
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [9.990e-01 0.000e+00 5.960e-08 ... 1.017e-03 0.000e+00 0.000e+00]
 [3.576e-07 9.922e-01 5.884e-04 ... 5.960e-08 0.000e+00 4.232e-05]]




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3178, 5) valid shape: (798, 5)


Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/798 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.1684,1.970703,1.877719
100,1.6047,1.194336,1.195062
150,1.0162,1.102539,1.1031
200,0.9432,0.919434,0.920444
250,0.5633,1.105469,1.106163
300,0.5552,1.063477,1.050746
350,0.1904,1.34375,1.298372
400,0.1427,1.345703,1.264475
450,0.0184,1.988281,1.607474


[[2.957e-02 7.416e-03 6.419e-05 ... 1.402e-02 8.762e-06 9.492e-01]
 [6.677e-02 2.832e-02 3.670e-03 ... 4.498e-03 2.068e-02 5.428e-03]
 [7.898e-02 5.698e-02 3.622e-02 ... 1.408e-02 7.051e-01 7.288e-02]
 ...
 [8.490e-02 4.641e-01 6.171e-02 ... 2.328e-02 8.704e-02 7.080e-02]
 [2.720e-01 1.436e-01 7.362e-03 ... 7.626e-03 5.283e-01 1.223e-02]
 [5.684e-03 0.000e+00 6.855e-06 ... 9.941e-01 2.980e-07 3.862e-05]]
[[0.1016   0.4114   0.01233  ... 0.02687  0.0132   0.4294  ]
 [0.02113  0.01974  0.01482  ... 0.004932 0.003391 0.2642  ]
 [0.00398  0.005352 0.00196  ... 0.001181 0.9844   0.00246 ]
 ...
 [0.1699   0.2025   0.2793   ... 0.1523   0.10706  0.07666 ]
 [0.2815   0.08563  0.04202  ... 0.02112  0.5396   0.02394 ]
 [0.3433   0.04993  0.01445  ... 0.3562   0.01628  0.2079  ]]
[[1.3306e-02 8.3936e-01 2.7447e-03 ... 5.6992e-03 2.1973e-03 1.1884e-01]
 [6.5565e-06 9.3877e-05 1.0133e-06 ... 1.7881e-07 5.9605e-07 1.8477e-05]
 [4.9829e-05 6.0883e-03 3.1781e-04 ... 7.3075e-05 9.9219e-01 2.4533e-04]
 



[[0.000e+00 4.009e-03 0.000e+00 ... 0.000e+00 0.000e+00 9.961e-01]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [0.000e+00 1.000e+00 2.623e-06 ... 0.000e+00 0.000e+00 1.252e-06]
 [1.000e+00 3.576e-07 5.269e-05 ... 1.911e-04 1.079e-05 0.000e+00]
 [2.927e-05 1.311e-01 7.749e-07 ... 8.613e-01 0.000e+00 4.364e-03]]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3185, 5) valid shape: (791, 5)


Map:   0%|          | 0/3185 [00:00<?, ? examples/s]

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.0585,1.90625,1.8153
100,1.5189,1.06543,1.065367
150,0.9763,1.251953,1.247175
200,1.0012,0.92334,0.92209
250,0.5694,1.05957,1.05433
300,0.5516,0.983887,0.970532
350,0.1604,1.626953,1.407686
400,0.1804,1.385742,1.256275
450,0.0248,1.946289,1.521751


[[2.5558e-02 5.6836e-01 4.1656e-02 ... 1.9073e-02 3.9307e-02 1.7603e-01]
 [1.3893e-02 1.0327e-01 4.2053e-02 ... 1.1726e-02 3.1934e-01 9.4788e-02]
 [9.2545e-03 2.1472e-01 1.1475e-02 ... 1.2045e-03 1.2207e-02 3.2684e-02]
 ...
 [8.4473e-01 8.1682e-04 5.8060e-03 ... 7.3853e-02 3.3600e-02 4.0680e-02]
 [8.8196e-02 1.7700e-01 8.8989e-02 ... 2.9248e-01 1.4847e-02 3.3667e-01]
 [3.1299e-01 3.0441e-03 3.4912e-02 ... 5.3467e-01 1.9350e-03 1.1292e-01]]
[[3.035e-04 5.563e-02 3.077e-03 ... 1.587e-04 1.484e-05 6.989e-03]
 [1.192e-07 5.960e-07 5.960e-08 ... 0.000e+00 1.000e+00 1.788e-07]
 [5.484e-06 2.621e-03 7.403e-05 ... 8.345e-07 1.073e-06 9.966e-05]
 ...
 [1.729e-01 5.890e-02 5.240e-02 ... 1.259e-01 2.546e-01 2.969e-01]
 [4.707e-01 7.462e-03 1.274e-01 ... 2.417e-01 1.401e-03 1.501e-01]
 [2.695e-01 2.596e-02 8.868e-02 ... 3.237e-01 2.790e-03 2.593e-01]]
[[2.9683e-04 2.7740e-02 2.5332e-05 ... 3.7551e-06 3.3319e-05 7.9468e-02]
 [7.7486e-07 3.4809e-05 2.5630e-06 ... 1.7881e-07 1.0000e+00 1.1384e-05]
 [



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 6,873,088 || all params: 7,117,533,184 || trainable%: 0.09656559122829936
train shape: (3185, 5) valid shape: (791, 5)


Map:   0%|          | 0/3185 [00:00<?, ? examples/s]

Map:   0%|          | 0/791 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Logloss
50,5.0554,2.103516,1.97011
100,1.5154,1.21875,1.214971
150,1.0154,1.085938,1.083193
200,0.9218,1.141602,1.125524
250,0.5743,1.016602,0.995805
300,0.5555,0.92334,0.924513
350,0.2179,1.425781,1.326373
400,0.1589,1.5,1.321854
450,0.0254,2.128906,1.550182


[[0.05463  0.00801  0.2825   ... 0.6025   0.03513  0.0127  ]
 [0.01968  0.0009   0.02263  ... 0.477    0.03363  0.4363  ]
 [0.08716  0.00493  0.1628   ... 0.2496   0.1683   0.3193  ]
 ...
 [0.1538   0.01714  0.1248   ... 0.2786   0.04556  0.3704  ]
 [0.01119  0.563    0.0837   ... 0.0488   0.001907 0.2876  ]
 [0.1675   0.3242   0.04156  ... 0.02277  0.053    0.1808  ]]
[[1.0736e-01 4.1089e-01 1.9226e-01 ... 2.3132e-01 1.5774e-03 1.8524e-02]
 [2.0630e-01 1.0565e-01 2.1082e-01 ... 1.7664e-01 2.8091e-02 1.9507e-01]
 [2.9053e-01 7.7698e-02 1.5186e-01 ... 1.2732e-01 1.5612e-03 2.1692e-01]
 ...
 [3.0542e-01 6.0852e-02 1.4673e-01 ... 1.9409e-01 1.1454e-03 2.1484e-01]
 [1.9653e-02 5.0879e-01 3.0777e-02 ... 5.5847e-03 1.3185e-04 4.2993e-01]
 [2.4399e-02 2.3779e-01 1.8387e-02 ... 1.3494e-03 7.8011e-03 3.0908e-01]]
[[4.9011e-02 9.8953e-03 2.9224e-01 ... 6.3916e-01 2.5826e-03 2.4486e-04]
 [1.6125e-01 2.1011e-02 2.3047e-01 ... 3.9966e-01 1.3574e-01 2.9694e-02]
 [1.8408e-01 5.7129e-02 2.8735e-01 ...

In [15]:
gc.collect()

0

In [17]:
torch.cuda.empty_cache()

In [23]:
def preprocess_function(examples, max_length=CFG.max_len):
    return tokenizer(examples["all_text"], 
                     truncation=True, 
                     max_length=max_length, 
                     padding=True)
tokenizer = AutoTokenizer.from_pretrained(CFG.TARGET_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

test_ds = Dataset.from_pandas(test)
test_tokenized_ds = test_ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [48]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [49]:
base_model = MistralForSequenceClassification.from_pretrained(
    CFG.TARGET_MODEL,
    num_labels=7,
    cache_dir='///mnt/c/Personal/Competitions/HFCache/',
    quantization_config=bnb_config,
    device_map={"":0})

base_model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
import scipy
out = np.zeros((1001, 7))
for f in (range(5)):
    print(f'----------- Fold: {f} ----------')
    model = PeftModel.from_pretrained(base_model, f'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/nb005/{f}/')

    trainer = Trainer(model=model,
                      tokenizer=tokenizer,
                      data_collator=data_collator)
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions
    probits = scipy.special.softmax(logits,1)

    out += probits/5
    del trainer, model
    torch.cuda.empty_cache()    
    gc.collect()

----------- Fold: 0 ----------


----------- Fold: 1 ----------


----------- Fold: 2 ----------


----------- Fold: 3 ----------


----------- Fold: 4 ----------


In [52]:
out

array([[3.57788086e-01, 2.13571787e-02, 1.42257690e-01, ...,
        3.51562500e-01, 5.91926575e-02, 5.65297604e-02],
       [1.32858753e-04, 3.08442116e-03, 3.81469727e-05, ...,
        1.68383121e-04, 9.94873047e-01, 1.45876408e-03],
       [3.57627869e-07, 1.60932541e-06, 3.57627869e-07, ...,
        4.17232513e-07, 9.99755859e-01, 1.37090683e-06],
       ...,
       [1.96762085e-01, 1.01788521e-01, 2.78015137e-01, ...,
        3.66149902e-01, 1.05845928e-02, 3.69189978e-02],
       [2.90420532e-01, 3.60937119e-02, 2.08209991e-01, ...,
        4.19372559e-01, 1.63280964e-03, 3.95431519e-02],
       [1.06906891e-03, 2.99870968e-03, 5.77747822e-04, ...,
        1.06978416e-03, 1.09314919e-04, 9.93896484e-01]])

In [55]:
sample_submission.iloc[:,1:] = out

In [58]:
OUTPUT_DIR

'///mnt/c/Personal/Competitions/Kaggle/h2oai-predict-the-llm/runs/nb005/'

In [59]:
sample_submission.to_csv(f'{OUTPUT_DIR}/submit.csv',index=False)