In [1]:
import torch

if torch.cuda.is_available():
    print("GPU is available")
else:
    print("GPU is not available")

GPU is available


In [2]:
!pip install evaluate 

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1


In [3]:
torch.cuda.get_device_name(0)

'Tesla P100-PCIE-16GB'

In [4]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    AutoTokenizer,
    set_seed,
    LlamaTokenizer,
    LlamaForSequenceClassification,
    ElectraForSequenceClassification,
    ElectraTokenizerFast,
    ElectraModel
)
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging

2024-03-24 21:53:23.452599: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-24 21:53:23.452717: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-24 21:53:23.599063: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [5]:
import torch

data = pd.read_json('/kaggle/input/subtaska/subtaskA_train_monolingual.jsonl', lines = True)
df = data[['text','label']]


In [6]:
df = df.reset_index(drop=True)

In [8]:

huggingdata = Dataset.from_pandas(df)

In [9]:
huggingdata 

Dataset({
    features: ['text', 'label'],
    num_rows: 119757
})

In [13]:
df_train, df_test = train_test_split(df,test_size=None, shuffle = True)

In [14]:
from datasets import Dataset

# from pandas
train_ds = Dataset.from_pandas(df_train)
test_ds = Dataset.from_pandas(df_test)

In [17]:
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy_val = accuracy_score(labels, predictions)
    roc_auc_val = roc_auc_score(labels, predictions)
    f1_score_val_macro = f1_score(labels, predictions, average = "macro")
    f1_score_val_micro = f1_score(labels, predictions, average = "micro")
    
    return {
        "accuracy": accuracy_val,
        "roc_auc": roc_auc_val,
        "f1_score_val_macro" : f1_score_val_macro,
        "f1_score_val_micro" : f1_score_val_micro
    }

In [18]:
from datasets import Dataset
import pandas as pd
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorWithPadding, AutoTokenizer, set_seed, BitsAndBytesConfig
import os
from sklearn.model_selection import train_test_split
from scipy.special import softmax
import argparse
import logging
import datetime
import bitsandbytes as bnb
from peft import LoraConfig, PeftConfig, PeftModel, AutoPeftModelForCausalLM, TaskType, AutoPeftModelForSequenceClassification, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
from datasets import disable_caching
import torch
import torch.nn.functional as F

disable_caching()

def preprocess_function(examples, **fn_kwargs):
    return fn_kwargs['tokenizer'](examples["text"], truncation=True)


def get_data(train_path, test_path, random_seed):
    """
    function to read dataframe with columns
    """

    train_df = pd.read_json(train_path, lines=True)
    test_df = pd.read_json(test_path, lines=True)
    
    train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=random_seed)

    return train_df, val_df, test_df

def compute_metrics(eval_pred):

    f1_metric = evaluate.load("f1")

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    results = {}
    results.update(f1_metric.compute(predictions=predictions, references = labels, average="micro"))

    return results


class CustomTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")

        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # compute custom loss
        loss = F.binary_cross_entropy_with_logits(logits[:,1], labels.to(torch.float32))#, pos_weight=self.label_weights)
        return (loss, outputs) if return_outputs else loss

def find_all_linear_names(model):
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if "lm_head" in lora_module_names:  # needed for 16-bit
        lora_module_names.remove("lm_head")
    return list(lora_module_names)

def fine_tune(train_df, valid_df, checkpoints_path, id2label, label2id, model):
    checkpoints_path = "abc" + checkpoints_path
    # pandas dataframe to huggingface Dataset
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)
    
    train_dataset = Dataset.from_pandas(train_df)
    valid_dataset = Dataset.from_pandas(valid_df)
    
    floatorbfloat = torch.float16
    if 'lama' in model:
        floatorbfloat = torch.bfloat16
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=floatorbfloat,
    )
    
    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    
    model_name = model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
        num_labels=len(label2id), id2label=id2label, label2id=label2id
    )
    model.config.use_cache = False
    
    #DM added
    if tokenizer.pad_token is None:
      if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
      else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
    try:
      model.config.pad_token_id = tokenizer.get_vocab()[tokenizer.pad_token]
    except:
      print("Warning: Exception occured while setting pad_token_id")
    
    # tokenize data for train/valid
    tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    tokenized_valid_dataset = valid_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    lora_alpha = 16
    lora_dropout = 0.1
    lora_r = 64
    
    target_modules=[]
    if 'falcon' in model_name:
      target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]
    elif 'mistral' in model_name:
      target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
    else:
      target_modules=find_all_linear_names(model)
    
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type=TaskType.SEQ_CLS,
        #task_type="CAUSAL_LM",
        target_modules=target_modules,
        modules_to_save=["score"]
    )
    
    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    
    output_dir = checkpoints_path + "abc"
    per_device_train_batch_size = 16 #4
    gradient_accumulation_steps = 4
    optim = "paged_adamw_32bit"
    save_steps = 1000 #10
    logging_steps = 1000 #10
    learning_rate = 2e-5 #2e-4
    max_grad_norm = 0.3
    max_steps = 10 #500
    num_train_epochs=1 #added
    warmup_ratio = 0.03
    lr_scheduler_type = "constant"
    fp16 = True
    bf16 = False
    
    if 'lama' in model_name:
        fp16 = False
        bf16 = True
    
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        fp16=fp16,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        #max_steps=max_steps, #for testing
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        group_by_length=True,
        lr_scheduler_type=lr_scheduler_type,
        gradient_checkpointing=True,
        load_best_model_at_end=True,
        evaluation_strategy="steps",
    )
    
    max_seq_length = 512

    trainer = CustomTrainer(
        model=model,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_valid_dataset,
        #peft_config=peft_config,
        #dataset_text_field="text",
        #max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    for name, module in trainer.model.named_modules():
        if "norm" in name:
            module = module.to(torch.float32)

    trainer.train()

    # save best model
    best_model_path = checkpoints_path+'/best/'
    
    if not os.path.exists(best_model_path):
        os.makedirs(best_model_path)
    
    trainer.save_model(best_model_path)
    trainer.model.save_pretrained(best_model_path)
    tokenizer.save_pretrained(best_model_path)
#     torch.save(trainer.model.score.state_dict(), f'{best_model_path}/score-params.pt')
    tokenized_train_dataset.cleanup_cache_files()
    tokenized_valid_dataset.cleanup_cache_files()
    
    return #skip merging
    print('Merging model...')
    model_temp = AutoPeftModelForSequenceClassification.from_pretrained(
    #model_temp = AutoPeftModelForCausalLM.from_pretrained(
        best_model_path,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
    )
    model_temp = model_temp.merge_and_unload()        
    model_temp.save_pretrained(
       best_model_path, safe_serialization=True, max_shard_size="2GB"
    )


def test(test_df, model_path, id2label, label2id):
    print('Loading model for predictions...')
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    
    # load best model
    model = AutoModelForSequenceClassification.from_pretrained(
       model_path, trust_remote_code=True, num_labels=len(label2id),ignore_mismatched_sizes=True, id2label=id2label, label2id=label2id, torch_dtype=torch.float16
    )
    
    #DM added
    if tokenizer.pad_token is None:
      if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
      else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=32)
    try:
      model.config.pad_token_id = tokenizer.get_vocab()[tokenizer.pad_token]
    except:
      print("Warning: Exception occured while setting pad_token_id")

            
    test_dataset = Dataset.from_pandas(test_df)

    tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True,  fn_kwargs={'tokenizer': tokenizer})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # create Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    # get logits from predictions and evaluate results using classification report
    predictions = trainer.predict(tokenized_test_dataset)
    prob_pred = softmax(predictions.predictions, axis=-1)
    preds = np.argmax(predictions.predictions, axis=-1)
    metric = evaluate.load("bstrai/classification_report")
    results = metric.compute(predictions=preds, references=predictions.label_ids)
    
    # return dictionary of classification report
    return results, preds, prob_pred


if __name__ == '__main__':
    random_seed = 0 
    train_path =  "/kaggle/input/reduced/downsampled_data.json" # For example 'subtaskA_train_multilingual.jsonl'
    test_path =  "/kaggle/input/text-dataset/subtaskA_monolingual.jsonl" # For example 'subtaskA_test_multilingual.jsonl'
    model =  "BAAI/bge-small-en-v1.5" # For example 'xlm-roberta-base'
    subtask =  'A' # For example 'A'
    prediction_path = 'subtaskA_predictions.jsonl' # For example subtaskB_predictions.jsonl

    if not os.path.exists(train_path):
        logging.error("File doesnt exists: {}".format(train_path))
        raise ValueError("File doesnt exists: {}".format(train_path))
    
    if not os.path.exists(test_path):
        logging.error("File doesnt exists: {}".format(train_path))
        raise ValueError("File doesnt exists: {}".format(train_path))
    

    if subtask == 'A':
        id2label = {0: "human", 1: "machine"}
        label2id = {"human": 0, "machine": 1}
    elif subtask == 'B':
        id2label = {0: 'human', 1: 'chatGPT', 2: 'cohere', 3: 'davinci', 4: 'bloomz', 5: 'dolly'}
        label2id = {'human': 0, 'chatGPT': 1,'cohere': 2, 'davinci': 3, 'bloomz': 4, 'dolly': 5}
    else:
        logging.error("Wrong subtask: {}. It should be A or B".format(train_path))
        raise ValueError("Wrong subtask: {}. It should be A or B".format(train_path))

    set_seed(random_seed)

    #get data for train/dev/test sets
    train_df, valid_df, test_df = get_data(train_path, test_path, random_seed)
    
    # train detector model
    fine_tune(train_df, valid_df, f"{model}/subtask{subtask}/{random_seed}", id2label, label2id, model)

    # test detector model
    results, predictions = test(test_df, f"{model}/subtask{subtask}/{random_seed}/best/", id2label, label2id)
    
    logging.info(results)
    predictions_df = pd.DataFrame({'id': test_df['id'], 'label': predictions})
    predictions_df.to_json(prediction_path, lines=True, orient='records')

pytorch_model.bin:   0%|          | 0.00/54.2M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Roc Auc,F1 Score Val Macro,F1 Score Val Micro
1,0.2319,0.427842,0.899299,0.903312,0.899282,0.899299
2,0.5325,0.608526,0.880261,0.88613,0.880019,0.880261
3,0.002,0.794878,0.869405,0.876045,0.86896,0.869405


TrainOutput(global_step=44910, training_loss=0.22610588656753747, metrics={'train_runtime': 2357.6031, 'train_samples_per_second': 114.29, 'train_steps_per_second': 19.049, 'total_flos': 1981787910455808.0, 'train_loss': 0.22610588656753747, 'epoch': 3.0})

In [19]:
test_tokenized_ds

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 29940
})

In [25]:
dataTest = pd.read_json('/kaggle/input/testjson/subtaskA_monolingual.jsonl', lines = True)
df2 = dataTest[['text']]
df2 = df2.reset_index(drop=True)
df_test_ds = Dataset.from_pandas(df2)
df_test_tokenized_ds = df_test_ds.map(preprocess_function, batched=True)

  0%|          | 0/35 [00:00<?, ?ba/s]

In [27]:
pred_output = trainer.predict(df_test_tokenized_ds)
logits = pred_output.predictions

In [28]:
prob_pred = softmax(logits, axis=-1)
preds = np.argmax(logits, axis=-1)

In [29]:
metric1 = evaluate.load("accuracy")

In [30]:
GoldDataset = pd.read_json('/kaggle/input/gs-dataset/subtaskA_monolingual_gs.jsonl', lines = True)

In [31]:
GoldDataset

Unnamed: 0,text,label,id
0,"Today, many adults or teenage drivers are hook...",0,0
1,"The automobile, since its advent, has revoluti...",1,1
2,One policy that could potentially improve aca...,1,2
3,Title: Navigating the Road Ahead: The Case for...,1,3
4,Have you ever woken up in the morning and wish...,0,4
...,...,...,...
34267,There are many advantages of limiting car usag...,0,34267
34268,When discussing the merits of the electoral co...,1,34268
34269,In favor of student-designed summer assignment...,1,34269
34270,"No, FACE is not created by aliens. as a person...",0,34270


In [32]:
GoldDataset_labels = GoldDataset['label'].to_list()

In [39]:
metric_classification = evaluate.load("bstrai/classification_report")
results = metric_classification.compute(predictions=preds, references=GoldDataset_labels)
print(results)

Downloading builder script:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

{'0': {'precision': 0.8992033542976939, 'recall': 0.6589847590953786, 'f1-score': 0.760577366386495, 'support': 16272}, '1': {'precision': 0.75168926477827, 'recall': 0.9332222222222222, 'f1-score': 0.8326765310927703, 'support': 18000}, 'accuracy': 0.8030170401493931, 'macro avg': {'precision': 0.825446309537982, 'recall': 0.7961034906588004, 'f1-score': 0.7966269487396327, 'support': 34272}, 'weighted avg': {'precision': 0.8217274669450553, 'recall': 0.8030170401493931, 'f1-score': 0.7984445747406311, 'support': 34272}}


In [34]:
f1_metric = evaluate.load("f1")

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [35]:
results_f1_micro = f1_metric.compute(predictions=preds, references=GoldDataset_labels, average = "micro")

In [36]:
print(results_f1_micro)

{'f1': 0.8030170401493932}


In [37]:
results_f1_macro = f1_metric.compute(predictions=preds, references=GoldDataset_labels, average = "macro")
print(results_f1_macro)

{'f1': 0.7966269487396327}


In [38]:
accuracy = metric1.compute(predictions=preds, references=GoldDataset_labels)
print(accuracy)

{'accuracy': 0.8030170401493931}


In [40]:
json_result_df = GoldDataset['id']

In [41]:
json_result_df = json_result_df.to_frame()

In [42]:
json_result_df

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
34267,34267
34268,34268
34269,34269
34270,34270


In [43]:
pred_df = pd.DataFrame({'label':preds})

In [44]:
pred_df

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
34267,0
34268,1
34269,1
34270,1


In [45]:
json_result_df_final = pd.concat([json_result_df, pred_df], axis = 1)

In [46]:
json_result_df_final

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
34267,34267,0
34268,34268,1
34269,34269,1
34270,34270,1


In [47]:
pwd

'/kaggle/working'

In [50]:
json_result_df_final.to_json('json_predictions_peft_bge', orient='records', lines=True)