In [10]:
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import Dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

from scipy.stats import mode
import os
os.environ['WANDB_DISABLED'] = 'True'

In [11]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train, valid = train_test_split(train, test_size=0.2, random_state=2)
train.head(10)

Unnamed: 0,id,keyword,location,text,target
4549,6466,injured,USA,Offers : http://t.co/Gl3C1vc88P #8392 Deluxe T...,1
4512,6413,hurricane,,The hurricane mixxtail kinda tastes like the w...,0
4368,6203,hijacker,,Complete Solution to Get Rid of http://t.co/9C...,0
4297,6103,hellfire,,@HellFire_eV @JackPERU1 then I do this to one ...,0
13,19,,,#Flood in Bago Myanmar #We arrived Bago,1
6235,8903,snowstorm,Manchester,@Groupon_UK it won't let me as you don't follo...,0
3160,4537,emergency,Southern Maine,Former heroin addict shares story as city lead...,1
2917,4191,drown,somewhere in Indiana,Going to go drown my sorrows with sad music brb,0
2318,3334,demolished,Chicago,ÛÏ@SplottDave: @TeamPalestina That's about 28...,1
3392,4856,evacuation,,This is an evil generation\nRock and roll evac...,0


In [12]:
test.head(10)

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
5,12,,,We're shaking...It's an earthquake
6,21,,,They'd probably still show more life than Arse...
7,22,,,Hey! How are you?
8,27,,,What a nice hat?
9,29,,,Fuck off!


In [21]:
def process_token(example, tokenizer=tokenizer):
        return tokenizer(example['text'])

In [30]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
train_ds = Dataset.from_pandas(train)
tokenized_train = train_ds.map(process_token)

Map: 100%|██████████| 6090/6090 [00:00<00:00, 9189.26 examples/s]


In [13]:
def tokenization(model_path, train_df, valid_df, test_df):
    print(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    train_ds = Dataset.from_pandas(train_df)
    valid_ds = Dataset.from_pandas(valid_df)
    test_ds = Dataset.from_pandas(test_df)

    def process_token(example, tokenizer=tokenizer):
        return tokenizer(example['text'])
    
    tokenized_train = train_ds.map(process_token)
    tokenized_valid = valid_ds.map(process_token)
    tokenized_test = test_ds.map(process_token)

    columns_to_remove = ['id', 'keyword', 'location', '__index_level_0__'] #if __index_level_0__ notice the first index
    train_dataset = tokenized_train.remove_columns(columns_to_remove)
    valid_dataset = tokenized_valid.remove_columns(columns_to_remove)
    columns_to_remove_test = ['id', 'keyword', 'location']
    test_dataset = tokenized_test.remove_columns(columns_to_remove_test)

    train_dataset = train_dataset.rename_column('target', 'label')
    valid_dataset = valid_dataset.rename_column('target', 'label')
    return train_dataset, valid_dataset, test_dataset, tokenizer

In [14]:
def compute_metrics(eval_pred):
    load_acc = load_metric('accuracy')
    load_f1 = load_metric('f1')
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = load_acc.compute(predictions=predictions, refereences = labels)['accuracy']
    f1 = load_f1.compute(predictions=predictions, references = labels)['f1']
    return {'acc':acc, 'f1':f1}

In [15]:
def init_trainer(model_path, tokenizer, lr, ep, train_dataset, valid_dataset):
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    train_args = TrainingArguments(
        learning_rate=lr,
        num_train_epochs=ep,
        per_device_train_batch_size=16,
        weight_decay=0.01,
        output_dir=model_path
    )

    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    return trainer

In [45]:
class model_fusion:
    def __init__(self, model_paths, lr, ep) -> None:
        self.model_paths= model_paths
        self.learning_rate = lr
        self.epoch_num = ep
        self.preds =[]

    def train_pred_multiple_models(self):
        for model_path in self.model_paths:
            print(f'training : {model_path}')
            print('total:', torch.cuda.get_device_properties(0).total_memory/1e9)
            print('allocated:', torch.cuda.memory_allocated(0)/1e9)
            print('cached', torch.cuda.memory_reserved(0)/1e9)

            train_dataset, valid_dataset, test_dataset, tokenizer = tokenization(model_path, train, valid, test)
            trainer = init_trainer(model_path, tokenizer, self.learning_rate, self.epoch_num, train_dataset, valid_dataset)
            trainer.train()

            print(test_dataset.shape)
            prediction = trainer.predict(test_dataset=test_dataset)
            print(prediction.predictions.shape)
            predictions = np.argmax(a=prediction.predictions, axis=-1)
            self.preds.append(model_path, predictions)

    def fusion_pred(self):
        all_preds = [pred[1] for pred in self.preds]
        final_preds = mode(all_preds, axis=0)[0]

        return final_preds.ravel()

In [46]:
roberta_path = 'roberta-base'
deberta_path = 'microsoft/deberta-v3-base'
distilbert_path = 'distilbert-base-uncased'
model_paths = [distilbert_path,roberta_path,deberta_path]
lr = 2e-5
ep = 3
models = model_fusion(model_paths,lr,ep)

In [47]:
models.train_pred_multiple_models()
pred = models.fusion_pred()
models.preds

training : distilbert-base-uncased
total: 25.756696576
allocated: 0.829641728
cached 1.68820736
distilbert-base-uncased


 35%|███▍      | 398/1143 [26:54<50:21,  4.06s/it]
Map: 100%|██████████| 6090/6090 [00:00<00:00, 8330.56 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 9620.66 examples/s]
Map: 100%|██████████| 3263/3263 [00:00<00:00, 10233.77 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
 44%|████▎     | 500/1143 [00:09<00:12, 53.01it/s]

{'loss': 0.4138, 'grad_norm': 5.9178643226623535, 'learning_rate': 1.1251093613298338e-05, 'epoch': 1.31}


 87%|████████▋ | 1000/1143 [00:20<00:02, 49.65it/s]

{'loss': 0.2948, 'grad_norm': 1.9131118059158325, 'learning_rate': 2.502187226596676e-06, 'epoch': 2.62}


100%|██████████| 1143/1143 [00:25<00:00, 45.50it/s]


{'train_runtime': 25.1196, 'train_samples_per_second': 727.319, 'train_steps_per_second': 45.502, 'train_loss': 0.340983606907535, 'epoch': 3.0}
(3263, 4)


100%|██████████| 408/408 [00:01<00:00, 269.15it/s]

(3263, 2)





TypeError: list.append() takes exactly one argument (2 given)

Tokenizer : AutoTokenizer 'pre-train' <br>
Dataset : from_pandas()<br>
Preprocessing : Dataset.map()<br>
init_trainer : AutoModelForSequenceClassification 'pre-train'<br>
data_collator : define how to batch data<br>
dataset_argument : define the parameters of trainer<br>
compute_metrics : define evalua method<br>
trainer.train()<br>
trainer.predict() - > (n, num_class)<br>