In [1]:
!pip install datasets -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset

In [3]:
# Map each split name to the CSV file it is read from.
data_files = dict(
    train="/content/Youtube01-Psy.csv",
    test="/content/Youtube02-KatyPerry.csv",
)

In [4]:
# Read both CSV files with the generic "csv" loader; each key of `data_files`
# becomes one split of the resulting DatasetDict.
yt_dataset = load_dataset(
    path="csv",
    data_files=data_files,
)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
# Show the loaded splits with their column names and row counts.
yt_dataset

DatasetDict({
    train: Dataset({
        features: ['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'],
        num_rows: 350
    })
    test: Dataset({
        features: ['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS'],
        num_rows: 350
    })
})

In [6]:
import torch

In [7]:
# Hub identifier of the pretrained model; the same name is passed to both the
# tokenizer and the sequence-classification model so they stay in sync.
checkpoint = "google/electra-base-discriminator"

In [8]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Tokenizer matching the pretrained checkpoint, and a collator that pads each
# batch with that tokenizer when the batch is assembled.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_function(example):
    """Tokenize the ``CONTENT`` column of a batch of rows.

    Called through ``Dataset.map(..., batched=True)``, so ``example`` holds a
    batch of rows rather than a single one. Truncation is enabled; padding is
    deliberately left to the data collator.

    Returns the tokenizer output, whose fields are added as new columns.
    """
    comment_texts = example["CONTENT"]
    encoded = tokenizer(comment_texts, truncation=True)
    return encoded


tokenized_datasets = yt_dataset.map(tokenize_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [9]:
# Show the splits again to confirm the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 350
    })
    test: Dataset({
        features: ['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT', 'CLASS', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 350
    })
})

In [10]:
# Take the first 8 training rows and keep only the tokenizer-produced columns,
# then list the token count of each row to show that lengths differ.
samples = tokenized_datasets["train"][:8]
samples = {
    name: column
    for name, column in samples.items()
    if name in ('input_ids', 'token_type_ids', 'attention_mask')
}
list(map(len, samples["input_ids"]))

[19, 45, 14, 14, 17, 23, 7, 23]

In [11]:
# Collate the sample rows into one batch and report each tensor's shape.
batch = data_collator(samples)
{name: tensor.shape for name, tensor in batch.items()}

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([8, 45]),
 'token_type_ids': torch.Size([8, 45]),
 'attention_mask': torch.Size([8, 45])}

In [12]:
# Keep only what is fed to the model: drop the raw metadata/text columns and
# rename the target column to "labels". Then switch the output format to torch
# tensors and list the remaining columns as a check.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(['COMMENT_ID', 'AUTHOR', 'DATE', 'CONTENT'])
    .rename_column("CLASS", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [13]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass; evaluation keeps dataset order.
# Both loaders use batches of 8 assembled by `data_collator`.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [14]:
!pip install tqdm -qqq

In [15]:
from transformers import AutoModelForSequenceClassification, get_scheduler
from tqdm import tqdm

# Seed PyTorch's global RNG so that the newly initialised classifier head and
# the shuffled batch order do not change from one full run to the next.
torch.manual_seed(42)

# Pretrained encoder with a fresh 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# `transformers.AdamW` is deprecated and no longer shipped by recent releases,
# so use the PyTorch optimizer instead. `eps` and `weight_decay` are set
# explicitly to the defaults of the old transformers implementation, because
# the torch defaults differ (eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0
)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# One scheduler step per optimizer step: linear decay of the learning rate over
# the whole run, without warm-up.
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

model.train()
# The context manager closes the bar when training ends, which flushes its
# final state instead of leaving it displayed one step short of the total.
with tqdm(range(num_training_steps)) as progress_bar:
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            # Move every tensor of the batch to the model's device.
            batch = {k: v.to(device) for k, v in batch.items()}
            # The batch contains "labels", so the model output carries the loss.
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 99%|█████████▉| 131/132 [00:18<00:00,  5.76it/s]

In [16]:
!pip install evaluate -qqq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/84.1 kB[0m [31m624.4 kB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m984.5 kB/s[0m eta [36m0:00:00[0m
[?25h

In [17]:
# from datasets import list_metrics
# metrics_list = list_metrics()
# len(metrics_list)
# print(metrics_list)

In [18]:
import evaluate

# Metrics are accumulated batch by batch and computed in the following cells.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

model.eval()
# Inference only: no gradients are needed anywhere in this loop.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Predicted class = index of the largest logit for each example.
        predictions = model(**batch).logits.argmax(dim=-1)
        for metric in (acc_metric, f1_metric):
            metric.add_batch(predictions=predictions, references=batch["labels"])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [19]:
# Accuracy over every batch added to `acc_metric` by the evaluation loop.
acc_metric.compute()

{'accuracy': 0.9657142857142857}

In [20]:
# F1 score over every batch added to `f1_metric` by the evaluation loop.
f1_metric.compute()

{'f1': 0.9659090909090909}

In [22]:
!pip install /content/weasel-main.zip

Processing ./weasel-main.zip
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-lightning==1.7.7 (from weasel==0.1.0)
  Downloading pytorch_lightning-1.7.7-py3-none-any.whl (708 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m708.1/708.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning==1.7.7->weasel==0.1.0)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyDeprecate>=0.3.1 (from pytorch-lightning==1.7.7->weasel==0.1.0)
  Downloading pyDeprecate-0.3.2-py3-none-any.whl (10 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics>=0.7.0->pytorch-lightning==1.7.7->weasel==0.1.0)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: weasel
  Building wheel for weasel (setup.py) ... [?25l[?25hd

In [27]:
!pip install torchmetrics==0.11.4 -qqq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/519.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/519.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m307.2/519.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
# NOTE(review): the recorded run of this cell ended in an ImportError, and the
# imported `Transformers` class is not used anywhere in the cell. Because the
# import is the first statement, `end_model` is not assigned when it fails.
from weasel.models.downstream_models.transformers import Transformers

# Reuse the model fine-tuned above as the end model.
# NOTE(review): `model` is a plain transformers classification model, not a
# weasel `Transformers` wrapper -- confirm that Weasel accepts it as `end_model`.
end_model = model

ImportError: ignored

In [21]:
from weasel.models import Weasel

# Build the weasel end-to-end model around `end_model`.
# NOTE(review): `weak_labels` is not defined in any cell shown in this notebook,
# and this cell's execution count is lower than that of the cell that defines
# `end_model`; on a fresh kernel this cell raises NameError until both exist.
weasel = Weasel(
    end_model=end_model,
    # One input per labeling function / rule held by `weak_labels`.
    num_LFs=len(weak_labels.rules),
    n_classes=2,
    encoder={"hidden_dims": [32, 10]},
    # Separate Adam settings for the encoder and for the end model.
    optim_encoder={"name": "adam", "lr": 1e-4},
    optim_end_model={"name": "adam", "lr": 5e-5},
)

#https://docs.argilla.io/en/v1.1.0/guides/techniques/weak_supervision.html