In [1]:
import os
from typing import List, Iterable
import pandas as pd
import numpy as np
import datasets
from tqdm.notebook import tqdm
import random

from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import evaluate

In [2]:
DATA_DIR = 'crisis_data'
# Workbooks that are left out of `files` (paths relative to the working directory).
FILE_BLACKLIST = [
    'crisis_data/Afera Rywina.xlsx',
    'crisis_data/Ministerstwo Zdrowia_respiratory od handlarza bronią.xlsx',
    'crisis_data/Fake news_baza publikacji.xlsx'
]

# Compare normalised paths so the blacklist matches whatever separator
# os.path.join produces, and filter instead of list.remove() so a blacklisted
# file that is absent from DATA_DIR does not raise ValueError.
_blacklisted = {os.path.normpath(path) for path in FILE_BLACKLIST}

# os.listdir returns entries in arbitrary order; sort for a reproducible file list.
files = [
    path
    for path in (os.path.join(DATA_DIR, name) for name in sorted(os.listdir(DATA_DIR)))
    if path.endswith('.xlsx') and os.path.normpath(path) not in _blacklisted
]

In [3]:
def load_dataset(filename: str) -> datasets.Dataset:
    """Load one Excel workbook and turn it into a labelled text dataset.

    The boolean label is derived from the ``Kryzys`` column: a row is positive
    when its cell is filled in and is not one of the explicit negatives
    ``'NIE'`` / ``'Nie'``. Blank cells are negative. The ``text`` field is the
    title, lead and context joined with ``"."``.

    Args:
        filename: Path to a workbook readable by ``pd.read_excel`` that has the
            columns ``Kryzys``, ``Tytuł publikacji``, ``Lead`` and
            ``Kontekst publikacji``.

    Returns:
        A dataset with a ``text`` column and a ``label`` column cast to
        ``ClassLabel(names=['negative', 'positive'])``.

    Raises:
        RuntimeError: If the derived labels do not contain both classes.
    """
    text_columns = ['Tytuł publikacji', 'Lead', 'Kontekst publikacji']

    raw = pd.read_excel(filename)
    # .copy() gives an independent frame, so the column assignments below do
    # not trigger pandas' SettingWithCopyWarning.
    df = raw[['Kryzys'] + text_columns].copy()

    # Single rule for every file: blank -> negative, 'NIE'/'Nie' -> negative,
    # anything else -> positive. The previous "has NaNs" shortcut marked
    # explicit 'NIE' rows as positive whenever the column also had blanks.
    crisis = df['Kryzys']
    df['Kryzys'] = crisis.notna() & ~crisis.isin(['NIE', 'Nie'])
    if df['Kryzys'].nunique() != 2:
        raise RuntimeError(f'Crisis column data error in file {filename}.')

    # str() is applied to every cell, so missing values appear as 'nan' in the text.
    df['text'] = [
        ".".join(map(str, row))
        for row in zip(*(df[column] for column in text_columns))
    ]
    df = df[['Kryzys', 'text']].rename(columns={'Kryzys': 'label'})
    return datasets.Dataset.from_pandas(df).cast_column('label', datasets.ClassLabel(names=['negative', 'positive']))

In [4]:
def prepare_dataset(filenames: List[str]) -> datasets.Dataset:
    """Load several workbooks and concatenate them into a single dataset.

    Each file is converted by ``load_dataset`` (defined in the previous cell),
    so both functions share one labelling and text-building rule instead of
    two copies of the same code.

    Args:
        filenames: Paths of the workbooks to load, in the order they should
            appear in the result.

    Returns:
        The concatenation of the per-file datasets.

    Raises:
        ValueError: If ``filenames`` is empty.
        RuntimeError: Propagated from ``load_dataset`` when a file's labels do
            not contain both classes.
    """
    parts = [load_dataset(filename) for filename in tqdm(filenames)]
    if not parts:
        raise ValueError('prepare_dataset needs at least one file.')
    return datasets.concatenate_datasets(parts)

In [5]:
def split_tensors(dss: List["datasets.Dataset"], lengths: Iterable[int], shuffle: bool = True) -> List[List]:
    """Distribute whole datasets over consecutive splits of given relative sizes.

    Datasets are never cut: each one goes into exactly one split. Splits are
    filled in order; the cursor moves on to the next split when adding the
    current dataset would put at least half of it beyond that split's target
    row count. It advances at most one split per dataset, and the last split
    takes everything that remains.

    Args:
        dss: Datasets (anything supporting ``len``) to distribute. The list
            itself is not modified.
        lengths: Relative sizes of the splits, e.g. ``[8, 1, 1]``. Any
            iterable is accepted; it is read once.
        shuffle: If true, datasets are assigned in random order (using the
            ``random`` module's global state); otherwise largest first.

    Returns:
        One list of datasets per entry in ``lengths``.

    Raises:
        ValueError: If ``lengths`` is empty or does not sum to a positive number.
    """
    # Materialise once: `lengths` may be a generator and is needed several times.
    weights = list(lengths)
    if not weights or sum(weights) <= 0:
        raise ValueError('lengths must contain at least one positive weight.')

    # Work on a copy so the caller's list keeps its order.
    ordered = list(dss)
    total_rows = sum(len(ds) for ds in ordered)
    targets = np.array(weights) / sum(weights) * total_rows

    if shuffle:
        random.shuffle(ordered)
    else:
        ordered.sort(key=len, reverse=True)

    splits = [[] for _ in weights]
    filled = [0] * len(weights)
    last = len(weights) - 1
    current = 0
    for ds in ordered:
        # Move on when at least half of `ds` would land past the current target.
        if current != last and filled[current] + len(ds) / 2 >= targets[current]:
            current += 1
        splits[current].append(ds)
        filled[current] += len(ds)
    return splits

In [6]:
# ds = prepare_dataset(files)
# ds.save_to_disk('other_data/text_dataset')

In [7]:
# dss = {}
# for fname in tqdm(files):
#     dss[fname[len(DATA_DIR)+1:-5]] = load_dataset(fname)
# dss = datasets.DatasetDict(dss)
# dss.save_to_disk('other_data/text_dataset')

In [8]:
# Load the text dataset previously written to other_data/text_dataset
# (the save step lives in the commented-out cells above).
ds = datasets.load_from_disk('other_data/text_dataset')

In [9]:
ds

DatasetDict({
    Devil Energy: Dataset({
        features: ['label', 'text'],
        num_rows: 267
    })
    Urzad Miasta Helu: Dataset({
        features: ['label', 'text'],
        num_rows: 18
    })
    Ewa Kopacz: Dataset({
        features: ['label', 'text'],
        num_rows: 13496
    })
    Forever 21: Dataset({
        features: ['label', 'text'],
        num_rows: 835
    })
    Komenda Główna Policji: Dataset({
        features: ['label', 'text'],
        num_rows: 122671
    })
    Afera szczepionkowa na WUM: Dataset({
        features: ['label', 'text'],
        num_rows: 128658
    })
    Barbara Kurdej-Szatan: Dataset({
        features: ['label', 'text'],
        num_rows: 82546
    })
    Sii Polska: Dataset({
        features: ['label', 'text'],
        num_rows: 632
    })
    Dove_rasistowska reklama: Dataset({
        features: ['label', 'text'],
        num_rows: 1118
    })
    Netflix_współdzielenie konta: Dataset({
        features: ['label', 'text'],
     

In [9]:
tokenizer = AutoTokenizer.from_pretrained("sdadas/polish-distilroberta")
# Pads each batch dynamically using the tokenizer loaded above.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples``, truncating to 512 tokens.

    Returns whatever the tokenizer returns for that input; the commented-out
    cell below applies it through ``Dataset.map(..., batched=True)``.
    """
    texts = examples["text"]
    return tokenizer(texts, max_length=512, truncation=True)

In [10]:
# tokenized_ds = ds.map(preprocess_function, batched=True)
# tokenized_ds.save_to_disk('other_data/tokenized_dataset')

In [11]:
# Seed the shuffle so the same subset is drawn on every Restart & Run All, and
# cap the subset at the dataset size so select() cannot index past the end.
tokenized_all = datasets.load_from_disk('other_data/tokenized_dataset')
tokenized_ds = tokenized_all.shuffle(seed=42).select(range(min(20000, len(tokenized_all))))

In [12]:
# Seeded 80/20 split so train/test membership is reproducible between runs.
# NOTE: this rebinds `tokenized_ds` to the split result, so the cell cannot be
# re-run without first re-running the cell that loads `tokenized_ds`.
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2, seed=42)

In [10]:
# Pretrained encoder with a 2-class sequence-classification head; per the
# warning printed below, the classifier weights are newly initialised.
model = AutoModelForSequenceClassification.from_pretrained("sdadas/polish-distilroberta", num_labels=2)

Some weights of the model checkpoint at sdadas/polish-distilroberta were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sdadas/polish-distilroberta and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classif

In [15]:
_METRIC_NAMES = ("accuracy", "f1", "precision", "recall")
_loaded_metrics = {}


def _get_metric(name):
    """Return the ``evaluate`` metric called ``name``, loading it only on first use."""
    if name not in _loaded_metrics:
        _loaded_metrics[name] = evaluate.load(name)
    return _loaded_metrics[name]


def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Metrics are loaded once and reused; previously every call re-ran
    ``evaluate.load`` four times.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Predictions are the argmax of
            ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``,
        each computed with that metric's default settings.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        name: _get_metric(name).compute(predictions=predictions, references=labels)[name]
        for name in _METRIC_NAMES
    }

In [12]:
# Fine-tuning hyper-parameters; a checkpoint is saved after every epoch.
training_args = TrainingArguments(
    output_dir='other_data/trained',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
)

# Trainer wiring: train/test splits, dynamic padding and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
)

In [13]:
# Fine-tune the model with the arguments configured on `trainer`.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 16000
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 81917186
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Saving model checkpoint to other_data/trained/checkpoint-1000
Configuration saved in other_data/trained/checkpoint-1000/config.json
Configuration saved in other_data/trained/checkpoint-1000/config.json
Model weights saved in other_data/trained/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in other_data/trained/checkpoint-1000/tokenizer_config.json
Special tokens file saved in other_data/trained/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to other_data/trained/checkpoint-2000
Configuration saved in other_data/trained/checkpoint-2000/config.json
Model weights saved in other_data/trained/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in other_data/trained/checkpoint-2000/tokenizer_config.json
Special tokens file saved in other_data/trained/checkpoint-2000/special_tokens_map.json
Saving model checkpoint to other_data/trained/checkpoint-3000
Configuration saved in other_data/trained/checkpoint-3000/config.json
Model weights saved in other_d

TrainOutput(global_step=5000, training_loss=0.09485429706573487, metrics={'train_runtime': 637.0964, 'train_samples_per_second': 125.57, 'train_steps_per_second': 7.848, 'total_flos': 7038642808767936.0, 'train_loss': 0.09485429706573487, 'epoch': 5.0})

In [16]:
# Score the model on the dataset passed as `eval_dataset`; the extra metrics
# in the result come from `compute_metrics`.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 4000
  Batch size = 16


{'eval_loss': 0.18401412665843964,
 'eval_accuracy': 0.965,
 'eval_f1': 0.7819314641744548,
 'eval_precision': 0.8451178451178452,
 'eval_recall': 0.7275362318840579,
 'eval_runtime': 13.5163,
 'eval_samples_per_second': 295.938,
 'eval_steps_per_second': 18.496,
 'epoch': 5.0}

In [67]:
# Persist the fine-tuned model to other_data/text_classifier (the log below
# shows config, weights and tokenizer files being written).
trainer.save_model('other_data/text_classifier')

Saving model checkpoint to other_data/text_classifier
Configuration saved in other_data/text_classifier/config.json
Model weights saved in other_data/text_classifier/pytorch_model.bin
tokenizer config file saved in other_data/text_classifier/tokenizer_config.json
Special tokens file saved in other_data/text_classifier/special_tokens_map.json


In [35]:
# Run inference on whichever device the model currently lives on rather than
# hard-coding 'cuda:0', which fails on a machine without a GPU.
classifier = pipeline(task='text-classification', model=model, tokenizer=tokenizer, device=model.device)

In [68]:
tokenized_ds['train']

{'label': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
 'text': ['Nie tylko Cyberpunk Edgerunners - kolejne dwie gry otrzymają anime od Netflix i nie tylko.NieR: Automata i Onimusha otrzymają własne anime. Za jedno z nich odpowiada Netflix..NieR: Automata i Onimusha otrzymają własne anime. Za jedno z nich odpowiada Netflix. PlatinumGames/materiały\n\nFani gier i anime otrzymali od Netflix ucztę zmysłów, w postaci serialu Cyberpunk: Edgerunners. To jednak nie koniec, bowiem nadchodzą kolejne adaptacje gier w postaci serii anime. Sprawdźcie koniecznie, o jakie tytuły chodzi.\n\nPrawda jest ...  Edgerunners\n\nTrudno sobie wyobrazić, co mogłoby przekonać ...',
  'SirRaVei - 2022-05-24 02:39.RT @Jowita_W: Córka Kurdej-Szatan przystąpiła wczoraj do Pierwszej Komunii Świętej.\n\nJa zapytam- po co?.RT <mention data-type="Twitter" data-value="Jowita_W">@Jowita_W</mention>: Córka Kurdej-Szatan przystąpiła wczoraj do Pierwszej Komunii Świętej.\n\nJa zapytam- po co?',
  'Program telewizyjny na wtorek, 27.12.2

In [66]:
classifier([])

[{'label': 'LABEL_0', 'score': 0.9383643865585327}]

In [23]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50001, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (