In [None]:
import random
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
!pip install sacremoses

# Reproducibility: seed every random number generator used in this notebook
# (Python's `random`, NumPy's global RNG, PyTorch on CPU and on all CUDA devices).
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [7]:
# Location of the tweet CSV, relative to the notebook's working directory.
DATAPATH: Path = Path("../data/processed/Tweety25k-1.csv")

In [8]:
# Load the data.
# Read `label` as text so the "1"/"0" comparison below behaves the same regardless of
# the dtype pandas would otherwise infer for that column.
df = pd.read_csv(DATAPATH, dtype={"label": str})
# Keep only rows labelled exactly "1" or "0" (positives first, then negatives);
# rows with any other label value are dropped.
df = pd.concat([df[df["label"] == "1"], df[df["label"] == "0"]])
df["label"] = df["label"].astype(int)
# Shuffle with an explicit seed so re-running this cell produces the same row order.
df = df.sample(frac=1, random_state=seed_val).reset_index(drop=True)


print(df.shape)
df.head()

(20928, 3)


Unnamed: 0,id,text,label
0,1634610870570438661,@KrystPawlowicz Tam najpierw przetrze szlak Ru...,0
1,1634193639109799938,@JachiraKlaudia Kurwa I szmata zawsze I wszędz...,1
2,1634199203906830340,„Zorganizowana grupa przestępcza”. Czarzasty w...,0
3,1635547952168026112,"@MichalSzczerba @WWnioski Mam nadzieję, że bad...",0
4,1636046850150088725,@Antysyst @michaldworczyk @AndriyYermak @OlKub...,1


In [41]:
# Registry of HerBERT variants: short name -> tokenizer id and model id.
model_names = {
    "herbert-klej-cased-v1": dict(
        tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
        model="allegro/herbert-klej-cased-v1",
    ),
    "herbert-base-cased": dict(
        tokenizer="allegro/herbert-base-cased",
        model="allegro/herbert-base-cased",
    ),
    "herbert-large-cased": dict(
        tokenizer="allegro/herbert-large-cased",
        model="allegro/herbert-large-cased",
    ),
}

In [1]:
# Preprocessing
def preprocess_text(text):
    """Strip URLs, @mentions and #hashtags from a tweet.

    Each match is replaced with an empty string; surrounding whitespace is
    left exactly as it was.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The text with every URL, mention and hashtag removed.
    """
    # Applied in this order: URLs first, then mentions, then hashtags.
    removal_patterns = (
        r"http\S+|www\S+|https\S+",  # URLs
        r"@\w+",                     # mentions
        r"#\w+",                     # hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)
    return text

# Clean the tweet text; cast to str first so `re.sub` always receives a string.
df["text"] = df["text"].astype(str).apply(preprocess_text)

# Train / test split (85% / 15%), seeded with the notebook-wide seed.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.15, random_state=seed_val
)

# Tokenization. The tokenizer id is taken from the `model_names` registry instead of
# being hard-coded a second time, so it cannot drift from the registry entry.
tokenizer = AutoTokenizer.from_pretrained(model_names["herbert-large-cased"]["tokenizer"])
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25ldone
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=08cf7a7d65982bf7fe8f2eda8284630775d5117dacac05b5c69141c2abecba56
  Stored in directory: /root/.cache/pip/wheels/00/24/97/a2ea5324f36bc626e1ea0267f33db6aa80d157ee977e9e42fb
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.53
[0m

In [None]:
# Tworzenie zbioru danych
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by example
            position (anything exposing ``.items()``).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its labels.
train_dataset = Dataset(encodings=train_encodings, labels=train_labels)
test_dataset = Dataset(encodings=test_encodings, labels=test_labels)


In [None]:

# Initialize the HerBERT model with a 2-class classification head.
# The checkpoint id is taken from the `model_names` registry instead of being hard-coded again.
model = AutoModelForSequenceClassification.from_pretrained(
    model_names["herbert-large-cased"]["model"], num_labels=2
)

# Train the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW` — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 3
log_every = 50  # report progress every N batches instead of printing every batch index

for epoch in range(num_epochs):
    running_loss = 0.0
    for idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items()}
        # `labels` is part of the batch, so the model returns the loss itself.
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        step = idx + 1
        if step % log_every == 0 or step == len(train_loader):
            print(
                f"epoch {epoch + 1}/{num_epochs} | "
                f"batch {step}/{len(train_loader)} | "
                f"mean loss {running_loss / step:.4f}"
            )

# Ewaluacja modelu
model.eval()
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        all_labels.extend(inputs["labels"].tolist())


In [None]:
accuracy = accuracy_score(all_labels, predictions)
precision = precision_score(all_labels, predictions)
recall = recall_score(all_labels, predictions)
f1 = f1_score(all_labels, predictions)
auc = roc_auc_score(all_labels, predictions)

print("Dokładność:", accuracy)
print("Swoistość:", precision)
print("Czułość:", recall)
print("F1 Score:", f1)
print("AUC:", auc)