In [17]:
!pip install -r "../requirements.txt"

Collecting transformers (from -r ../requirements.txt (line 7))
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m162.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers->-r ../requirements.txt (line 7))
  Using cached huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting safetensors>=0.4.1 (from transformers->-r ../requirements.txt (line 7))
  Downloading safetensors-0.4.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers->-r ../requirements.txt (line 7))
  Downloading tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.42.4-py3-none-any.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m892.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached huggingface_hu

In [3]:
# Input files (relative paths).
TRAIN_PATH = "../data/raw/train_data.csv"
TEST_PATH = "../data/raw/test_data.csv"
# Augmentation files; preprocess() hands them to add_augmented_data with labels 0 and 1.
AUGMENTED_DATA_LABEL_0 = "../data/raw/aug_0.txt"
AUGMENTED_DATA_LABEL_1 = "../data/raw/aug_1.txt"

# Hyperparameters
BERT_MODEL_NAME = "DeepPavlov/rubert-base-cased"  # used for both the tokenizer and the model
MAX_LEN = 256  # forwarded as max_len to the dataset/dataloader builders
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16  # also used for the test loader
EPOCHS = 45
LEARNING_RATE = 1e-5

In [5]:
import sys

# Put the parent directory on the import path (later cells import `src` and `models`).
# Guarded so that re-running this cell does not pile up duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [7]:
import pandas as pd
from src import NewsDataset, NewsDatasetTest, get_test_dataset_dataloader, get_dataset_dataloader, add_augmented_data
from src import train_epoch, validation_epoch, fit, loss_fn
from src import test, load_model, predict
from src import save_checkpoint, load_checkpoint, compute_metrics
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to /Users/poulyak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def preprocess(train_size=0.8, random_state=43):
    """Read the raw CSVs, add augmented rows and build train/val/test loaders.

    Steps:
      1. Read the training and test frames from ``TRAIN_PATH`` / ``TEST_PATH``.
      2. Extend the training frame through ``add_augmented_data`` using the
         label-1 file first, then the label-0 file.
      3. Split the extended frame into train / validation parts.
      4. Build one loader per part with the ``BERT_MODEL_NAME`` tokenizer and
         a Russian Snowball stemmer.

    NOTE(review): augmented rows are added *before* the split, so they can land
    in the validation part as well — confirm that this is intended.

    Args:
        train_size: Forwarded to ``train_test_split`` (default 0.8, the value
            that was previously hard-coded).
        random_state: Seed for the shuffled split (default 43, the value that
            was previously hard-coded).

    Returns:
        Tuple ``(training_loader, val_loader, test_loader)``.
    """
    train_df = pd.read_csv(TRAIN_PATH, index_col=0)
    test_df = pd.read_csv(TEST_PATH, index_col="id")

    # Same order as before: label 1 first, then label 0.
    for aug_path, label in ((AUGMENTED_DATA_LABEL_1, 1), (AUGMENTED_DATA_LABEL_0, 0)):
        train_df = add_augmented_data(train_df, aug_path, label)

    train_data, val_data = train_test_split(
        train_df, train_size=train_size, random_state=random_state, shuffle=True
    )

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
    stemmer = SnowballStemmer("russian")
    # Keyword arguments common to all three loader builders.
    shared_kwargs = dict(num_workers=0, stemmer=stemmer, max_len=MAX_LEN)

    # The builders return (dataset, dataloader); only the loaders are needed here.
    _, training_loader = get_dataset_dataloader(
        train_data, tokenizer, batch_size=TRAIN_BATCH_SIZE, shuffle=True, **shared_kwargs
    )
    _, val_loader = get_dataset_dataloader(
        val_data, tokenizer, batch_size=VALID_BATCH_SIZE, shuffle=False, **shared_kwargs
    )
    _, test_loader = get_test_dataset_dataloader(
        test_df, tokenizer, batch_size=VALID_BATCH_SIZE, shuffle=False, **shared_kwargs
    )
    return training_loader, val_loader, test_loader

In [9]:
# Build the train / validation / test loaders once; later cells use these names.
training_loader, val_loader, test_loader = preprocess()

In [13]:
import torch
from models.news_classification import NewsClassification

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Instantiate the project's classifier from the configured model name.
model = NewsClassification(BERT_MODEL_NAME)



In [None]:
# Send the model to the selected device (presumably a torch.nn.Module — confirm).
# As the cell's last expression, the returned value is displayed in the output.
model.to(device)

In [None]:
# Adam over all model parameters, with the learning rate from LEARNING_RATE.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Display which device was selected.
device

In [None]:
# Hand the model, optimizer, device and train/validation loaders to the project's `fit` helper
# (presumably the training loop — see `src`).
# NOTE(review): the EPOCHS constant is defined in this notebook but not passed here —
# confirm that `fit` runs the intended number of epochs.
fit(model, optimizer, device, training_loader, val_loader)
