In [None]:
!pip install navec
!pip install nltk
!pip install spacy
!pip install tensorflow
!pip install datasets
!pip install accelerate
!pip install transformers

In [None]:
import sys
import numpy as np
import pickle
import re
from nltk.stem import SnowballStemmer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Download the NLTK resources this notebook relies on.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)

# Enable the pandas `progress_apply` integration of tqdm.
tqdm.pandas()

In [None]:
stop = set(stopwords.words("russian"))
stemmer = SnowballStemmer("russian")
# Named `word_tokenizer` (not `tokenizer`) because a later cell binds
# `tokenizer` to the Hugging Face tokenizer; sharing one name would make
# `clear_text` depend on which cell happened to run last.
word_tokenizer = RegexpTokenizer(r'\w+')


def clear_text(text, normalize=False, max_tokens=256):
    """Prepare one article text for the models.

    Args:
        text: Raw article text.
        normalize: When False (the default) the text is returned unchanged.
            When True the text is lower-cased, split into word tokens, cut to
            the first ``max_tokens`` tokens, stripped of Russian stop words
            and stemmed.
        max_tokens: Number of leading tokens kept before stop-word removal;
            only used when ``normalize`` is True.

    Returns:
        The unchanged text, or the space-joined stemmed tokens.
    """
    if not normalize:
        return text
    tokens = word_tokenizer.tokenize(text.lower())[:max_tokens]
    return ' '.join(stemmer.stem(token) for token in tokens if token not in stop)

In [None]:
df = pd.read_csv("deti.mail.csv")
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# Pass every article through clear_text (with a progress bar) and keep the
# result in a separate "text" column.
article_texts = df["Article Text"]
df["text"] = article_texts.progress_apply(clear_text)

In [None]:
# Encode each distinct "Keywords" value as an integer code; `indexer` keeps the
# unique values so that indexer[code] maps a code back to its keyword.
# NOTE(review): rows with a missing keyword would presumably get code -1 — confirm the column has no NaNs.
label_codes, indexer = df["Keywords"].factorize()
df["labels"] = label_codes

In [None]:
# Number of articles per encoded label (used to judge class imbalance).
df["labels"].value_counts()

In [None]:
# Down-sample every class to the size of the rarest one so classes are balanced.
g = df.groupby('labels')
min_class_size = g.size().min()  # computed once instead of once per group
# GroupBy.sample returns the sampled rows with all original columns (including
# "labels"); the fixed seed makes the balanced subset reproducible.
new_df = g.sample(n=min_class_size, random_state=42).reset_index(drop=True)

In [282]:
# Hold out 15% of the balanced data for evaluation; the fixed seed keeps the
# split identical across kernel restarts.
text_train_x, text_test_x, train_y, test_y = train_test_split(
    new_df["text"].values,
    new_df["labels"].values,
    test_size=0.15,
    random_state=42,
)

In [283]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
# Checkpoint used for both the tokenizer and the classifier.
model_name = "DeepPavlov/rubert-base-cased-sentence"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of examples for the BERT classifier.

    Args:
        examples: Batch mapping whose "text" entry holds the raw strings.

    Returns:
        The tokenizer output for the batch, with every example truncated to
        at most 512 tokens.
    """
    # No padding here: the DataCollatorWithPadding passed to the Trainer pads
    # each batch to its longest sequence, so padding every example to 512
    # tokens up front would only spend compute and memory on pad tokens.
    return tokenizer(examples["text"], truncation=True, max_length=512)

In [284]:
# class MyDataset(torch.utils.data.Dataset):
#     def __init__(self, features, labels):
#         self.features = features
#         self.labels = labels
    
    
#     def __getitem__(self, idx):
#         return {"text": self.features[idx], "label": self.labels[idx]}
    
#     def __len__(self):
#         return len(self.features)


# train = MyDataset(text_train_x, train_y)
# test = MyDataset(text_test_x, test_y)

In [285]:
from datasets import Dataset


# Wrap the raw text/label arrays of each split in a `datasets.Dataset`.
train, test = (
    Dataset.from_dict({"text": texts, "label": labels})
    for texts, labels in ((text_train_x, train_y), (text_test_x, test_y))
)

In [286]:
# Tokenize both splits in batches (train first, then test).
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True) for split in (train, test)
)

Map:   0%|          | 0/1262 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [287]:
# One output per distinct keyword produced by factorize().
num_classes = len(indexer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
# Show the architecture of the loaded model.
model

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-sentence and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [288]:
from transformers import DataCollatorWithPadding

# Batch collator built around `tokenizer`; it is handed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [290]:
# Fine-tuning configuration: 5 epochs, batches of 16 per device, output under "results".
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Wire the model, the tokenized splits and the padding collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

# Start fine-tuning.
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss


KeyboardInterrupt: 

### Test

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline: TF-IDF features followed by one-vs-rest logistic regression.
tfidf_step = ('tfidf', TfidfVectorizer())
clf_step = ('clf', OneVsRestClassifier(LogisticRegression()))
text_clf = Pipeline([tfidf_step, clf_step])

In [None]:
# Fit the vectorizer and the classifier on the training split.
text_clf.fit(X=text_train_x, y=train_y)

In [None]:
import tensorflow as tf

In [None]:
# Pipeline.transform is not available because the final step is a classifier,
# so take the TF-IDF features directly from the fitted vectorizer step.
text_clf.named_steps["tfidf"].transform(text_train_x)

In [None]:
# NOTE: this rebinds `model`, which until here referred to the BERT classifier.
model = tf.keras.Sequential([
    # Width of the first hidden layer is a placeholder — TODO tune.
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    # One logit per class, the same class count the BERT classifier is built with.
    tf.keras.layers.Dense(len(indexer))
])

In [None]:
# Hard label predictions of the TF-IDF baseline on the held-out split.
pred = text_clf.predict(text_test_x)

In [None]:
# Micro-averaged F1 of the baseline on the held-out split.
f1_score(y_true=test_y, y_pred=pred, average="micro")

In [None]:
# Share of held-out articles whose label was predicted exactly.
accuracy_score(y_true=test_y, y_pred=pred)

In [None]:
# Multiclass ROC AUC needs per-class scores rather than hard predictions, so
# pass the predicted probabilities and state which label each column refers to.
# The one-vs-rest option is spelled "ovr"; "ova" is rejected by scikit-learn.
roc_auc_score(
    test_y,
    text_clf.predict_proba(text_test_x),
    multi_class="ovr",
    labels=text_clf.classes_,
)