# Чемпионат Саратовской области - доп. материал

Задача: классифицировать товары по категориям (группам) на основе большого набора разнообразных наименований.

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the labelled training set and the unlabelled test set from the working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("train_dataset_train.csv", "test_dataset_test.csv")
)

### Препроцессинг

1) Удаление пунктуации

2) Замена латинских букв, ошибочно набранных в названиях, на русские (кириллические)

3) Удаление всех цифр

4) Удаление стоп-слов

In [3]:
import nltk
# Fetch the NLTK stop-word corpus; when it is already installed the call only
# reports that the package is up to date.
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
import re
# Russian stop words; preprocess() drops these tokens from product names.
russian_stopwords = stopwords.words("russian")

# Latin letters that appear to be paired with a Cyrillic letter of similar
# shape (e.g. 'h' -> 'н', 'p' -> 'р', 'x' -> 'х').
_lat_lookalikes = {
    'a': 'а', 'b': 'в', 'c': 'с', 'e': 'е', 'h': 'н', 'k': 'к', 'm': 'м',
    'n': 'п', 'o': 'о', 'p': 'р', 't': 'т', 'u': 'и', 'x': 'х', 'y': 'у',
}

# The remaining Latin letters, which appear to be paired with a roughly
# similar-sounding Cyrillic letter (e.g. 'd' -> 'д', 'f' -> 'ф').
_lat_phonetic = {
    'd': 'д', 'f': 'ф', 'g': 'г', 'i': 'и', 'j': 'ж', 'l': 'л', 'q': 'к',
    'r': 'р', 's': 'с', 'v': 'в', 'w': 'в', 'z': 'з',
}

# Combined lower-case Latin -> Cyrillic mapping and its str.translate() table.
lat_rus_dict = {**_lat_lookalikes, **_lat_phonetic}
lat_rus_table = str.maketrans(lat_rus_dict)


def lat_rus(text):
    """Return *text* with every lower-case Latin letter replaced by Cyrillic.

    Characters that are not keys of ``lat_rus_dict`` (upper-case Latin,
    digits, Cyrillic, whitespace, ...) are left untouched.
    """
    return text.translate(lat_rus_table)

# Translation table sending every ASCII punctuation character to a space.
# Built once at import time rather than on every call.
_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))


def remove_punct(text):
    """Replace each ASCII punctuation character in *text* with a single space.

    Only the characters of ``string.punctuation`` are affected; non-ASCII
    punctuation (e.g. « » or №) is left as is. The length of the string is
    preserved.
    """
    return text.translate(_PUNCT_TO_SPACE)

def preprocess(df):
    """Add a cleaned ``name_clean`` text column derived from ``df['name']``.

    For every name the following steps are applied, in this order:
    lower-casing, ASCII punctuation replaced by spaces (``remove_punct``),
    Latin letters mapped to Cyrillic (``lat_rus``), runs of digits replaced
    by a space, splitting on whitespace, removal of Russian stop words, and
    re-joining the remaining tokens with single spaces.

    Args:
        df: DataFrame containing a ``name`` column. It is modified in place.

    Returns:
        The same DataFrame object with ``name_clean`` added or overwritten.
    """
    # Set membership is O(1); the NLTK stop-word list would be scanned per token.
    stop_words = frozenset(russian_stopwords)
    digit_run = re.compile(r'\d+')

    def clean(name):
        text = lat_rus(remove_punct(name.lower()))
        text = digit_run.sub(' ', text)
        # split() with no argument splits on any whitespace run (spaces, tabs,
        # non-breaking spaces) and never yields empty tokens, so stop words
        # next to such separators are removed as well.
        return ' '.join(tok for tok in text.split() if tok not in stop_words)

    # Missing or non-string names would crash on .lower(); treat them as text,
    # with missing values becoming an empty string.
    df['name_clean'] = df['name'].fillna('').astype(str).map(clean)

    return df

[nltk_data] Downloading package stopwords to /home/viliar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### Применяем метод препроцессинга данных

In [4]:
# Run the same cleaning pipeline over both splits (train first, then test).
df_train, df_test = map(preprocess, (df_train, df_test))

In [9]:
# Two-column training frame ("text", "labels") built from the cleaned names
# and the target groups.
train = df_train[['name_clean', 'groups']].rename(
    columns={'name_clean': "text", 'groups': "labels"}
)

### Импортируем всё, что нужно для дообучения

In [10]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

### Создаем конфиг

In [8]:
# Fine-tuning configuration passed to ClassificationModel below.
model_args = ClassificationArgs(
    # Training length and optimisation settings.
    num_train_epochs=10,
    optimizer="AdamW",
    scheduler="linear_schedule_with_warmup",
    weight_decay=0,
    # Input handling: rebuild features on each run, no sliding window.
    reprocess_input_data=True,
    sliding_window=False,
    # Allow re-running the notebook into the same output directory.
    overwrite_output_dir=True,
    # Extra option forwarded to the underlying model config.
    config={"output_hidden_states": True},
)

### Так как некоторые номера классов пропущены, а метки классов должны идти непрерывно (0, 1, …, N−1), перекодируем их

In [24]:
# Distinct original class ids in ascending order (value_counts() ignores
# missing labels).
sorted_groups = train['labels'].value_counts().sort_index().index

# Forward lookup: original class id -> dense index 0..K-1.
old_group2new_group = {old_id: new_id for new_id, old_id in enumerate(sorted_groups)}
# Inverse lookup: dense index -> original class id.
new_group2old_group = dict(enumerate(sorted_groups))

train['labels'] = train['labels'].map(old_group2new_group)

### Class balanced

In [33]:
# Per-class loss weights, inversely proportional to class frequency:
# weight[c] = n_rows / (coef * n_rows_in_class_c).
coef = 20
num_labels = train['labels'].nunique()
count_by_group = np.bincount(train['labels'])
scaled_counts = coef * count_by_group
weight = list(len(train['labels']) / scaled_counts)

In [34]:
import torch

# Use the GPU only when one is actually visible: with a hard-coded
# use_cuda=True the constructor raises on CPU-only machines.
use_cuda = torch.cuda.is_available()
if not use_cuda:
    print("CUDA is not available - the model will run on CPU.")

# BERT-type classifier initialised from the rubert-tiny2 checkpoint, with one
# output per re-encoded class and the per-class loss weights computed above.
model_new = ClassificationModel(
    model_type='bert',
    model_name="cointegrated/rubert-tiny2",
    tokenizer_name="cointegrated/rubert-tiny2",
    num_labels=num_labels,
    weight=weight,
    use_cuda=use_cuda,
    args=model_args,
)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not 

In [None]:
# Fine-tune the classifier on the ("text", "labels") training frame using the
# settings in model_args (10 epochs).
model_new.train_model(train)

### Это очень, очень долго.