# Чемпионат Саратовской области

Задача: классифицировать категории товаров в огромном наборе различных наименований

In [35]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [36]:
df_train = pd.read_csv("train_dataset_train.csv")
df_test = pd.read_csv("test_dataset_test.csv")

### Препроцессинг

1) Удаление пунктуации

2) Перевод ошибочных латинских букв в названиях в русские

3) Удаление всех цифр

4) Удаление стоп-слов

In [37]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from string import punctuation
import re
russian_stopwords = stopwords.words("russian")

lat_rus_dict = {'a': 'а', 'b': 'в', 'c': 'с', 'e': 'е', 'h': 'н', 'k': 'к', 'm': 'м',
           'n': 'п', 'o': 'о', 'p': 'р', 't': 'т', 'u': 'и', 'x': 'х', 'y': 'у',
           'd': 'д', 'f': 'ф', 'g': 'г', 'i': 'и', 'j': 'ж', 'l': 'л', 'q': 'к',
           'r': 'р', 's': 'с', 'v': 'в', 'w': 'в', 'z': 'з'}

lat_rus_table = str.maketrans(lat_rus_dict)
                
def lat_rus(text):
    return text.translate(lat_rus_table)

# Удаление знаков пунктуации из текста
def remove_punct(text):
    table = {33: ' ', 34: ' ', 35: ' ', 36: ' ', 37: ' ', 38: ' ', 39: ' ', 40: ' ', 41: ' ', 42: ' ', 43: ' ', 44: ' ', 45: ' ', 46: ' ', 47: ' ', 58: ' ', 59: ' ', 60: ' ', 61: ' ', 62: ' ', 63: ' ', 64: ' ', 91: ' ', 92: ' ', 93: ' ', 94: ' ', 95: ' ', 96: ' ', 123: ' ', 124: ' ', 125: ' ', 126: ' '}
    return text.translate(table)

def preprocess(df):
    df['name_clean'] = df['name'].map(lambda x: x.lower())
    df['name_clean'] = df['name_clean'].map(lambda x: remove_punct(x))
    df['name_clean'] = df['name_clean'].map(lambda x: lat_rus(x))
    df['name_clean'] = df['name_clean'].map(lambda x: re.sub(r'\d+', ' ', x))
    df['name_clean'] = df['name_clean'].map(lambda x: x.split(' '))
    df['name_clean'] = df['name_clean'].map(lambda x: [token for token in x if token not in russian_stopwords\
                                                                      and token != " " \
                                                                      and token.strip() not in punctuation])

#     for i in range(5, 50, 5):
#         df[f'top{i}'] = df['name_clean'].map(lambda x: ' '.join(x[:i]))
    
    df['name_clean'] = df['name_clean'].map(lambda x: ' '.join(x))
    
    return df

[nltk_data] Downloading package stopwords to /home/viliar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##### Применяем метод препроцессинга данных

In [38]:
df_train = preprocess(df_train)
df_test = preprocess(df_test)

### Импортируем все что нужно для поиска гиперпараметров

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

##### Небольшой хак - ищем гиперпараметры по метрике лидерборда

In [40]:
def macro_recall(y_true, y_pred):
    return recall_score(y_true, y_pred, average='macro')

In [41]:
RScore = make_scorer(macro_recall)

## Выделим обучающую выборку

In [42]:
X_train = df_train["name_clean"]
y_train = df_train["groups"]

In [43]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__max_features': (None, 3000, 30000, 300000),
    'vect__use_idf': (True, False),
    'vect__norm': ('l1', 'l2', None),
    'clf__solver': ['lbfgs', 'adam'],
    'clf__max_iter': [1000, 2000],
    'clf__alpha': 10.0 ** -np.arange(1, 10, 5),
    'clf__hidden_layer_sizes': [(100,), (100, 100,), (100, 100, 100,), (100, 100, 100, 100,)],
    'random_state': [42]}

In [44]:
pipeline = Pipeline([
           ('vect', TfidfVectorizer()), ('clf', MLPClassifier())]
)

#### Из-за слишком большого кол-ва вариантов используем RandomizedSearchCV

In [45]:
grid_cv = RandomizedSearchCV(pipeline, parameters, scoring=RScore, cv=5, n_jobs=-1, n_iter=100)

In [None]:
grid_cv.fit(X_train, y_train)

In [None]:
grid_cv.best_params_

### Обучение на всех данных

In [33]:
full_pipeline = Pipeline([
           ('vect', TfidfVectorizer()), ('clf', MLPClassifier())]
)

In [34]:
full_pipeline.set_params(**grid_cv.best_params_)
print("Model with best params {full_pipeline}")

Model with best params {full_pipeline}


In [None]:
full_pipeline.fit(X_train, y_train)

#### predict на Тесте

In [27]:
X_test = df_test["name_clean"]

In [28]:
predicted = full_pipeline.predict(X_test)

In [29]:
sample = pd.read_csv("sample_solution.csv")

In [30]:
len(sample), len(predicted)

(282227, 282227)

In [31]:
sample["groups"] = predicted

In [32]:
sample.to_csv("NN_GridCV_LongTrain.csv", index=False)