# Задача 3. Catalog

**Задача от индустриального партнера «ТМК».**

Есть справочник различных позиций, используемых компанией ТМК. Каждая позиция в справочнике содержит всего два атрибута: "Название" и "Группа".

Задача: предсказывать атрибут "Группа" по атрибуту "Название".

Метрикой качества является `accuracy` — доля верных предсказаний.

**Формат ввода**

- train.txt — файл с обучающей выборкой: каждая строка представляет собой одну позицию и состоит из названия позиции и группы, разделенных символом табуляции.
- test.txt — файл с тестовой выборкой: файл состоит из 2346 строк, каждая строка полностью состоит из названия позиции, для которого нужно определить группу.

**Формат вывода**

Ответ требуется в следующем формате: файл из 2346 строк, i-я строка должна представлять собой предсказанную группу для i-й строки из файла test.txt.

Соревнование на kaggle: https://www.kaggle.com/c/catalog

## Создание данных

In [None]:
# Mount Google Drive inside Colab so that the folders and files
# stored on the drive become reachable under /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import re
import pandas as pd
import os

# Switch to the competition folder on the mounted drive.
# os.chdir() returns None, so the directory path is read back with
# os.getcwd() rather than taken from the chdir() call itself.
os.chdir(r"./gdrive/MyDrive/Я-профи подготовка по машинному обучению/Catalog")
url = os.getcwd()
url

'/content/gdrive/MyDrive/Я-профи подготовка по машинному обучению/Catalog'

In [None]:
# Show which data files are available. When `url` is None, os.listdir()
# falls back to the current working directory.
files = os.listdir(url)
files

['test.txt',
 'train.txt',
 'answer.txt',
 'answer_catalog.csv',
 'sample_submit.csv',
 'test_submit.csv']

In [None]:
# Sanity check of the parsing idea on one sample line: take the part after the
# tab (the group) and split it on the trailing newline.
'Клещи L=630 прод-попер для захвата образ	Инструмент слесарный\n'.split('\t')[1].split('\n')

['Инструмент слесарный', '']

In [None]:
# Build the training DataFrame from train.txt, where every line has the form
# "<name>\t<group>\n".
train_text = []  # item name: the field before the tab
target_text = []  # group label: the field after the tab
with open('train.txt', "r", encoding='utf-8') as f:
    # Iterate over the file lazily instead of materialising it with readlines().
    for line_no, line in enumerate(f, start=1):
        line = line.rstrip('\n')
        if not line:
            # A blank line (e.g. at the very end of the file) carries no sample.
            continue
        fields = line.split('\t')  # split once and reuse both fields
        if len(fields) < 2:
            raise ValueError(
                f"train.txt, line {line_no}: expected '<name>\\t<group>', got {line!r}"
            )
        train_text.append(fields[0])
        target_text.append(fields[1])

train = pd.DataFrame({'Название': train_text, 'Группа': target_text})
train.head()

Unnamed: 0,Название,Группа
0,Валок ф108 5ФВ ч.В-241178-14,Инструменты
1,Державка 30531402 Mapal,Резцы
2,"Кабель КПСВВнг-LS 1х2х0,75",Кабельная продукция
3,"Трубка электроизоляционная ТКР ф16,0мм",Изделия электроустан
4,"Лента конвейер 2,1-1000-ТК-200-2-5/2",ИзделияРезино-технич


In [None]:
# Load the test set (one item name per line) and wrap it into a DataFrame.
with open('test.txt', "r", encoding='utf-8') as f:
    # Keep only the text before the line break of every line.
    test_text = [line.split('\n')[0] for line in f]

test = pd.DataFrame({'Название': test_text})
test.head()

Unnamed: 0,Название
0,Подшипник 3630 (22330)
1,Винт 24х110 ГОСТ11738-84(DIN 912)
2,Пускатель ПМ ГОСТО 12-025-150 220В
3,Образец станд Ш13 концентрат плавико
4,Насос A4VG180EP2DT2/32R-PZD02F691LH-S


## Исследование и обработка данных

In [None]:
# Report the (rows, columns) shape of the training and test frames.
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (23973, 2)
Test shape: (2346, 1)


In [None]:
# Class balance: number of training items per group.
train['Группа'].value_counts()

Запчасти                5757
Метизы                  1378
З/Ч АвтомобПромышл      1257
З/Ч по чертежам          985
Инструменты              827
                        ... 
ПродукцЦеллюлозБумаж      39
Пилы                      37
ЗаготовкаИнстр и з/ч      34
Цепи и звенья             34
Теплоизоляционные         13
Name: Группа, Length: 96, dtype: int64

In [None]:
# All distinct group labels that occur in the training data.
train['Группа'].unique()

array(['Инструменты', 'Резцы', 'Кабельная продукция',
       'Изделия электроустан', 'ИзделияРезино-технич', 'Запчасти',
       'З/Ч АвтомобПромышл', 'Подшипники', 'Фрезы',
       'ЗЧ АвтоматПускКонтак', 'РеактивыХимич.', 'З/Ч по чертежам',
       'Метизы', 'МодулПлатыСистАвтом', 'Огнеупоры', 'Редукторы',
       'ИздИзПолимеровСтанд', 'Химпродукция', 'Инструмент слесарный',
       'СветотехнИсточнСвета', 'Спецогнеупоры', 'Инструм. мерительный',
       'ВыключатАвтоматич', 'Комплектующие электр', 'Арматура к трубам',
       'Хоз.товары', 'ИздИзПолимерПоЧертеж', 'Стропы',
       'Инструмент режущий', 'ПрокатСортовойОбНазн', 'Смазки',
       'Конденсаторы', 'Кабельно-проводников', 'Металлопрокат',
       'Расходные материалы', 'Мебель', 'З/Ч Пневмооборудов',
       'Материалы лаб.', 'Сплав твердый', 'Фильтры, фильтроэлем',
       'Сверла', 'МатерСтроительные', 'Стройматериалы',
       'ВычОргТехн и З/Ч', 'Инстр. электрический', 'З/Ч к НасосВентилят',
       'ПриборыСистАвтоматик', 'ИздДля

In [None]:
# Inspect the training items labelled with the group 'СветотехнИсточнСвета'.
train[train['Группа'] == 'СветотехнИсточнСвета']

Unnamed: 0,Название,Группа
29,Лампа сигнальная зеленая AD-22DS/230V,СветотехнИсточнСвета
45,Лампа накал ЛОН 220в 100вт,СветотехнИсточнСвета
102,Лампа светодиодн коммут СКЛ-К-2-360,СветотехнИсточнСвета
173,Светильник светодиодный PWP-С2 1200 ДСП,СветотехнИсточнСвета
294,Прожектор ЖО 04-400-001,СветотехнИсточнСвета
...,...,...
23456,Лампа КИПМ 42-22-Б-2-36 белая,СветотехнИсточнСвета
23517,Лампа накал миниат СМН 10в 55ма спец,СветотехнИсточнСвета
23715,Лампа ртутная ДРЛ-1000 Е40,СветотехнИсточнСвета
23725,Лампа ртутная ДРЛ-400вт Е40,СветотехнИсточнСвета


In [None]:
# Inspect the training items labelled with the group 'Конденсаторы'.
train[train['Группа'] == 'Конденсаторы']

Unnamed: 0,Название,Группа
62,Конденсатор КВИ3 16кв 470пФ 20%,Конденсаторы
342,Конденсатор К50-35 160в 470мкф,Конденсаторы
676,Конденсатор К50-35 25в 470мкф 105С,Конденсаторы
716,"Конденсатор К50-35 50в 4,7мкф 105С",Конденсаторы
754,Конденсатор К50-35 6800мкф 35В,Конденсаторы
767,Конденсатор К50-35 16в 220мкф 105С,Конденсаторы
847,"Конденсатор К50-35 50в 6,8мкф",Конденсаторы
1756,Конденсатор К50-35 100в 47мкф 105С,Конденсаторы
3236,Конденсатор К50-35 16в 1000мкф,Конденсаторы
3339,"Конденсатор К73-17 1500в 0,1мкф",Конденсаторы


In [None]:
# Inspect the training items labelled with the group 'МодулПлатыСистАвтом'.
train[train['Группа'] == 'МодулПлатыСистАвтом']
# feature engineering

Unnamed: 0,Название,Группа
15,Модуль 6GK7343-1СX10-0XE0 Siemens,МодулПлатыСистАвтом
55,Модуль 6ES7138-4CA01-0AA0,МодулПлатыСистАвтом
220,Соединитель 6ES7972-0BB12-0XA0,МодулПлатыСистАвтом
376,Кабель соед двойной разъемы Lemo 0+CP50,МодулПлатыСистАвтом
404,Модуль вывода сигнала 6ES7322-5GH00-0AB0,МодулПлатыСистАвтом
...,...,...
23300,Разъем DB-9F,МодулПлатыСистАвтом
23363,Карта памяти 6ES7952-1AK00-0AA0,МодулПлатыСистАвтом
23559,Коммутатор NIS-3200-204PSG,МодулПлатыСистАвтом
23660,Индикатор MG3100/IP54/TROP RED TYPE R,МодулПлатыСистАвтом


In [None]:
%%time
from string import punctuation

# Translation table that maps every ASCII punctuation character
# (string.punctuation) to a single space. Built once at definition time
# instead of being re-created on every call.
_PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))


def remove_punct(text):
    """Replace every ASCII punctuation character in *text* with a space.

    Characters are replaced rather than deleted, so tokens that were glued
    together by punctuation ("ч.В-241178") stay separate words. Non-ASCII
    symbols such as "№" are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string of the same length as *text*.
    """
    return text.translate(_PUNCT_TO_SPACE)

def txt_prep(df):
    """Normalise the item names in ``df['Название']`` for vectorisation.

    The untouched names are first copied into the new column
    'Название начальный вид'. 'Название' is then lower-cased, ASCII
    punctuation is replaced by spaces (``remove_punct``), runs of digits are
    removed and stand-alone words of one or two characters are dropped.

    Digits and punctuation disappear here, so ``feature_generation`` (whose
    patterns look for digits and for the upper-case 'Gb') has to be applied
    to the frame before this function.

    Args:
        df: DataFrame with a string column 'Название'. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    df['Название начальный вид'] = df['Название']

    names = df['Название'].str.lower()  # Hello -> hello
    names = names.map(remove_punct)  # punctuation -> spaces

    # regex=True is spelled out on purpose: pandas 2.0 changed the default of
    # Series.str.replace to literal matching, which would silently turn the
    # two substitutions below into no-ops.
    names = names.str.replace(r"\d+", "", flags=re.UNICODE, regex=True)  # drop digits
    names = names.str.replace(r"\b\w{1,2}\b", "", regex=True)  # drop 1-2 character words
    # Disabled experiment: additionally strip Latin letters with r"[a-zA-Z]".

    df['Название'] = names
    return df

CPU times: user 18 µs, sys: 0 ns, total: 18 µs
Wall time: 21.5 µs


In [None]:
# Quick check that the plain pattern finds the unit 'кг/м3' inside a name.
re.findall(r'кг/м3', 'Карта памяти  кг/м3 6ES7952-1AK00-0AA0')

['кг/м3']

In [None]:
# Display the training frame.
train

Unnamed: 0,Название,Группа
0,Валок ф108 5ФВ ч.В-241178-14,Инструменты
1,Державка 30531402 Mapal,Резцы
2,"Кабель КПСВВнг-LS 1х2х0,75",Кабельная продукция
3,"Трубка электроизоляционная ТКР ф16,0мм",Изделия электроустан
4,"Лента конвейер 2,1-1000-ТК-200-2-5/2",ИзделияРезино-технич
...,...,...
23968,"Фреза шпоночная ц/х 8,0",Фрезы
23969,Кирпич керам полнотел одинарный М200,МатерСтроительные
23970,"Клеймо тв спл 122""Ф"" ВК15",Инструменты
23971,Элемент питания Saft LS 14250/STD 1/2AA,Запчасти


In [None]:
%%time
# Output column -> regular expression searched for in the item name.
# The dictionary order is the order in which the columns are added.
_UNIT_PATTERNS = {
    'кг/м3': r'кг/м3',
    'мм2': r'мм2',
    'куллон': r'кл[0-9]',
    'м2/см3': r'м2/см3',
    'вт': r'\dвт',
    'в': r'\d+в',  # also fires on "<digits>вт"
    'кгс/см2': r'кгс/см2',
    'кг': r'\d+кг ',  # trailing space: the unit must be followed by a space
    'Gb': r'\d+Gb ',  # trailing space: the unit must be followed by a space
    'ед': r'\d+ед',
    # No trailing space here: names such as "... 10амп" end right after the
    # unit and would otherwise never be flagged.
    'амп': r'\d+амп',
    'л/мин': r'\d+л/мин',
    'мм': r'\d+mm|\d+мм',
    'мкф': r'\dмкф',
    'л': r'\d+л',
}


def feature_generation(df):
    """Add 0/1 indicator columns for measurement units found in the names.

    For every entry of ``_UNIT_PATTERNS`` a column is added to ``df`` holding
    1 where ``df['Название']`` matches the pattern and 0 otherwise. The
    patterns are case-sensitive and rely on digits being present, so the
    function must see the names before they are lower-cased or stripped of
    digits.

    The columns are computed with the vectorised ``Series.str.contains``
    instead of a row-by-row loop, and all of them (including 'мкф' and 'л')
    get the same integer dtype, also for an empty frame.

    Args:
        df: DataFrame with a string column 'Название'. It is modified in place.

    Returns:
        The same ``df`` object with the indicator columns added.
    """
    names = df['Название']
    for column, pattern in _UNIT_PATTERNS.items():
        # na=False: a missing name simply has no unit instead of breaking
        # the integer conversion.
        df[column] = names.str.contains(pattern, regex=True, na=False).astype(int)
    return df

# Add the unit-indicator columns to the training frame and preview the result.
train = feature_generation(train)
train.head()

CPU times: user 4min 11s, sys: 2.19 s, total: 4min 13s
Wall time: 4min 11s


In [None]:
# Normalise the names with txt_prep() (the original text is kept in the
# column 'Название начальный вид') and display the frame.
train = txt_prep(train)
train

Unnamed: 0,Название,Группа,Название начальный вид
0,валок,Инструменты,Валок ф108 5ФВ ч.В-241178-14
1,державка,Резцы,Державка 30531402 Mapal
2,кабель кпсввнг,Кабельная продукция,"Кабель КПСВВнг-LS 1х2х0,75"
3,трубка электроизоляционная ткр,Изделия электроустан,"Трубка электроизоляционная ТКР ф16,0мм"
4,лента конвейер,ИзделияРезино-технич,"Лента конвейер 2,1-1000-ТК-200-2-5/2"
...,...,...,...
23968,фреза шпоночная,Фрезы,"Фреза шпоночная ц/х 8,0"
23969,кирпич керам полнотел одинарный,МатерСтроительные,Кирпич керам полнотел одинарный М200
23970,клеймо спл,Инструменты,"Клеймо тв спл 122""Ф"" ВК15"
23971,элемент питания,Запчасти,Элемент питания Saft LS 14250/STD 1/2AA


In [None]:
# Display the training frame again.
train

Unnamed: 0,Название,Группа,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,Название начальный вид
0,валок,Инструменты,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,Валок ф108 5ФВ ч.В-241178-14
1,державка mapal,Резцы,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,Державка 30531402 Mapal
2,кабель кпсввнг,Кабельная продукция,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,"Кабель КПСВВнг-LS 1х2х0,75"
3,трубка электроизоляционная ткр,Изделия электроустан,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,"Трубка электроизоляционная ТКР ф16,0мм"
4,лента конвейер,ИзделияРезино-технич,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,"Лента конвейер 2,1-1000-ТК-200-2-5/2"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23968,фреза шпоночная,Фрезы,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,"Фреза шпоночная ц/х 8,0"
23969,кирпич керам полнотел одинарный,МатерСтроительные,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,Кирпич керам полнотел одинарный М200
23970,клеймо спл,Инструменты,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,"Клеймо тв спл 122""Ф"" ВК15"
23971,элемент питания saft std,Запчасти,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,Элемент питания Saft LS 14250/STD 1/2AA


### Посмотрим на данные по регуляркам и единицам измерения

In [None]:
# %%time
# Which groups contain names that mention the unit "кг/м3"?
# NOTE(review): the pattern needs the raw names; txt_prep() strips digits and
# punctuation from 'Название' — confirm the cell is run before it.
unit_re = re.compile(r'кг/м3')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Теплоизоляционные    7
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Блок модульн Fiberfrax 190кг/м3 305х305х,Теплоизоляционные
1,Одеяло Fiberfrax 160кг/м3 25х610х7320,Теплоизоляционные
2,Одеяло FiberBlanket Z 96 кг/м3,Теплоизоляционные
3,Лента муллитокремнез 96кг/м3 7200х100х20,Теплоизоляционные
4,Волокно керамич 128кг/м3 3660х610х50мм,Теплоизоляционные
5,Одеяло FiberBlanket Z 128кг/м3,Теплоизоляционные
6,Одеяло Fiberfrax 128кг/м3 25х610х7320,Теплоизоляционные


In [None]:
# %%time
# Which groups contain names with a "ч.XXX-YYY" code, i.e. "ч." followed by
# digits / upper-case Cyrillic letters, a hyphen and more of the same?
unit_re = re.compile(r'[ч][\.][0-9А-Я]{1,}[-][0-9А-Я]{1,}')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

In [None]:
# %%time
# Which groups contain names that mention the unit "мм2"?
unit_re = re.compile(r'мм2')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Изделия электроустан    64
Кабельная продукция     28
Кабельно-проводников     3
СветотехнИсточнСвета     2
Запчасти                 1
КнопкиПостыУправлен      1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Наконечник кабельн алюм ТА-50 мм2,Изделия электроустан
1,Наконечник кабельный 0.5мм2 37661,Кабельная продукция
2,"Маркер 38210 САВ3 0 черный 0,5-1,5мм2",Изделия электроустан
3,Наконечник кабельн медн ТМЛ-50мм2,Кабельная продукция
4,Зажим наборный ЗНИ-4мм2 YZN10-004-K07,Запчасти
...,...,...
94,Зажим кабельн наборн ЗНИ-4мм2,Изделия электроустан
95,Наконечник кабельн медн ТМ-185мм2,Изделия электроустан
96,Наконечник кабельн медн ТМЛ-16мм2,Изделия электроустан
97,Наконечник кабельн алюм-медн ТАМ-50мм2,Кабельная продукция


In [None]:
# %%time
# Which groups contain names with "кл" immediately followed by a digit?
unit_re = re.compile(r'(кл[0-9])')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

КонтрИзмеритПриборы     20
Инструм. мерительный    15
Метизы                   8
Инструменты              5
ОборудПрочее             2
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,"Микрометр МК-100 75-100мм 0,01 кл2",Инструменты
1,Щуп №4 кл2,Инструм. мерительный
2,"Штангенциркуль ШЦЦ-150 0-150 0,1мм кл1",Инструм. мерительный
3,"Микрометр МК-200 175-200мм 0,01 кл1",Инструменты
4,Винт высок с цил гол и шест 14х40 кл12.9,Метизы
5,Щуп №4 кл1,Инструм. мерительный
6,"Нутромер НИ-100 50-100мм 0,01 кл2",Инструм. мерительный
7,"Манометр технич МП4-УУ2 0-10МПа кл1,5",КонтрИзмеритПриборы
8,"Микрометр МВМ-150 125-150мм 0,01кл2",Инструм. мерительный
9,"Штангенциркуль ШЦ-II-250-0,05 L=130 кл1",Инструм. мерительный


In [None]:
# %%time
# Which groups contain names that mention the unit "м2/см3"?
unit_re = re.compile(r'м2/см3')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Химпродукция    3
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,"ГСО кальций-ион 5221-90 1,0 м2/см3",Химпродукция
1,"ГСО иодид-ион 0,1 м2/см3",Химпродукция
2,"АР этиленгликоль, 1, 0 м2/см3 ГСО №14-96",Химпродукция


In [None]:
# %%time
# Which groups contain names with a digit immediately followed by "вт"?
unit_re = re.compile(r'[\d]вт')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

СветотехнИсточнСвета    14
Комплектующие электр     9
Инстр. электрический     4
Расходные материалы      1
ОборудБытовое            1
Инструменты              1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Лампа МГЛ 1000вт HQI-T Osram,Комплектующие электр
1,Лампа накал ЛОН 220в 100вт,СветотехнИсточнСвета
2,Лампа светодиод зерк LED 8вт E27 R63тепл,СветотехнИсточнСвета
3,Лампа ДНаТ 150вт NAV-T E40,Комплектующие электр
4,Лампа накаливания МО 36в 60вт,Комплектующие электр
5,Лампа накал МО 36в 100вт,СветотехнИсточнСвета
6,Лампа натриев ДНАТ-400вт Е40,СветотехнИсточнСвета
7,Лампа накал МО 24в 60вт,СветотехнИсточнСвета
8,Фен техн Bosch GHG 660 LCD 2300вт,Инструменты
9,Лампа эн/сбер 11вт E27,Комплектующие электр


In [None]:
# %%time
# Which groups contain names with digits followed by "в"?
# (The pattern also matches "<digits>вт".)
unit_re = re.compile(r'[\d]{1,}в')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Конденсаторы            35
Пускатели магнитные     31
Контакторы              25
Запчасти                23
СветотехнИсточнСвета    20
Изделия электроустан     9
Комплектующие электр     9
Реле                     6
РеактивыХимич.           5
Инстр. электрический     4
АккумБатареи и Элем      4
ОборудБытовое            3
Химпродукция             3
Инструмент слесарный     2
Редукторы                2
ВычОргТехн и З/Ч         1
З/Ч ТракСтроитТехн       1
Расходные материалы      1
КонтрИзмеритПриборы      1
ХозИнвентарь             1
З/Ч АвтомобПромышл       1
Инструменты              1
ДатчСистемАвтоматики     1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,"Образец станд Ф25в ГСО1694-89Пфас0,100кг",РеактивыХимич.
1,Лампа МГЛ 1000вт HQI-T Osram,Комплектующие электр
2,Лампа накал ЛОН 220в 100вт,СветотехнИсточнСвета
3,Конденсатор К50-35 160в 470мкф,Конденсаторы
4,Вилка перен 025 3пол+заз+нейт 32а 380в,Изделия электроустан
...,...,...
184,Лампа ртутная ДРЛ-400вт Е40,СветотехнИсточнСвета
185,Коробка клеммная КЗНС-16 4в 252х232х92,Изделия электроустан
186,Лампа накал МО 12в 40вт,СветотехнИсточнСвета
187,Конденсатор К50-35 16в 330мкф,Конденсаторы


In [None]:
# %%time
# Which groups contain names with a digit immediately followed by "мкф"?
unit_re = re.compile(r'[\d]мкф')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Конденсаторы    35
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Конденсатор К50-35 160в 470мкф,Конденсаторы
1,Конденсатор К50-35 25в 470мкф 105С,Конденсаторы
2,"Конденсатор К50-35 50в 4,7мкф 105С",Конденсаторы
3,Конденсатор К50-35 6800мкф 35В,Конденсаторы
4,Конденсатор К50-35 16в 220мкф 105С,Конденсаторы
5,"Конденсатор К50-35 50в 6,8мкф",Конденсаторы
6,Конденсатор К50-35 100в 47мкф 105С,Конденсаторы
7,Конденсатор К50-35 16в 1000мкф,Конденсаторы
8,"Конденсатор К73-17 1500в 0,1мкф",Конденсаторы
9,Конденсатор К73-17 250в 1мкф,Конденсаторы


In [None]:
# %%time
# Which groups contain names that mention the unit "кгс/см2"?
unit_re = re.compile(r'кгс/см2')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

In [None]:
# %%time
# Which groups contain names with digits followed by "кг" and a space?
unit_re = re.compile(r'[\d]{1,}кг ')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

З/Ч АвтомобПромышл      2
Инструмент слесарный    2
Инструменты             2
ХимПродОбщехимНазн      1
МатерСтроительные       1
Стройматериалы          1
ОборуТехнологическое    1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Насос подкач шин с электропр 5кг ГАЗ ВА,З/Ч АвтомобПромышл
1,Герметик борта 1кг Rossvik,З/Ч АвтомобПромышл
2,Кувалда 10кг ГОСТ737-80,Инструменты
3,Затирка CERESIT 2кг бел,Стройматериалы
4,Короб Q=2200кг 8.01.1567.00.00СБ,ОборуТехнологическое
5,Кувалда 3кг с фиберглассовой ручкой,Инструмент слесарный
6,Кувалда 2кг ГОСТ 737-80,Инструменты
7,Доводчик дверной до 80кг серый,МатерСтроительные
8,"Воск 0,25кг 800004005",ХимПродОбщехимНазн
9,Кувалда 8кг с фиберглассовой ручкой,Инструмент слесарный


In [None]:
# %%time
# Which groups contain names with digits followed by "Gb" and a space?
unit_re = re.compile(r'[\d]{1,}Gb ')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

ВычОргТехн и З/Ч        7
ОборудПрочее            1
Измерительные прибор    1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Планшет Microsoft Pro 5 i5 8Gb 256 Gb,ОборудПрочее
1,'Накопитель HDD 750Gb 2.5'' SATA II,ВычОргТехн и З/Ч
2,Накопитель HDD 1000Gb внешний USB,ВычОргТехн и З/Ч
3,'Накопитель HDD 160Gb 2.5'' SATA,ВычОргТехн и З/Ч
4,Модуль памяти DDR4 4Gb 2133 UDIMM,ВычОргТехн и З/Ч
5,"'Накопитель HDD 320,0Gb 2.5'' IDE",ВычОргТехн и З/Ч
6,Накопитель HDD 750Gb внешний USB,ВычОргТехн и З/Ч
7,Накопитель HDD 500Gb SATA II/III,ВычОргТехн и З/Ч
8,Флэш диск 8Gb Transcend USB 2.0,Измерительные прибор


In [None]:
# %%time
# Which groups contain names with digits followed by "ед"?
unit_re = re.compile(r'[\d]{1,}ед')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

З/Ч АвтомобПромышл      139
Инструмент режущий       67
ИздДляТрубПредохран      34
Спецодежда               27
З/Ч ТракСтроитТехн       22
КраныТельферыЛебедки     20
ИзделияРезино-технич     17
Инструмент слесарный     10
ИздИзПолимерПоЧертеж     10
З/Ч Гидрооборудован       8
З/Ч к компрессорам        6
Изделия электроустан      6
ВычОргТехн и З/Ч          5
З/Ч к НасосВентилят       4
ОборуТехнологическое      4
З/Ч по чертежам           3
РеактивыХимич.            3
Инструм. мерительный      2
Посуда и приборы хим      2
З/Ч Пневмооборудов        2
ИнстОснастТехнПоЧерт      2
ИздИзПолимеровСтанд       1
ХимПродОбщехимНазн        1
Фильтры, фильтроэлем      1
Подшипники                1
ОборудБытовое             1
СИЗ (без размеров)        1
МатерСтроительные         1
ДатчСистемАвтоматики      1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Ремкомплект сцепл 1601003(1)-182МАЗ 53ед,З/Ч АвтомобПромышл
1,Набор уплотн г/ц НС-30/40х28х345 9ед,ИзделияРезино-технич
2,Ремкомплект цилиндра сцепл раб ГАЗ 5ед,З/Ч АвтомобПромышл
3,Набор отверток 8РК-2021 PROSKIT 8ед,Инструмент слесарный
4,Ремкомплект 207730 Graco 4ед,Подшипники
...,...,...
396,Костюм рабочий с лог х/б черный к-т 2ед,Спецодежда
397,Комплект прокладок двиг ЗМЗ 405 22ед,З/Ч АвтомобПромышл
398,Комплект кол тор п 3501800-02ГАЗ3302 4ед,З/Ч АвтомобПромышл
399,Ремкомпл тензор домкрат HTT9870.000F 2ед,З/Ч Гидрооборудован


In [None]:
# %%time
# Which groups contain names with digits followed by "амп"?
unit_re = re.compile(r'[\d]{1,}амп')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

РеактивыХимич.    21
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Образец станд Ион марганца кор 5ампул,РеактивыХимич.
1,Титр станд трилон Б 10амп,РеактивыХимич.
2,"Титр станд рН-метри рН 9,18 кор 6амп",РеактивыХимич.
3,Образец станд формальдегид кор 5амп,РеактивыХимич.
4,Титр станд натрий серноватист 10амп,РеактивыХимич.
5,Титр станд кислота соляная 10амп,РеактивыХимич.
6,Титр станд соль Мора 10амп,РеактивыХимич.
7,Титр станд натрий серноватистокис 10амп,РеактивыХимич.
8,Титр станд кислота азотная 10амп,РеактивыХимич.
9,Титр станд для pH-метрии 6амп,РеактивыХимич.


In [None]:
# %%time
# Which groups contain names with digits followed by "л"?
unit_re = re.compile(r'[\d]{1,}л')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Стройматериалы          7
Хоз.товары              6
ХозИнвентарь            5
Запчасти                4
Прод.Лако-красочная     4
ВычОргТехн и З/Ч        3
З/Ч АвтомобПромышл      3
Посуда и приборы хим    3
МатерСтроительные       2
Химпродукция            2
Масла фасовочные        2
Расходные материалы     1
Инструмент режущий      1
ХимПродОбщехимНазн      1
ОборудПрочее            1
Смазки                  1
Насосы                  1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,"Шина ц.литая с диском 12-16,5 33x6-11лев",ОборудПрочее
1,Чернила REA-JET ТНТК-SI 040 1л,Прод.Лако-красочная
2,Корзина для бумаг 9л сетчатая,Хоз.товары
3,Ведро 12л оцинк.,ХозИнвентарь
4,Бак мембранный Wester WAV 500л top,Запчасти
5,"Колер PARADE №213 зеленый 0,75л",Стройматериалы
6,Топливозаборник 1104012бак 500л КАМАЗ,З/Ч АвтомобПромышл
7,Пъедестал-лоток 2х500л Kyocera PF-471,ХозИнвентарь
8,"Пена монтажная огнеупорная 0,75л",МатерСтроительные
9,Масло Ultra-coolant 5л Selmers,Смазки


In [None]:
# %%time
# Which groups contain names with digits followed by "л/мин"?
unit_re = re.compile(r'[\d]{1,}л/мин')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Насосы    1
Name: Группа, dtype: int64


Unnamed: 0,Название,Группа
0,Насос циркуляц 12л/мин 150мбар,Насосы


In [None]:
# %%time
# Which groups contain names with digits followed by "mm" (Latin) or "мм"
# (Cyrillic)?
unit_re = re.compile(r'[\d]{1,}mm|[\d]{1,}мм')
matched = [
    (name, group)
    for name, group in zip(train['Название'], train['Группа'])
    if unit_re.search(name)
]
tmp = [name for name, _ in matched]
tmp2 = [group for _, group in matched]

tmp3 = pd.DataFrame({'Название': tmp, 'Группа': tmp2})
print(tmp3['Группа'].value_counts())
tmp3

Изделия электроустан    175
Инструмент слесарный    134
ИзделияРезино-технич    105
Запчасти                 79
ИздИзПолимеровСтанд      74
                       ... 
ОборудТехнолог            1
Теплоизоляционные         1
З/Ч Пневмооборудов        1
ЗЧ АвтоматПускКонтак      1
КнопкиПостыУправлен       1
Name: Группа, Length: 62, dtype: int64


Unnamed: 0,Название,Группа
0,"Трубка электроизоляционная ТКР ф16,0мм",Изделия электроустан
1,Рукав РВД 20-33 (М42х2) 850мм,ИзделияРезино-технич
2,Нож плоский по металлу ф300х80х30мм,Инструменты
3,Линейка измерит метал 300мм,Инструм. мерительный
4,Валик малярный 180мм (меховой),Инструменты
...,...,...
1093,Ремень бандажный СП 5500х440мм,ИзделияРезино-технич
1094,Шлифшкурка бум пласт основа ф300мм 320,Инструм. абразивный
1095,Наконечник кабельн медн ТМ-185мм2,Кабельная продукция
1096,"Текстолит лист 8,0мм ГОСТ 2910-74",Электроизоляционные


## Построение модели

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

# N-gram reminder.
# Character 2-grams of "радуга": ра, ад, ду, уг, га
# Word 2-grams of "мама мыла раму рано утром":
#   (мама мыла) (мыла раму) (раму рано) (рано утром)
# The range (1, 3) below covers unigrams, bigrams and trigrams.
ngram_range = (1, 3)

min_df = 10
max_df = 1.
max_features = 1000

# lowercase=False: the names are lower-cased by txt_prep().
tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,
    lowercase=False,
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)

# fit_transform() = fit (learn the vocabulary) + transform (apply it);
# the result is converted to a dense array.
features_train = tfidf.fit_transform(train['Название']).toarray()
labels_train = train['Группа']
print(features_train.shape)

(23973, 1000)
CPU times: user 369 ms, sys: 136 ms, total: 505 ms
Wall time: 508 ms


In [None]:
# Wrap the dense TF-IDF matrix into a DataFrame with one column per term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back to the old method on
# releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
tf_idf_df = pd.DataFrame(features_train, columns=feature_names)
tf_idf_df.head()

Unnamed: 0,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,fpm shore,fstd,fstd bcsg,fstd lcsg,gedore,graco,haeusler,hmg,hydac,hyundai,ifm,iii,ina,iso,kqh,lcsg,led,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Put the hand-made columns of `train` and the TF-IDF columns side by side
# and drop the copy of the unprocessed names.
# NOTE(review): a TF-IDF term equal to a unit column name (e.g. 'амп', 'мкф')
# would produce duplicate column labels here — confirm against the vocabulary.
combined = pd.concat([train, tf_idf_df], axis=1)
train_full = combined.drop(columns=['Название начальный вид'])
train_full

Unnamed: 0,Название,Группа,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,валок,Инструменты,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,державка mapal,Резцы,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,кабель кпсввнг,Кабельная продукция,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,трубка электроизоляционная ткр,Изделия электроустан,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609955,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,лента конвейер,ИзделияРезино-технич,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23968,фреза шпоночная,Фрезы,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.619402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23969,кирпич керам полнотел одинарный,МатерСтроительные,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23970,клеймо спл,Инструменты,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23971,элемент питания saft std,Запчасти,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.553659,0.643542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Display the assembled training frame.
# NOTE(review): the saved output shows 'Группа' already as integer codes, so
# this cell appears to have been executed after the label-encoding cell that
# follows it — confirm the intended execution order.
train_full

Unnamed: 0,Название,Группа,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,валок,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,державка mapal,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,кабель кпсввнг,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,трубка электроизоляционная ткр,24,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609955,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,лента конвейер,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23968,фреза шпоночная,88,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.619402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23969,кирпич керам полнотел одинарный,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23970,клеймо спл,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23971,элемент питания saft std,19,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.553659,0.643542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Encode the target column as integer class ids
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
train_full['Группа'] = labelencoder.fit_transform(train_full['Группа'])

# Remember which id each group name received so predictions can be decoded later
mapping = {label: code for code, label in enumerate(labelencoder.classes_)}

train_full.head()

Unnamed: 0,Название,Группа,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,валок,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,державка mapal,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,кабель кпсввнг,34,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,трубка электроизоляционная ткр,24,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.609955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,лента конвейер,25,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
 from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_full.drop(columns=['Название','Группа']), 
                                                    train_full['Группа'], 
                                                    test_size=0.3, 
                                                    random_state=8)

print(f'All train shape: {train_full.shape}')
print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

All train shape: (23973, 1017)
X train shape: (16781, 1015)
X test shape: (7192, 1015)


In [None]:
%%time
import numpy as np
from sklearn import svm
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Train a support-vector classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and training are chained.
svc = svm.SVC().fit(X_train, y_train)

# Accuracy on the held-out part of the labelled data.
accuracy_score(y_test, svc.predict(X_test))

CPU times: user 14min 29s, sys: 1.18 s, total: 14min 30s
Wall time: 14min 25s


In [None]:
# Recompute the hold-out accuracy in its own cell so the value is displayed
# (the saved output of the timed training cell contains only the timings).
accuracy_score(y_test, svc.predict(X_test))

0.6779755283648499

In [None]:
# Preview the test frame before features are generated for it.
test

Unnamed: 0,Название
0,Подшипник 3630 (22330)
1,Винт 24х110 ГОСТ11738-84(DIN 912)
2,Пускатель ПМ ГОСТО 12-025-150 220В
3,Образец станд Ш13 концентрат плавико
4,Насос A4VG180EP2DT2/32R-PZD02F691LH-S
...,...
2341,Втулка ч.0301435-30.148
2342,Фильтроэлемент 2600R005BN4HC
2343,Пила цепная электр руч UC 4010А Makita
2344,Картридж Canon PFI-107C голубой 130 мл


In [None]:
# Run the test frame through feature_generation (defined outside this excerpt).
# NOTE(review): a later cell concatenates `test`, not `tmp`, and still finds
# the generated columns, so feature_generation presumably also modifies
# `test` in place — confirm against its definition.
tmp = feature_generation(test)
tmp

Unnamed: 0,Название,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,Название начальный вид
0,подшипник,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,подшипник
1,винт гост din,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,винт гост din
2,пускатель госто,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,пускатель госто
3,образец станд концентрат плавико,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,образец станд концентрат плавико
4,насос avgepdt pzdflh,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,насос avgepdt pzdflh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2341,втулка,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,втулка
2342,фильтроэлемент rbnhc,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,фильтроэлемент rbnhc
2343,пила цепная электр руч makita,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,пила цепная электр руч makita
2344,картридж canon pfi голубой,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,картридж canon pfi голубой


In [None]:
# Vectorise the processed test names with the existing `tfidf` vectoriser;
# transform() (not fit_transform) reuses whatever vocabulary it already holds.
features_test = tfidf.transform(tmp['Название']).toarray()
print(features_test.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back to the old method only on versions
# that predate get_feature_names_out().
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_columns = tfidf.get_feature_names_out()
else:
    tfidf_columns = tfidf.get_feature_names()

tf_idf_df_test = pd.DataFrame(features_test, columns=tfidf_columns)
tf_idf_df_test.head()

(2346, 1000)


Unnamed: 0,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,fpm shore,fstd,fstd bcsg,fstd lcsg,gedore,graco,haeusler,hmg,hydac,hyundai,ifm,iii,ina,iso,kqh,lcsg,led,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.396442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Attach the TF-IDF columns to the test frame and drop both name columns so
# that only the feature columns are passed to the classifier.
test_full = pd.concat([test, tf_idf_df_test], axis=1).drop(
    columns=['Название', 'Название начальный вид']
)
test_full.head()

Unnamed: 0,кг/м3,мм2,куллон,м2/см3,вт,в,кгс/см2,кг,Gb,ед,амп,л/мин,мм,мкф,л,ancarbon,aol,aos,aos aos,art,asc,bcsg,cgnk,classic,din,dko,dkos,dpal,ecopur,ecopur nbr,ecorubber,egnk,emag,fag,fkm,flon,flon rub,fpm,fpm shore,fstd,...,шестерня,шестерня тпц,шестигранник,шестигранный,шина,шкаф,шланг,шланг вод,шлиф,шлифшкурка,шлицем,шнур,шпилька,шпиндель,шпонка,шпоночная,штанга,штангенциркуль,шток,шток хромированный,шток хромированный mnv,штока,штуцер,щетка,щит,щиток,электр,электрод,электрод сварочный,электроизоляционная,элемент,элемент питания,элемент фильтр,эмаль,энкодер,эскиз,эскиз тпц,эспц,ямз,ящик
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.396442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Predict an encoded group id for every row of the test feature frame.
answer = svc.predict(test_full)

# Wrap the predictions in a single-column frame named like the target.
answer_df = pd.DataFrame({'Группа': answer})
answer_df

Unnamed: 0,Группа
0,62
1,51
2,19
3,71
4,19
...,...
2341,19
2342,87
2343,28
2344,70


In [None]:
# Reverse lookup: encoded class id -> original group name.
inverse_dict = {code: label for label, code in mapping.items()}
inverse_dict

In [None]:
# Turn the row number into an explicit 'index' column (modifies answer_df in place).
answer_df.reset_index(inplace=True)

# Decode class ids back to group names; any id that has no entry in
# inverse_dict keeps its original value instead of becoming NaN.
encoded_groups = answer_df['Группа']
answer_df['Группа'] = encoded_groups.map(inverse_dict).fillna(encoded_groups)
answer_df

Unnamed: 0,index,Группа
0,0,Подшипники
1,1,Метизы
2,2,Запчасти
3,3,РеактивыХимич.
4,4,Запчасти
...,...,...
2341,2341,Запчасти
2342,2342,"Фильтры, фильтроэлем"
2343,2343,Инстр. электрический
2344,2344,Расходные материалы


In [None]:
# Write the decoded predictions to the submission file; index=False omits the
# DataFrame index because the row number is already stored in the 'index' column.
answer_df.to_csv("test_submit.csv", index=False)