In [1]:
import pandas as pd
import numpy as np
import json

import string

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

import pymorphy2 
morph = pymorphy2.MorphAnalyzer()

from tqdm import tqdm
tqdm.pandas()

#import tensorflow


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rusla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPooling1D, Conv1D, GlobalMaxPooling1D, Dropout, LSTM, GRU
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import utils



import matplotlib.pyplot as plt
%matplotlib online 

In [3]:
DIR = './'

with open(DIR + 'agora_hack_products/agora_hack_products.json', encoding='utf-8') as f:
   prdct = json.load(f)

df = pd.DataFrame.from_dict(prdct, orient='columns')

labels = df[df['is_reference'] == True]['product_id'].count()

df.loc[(df['is_reference'] == True),'reference_id'] = df['product_id']

In [4]:
df.sample(3)

Unnamed: 0,product_id,name,props,is_reference,reference_id
991,4ce12b22a6dfed7a,Mi Robot Vacuum-Mop 2 Pro EU White,"[Тип контейнера\tдля пыли/воды, Комплектация...",False,516c4c0cca619ea4
2146,a99c9cc7e081c1f2,Мобильный телефон Samsung Galaxy Note 20 Ultra...,"[Операционная система\tAndroid 10, SIM-карты...",False,7c078e4143695811
2772,d9fe709dfef4b09d,Роутер беспроводной Mercusys MW301R N300 10/10...,[Подключение к интернету (WAN)\tEthernet RJ-4...,False,21991eb9be11bcfe


In [5]:
def prep(df):
    
    # т.к. описание в формате list с несколькими данными объединим в один список 

    def jn(x):
        x = " ".join([ch for ch in x])
        x = str(x)
        return x

    df['props_un'] = df['props'].apply(jn)

    # добавим имя продукта для обработки

    def jn_name(x):
        x = "".join([ch for ch in x])
        x = str(x)
        return x

    df['props_un'] = df['props_un']+ ' ' +df['name'].apply(jn_name)

    # избавляемся от табуляции

    def rem_tab(x):
        x = x.replace("\t", " ")
        return x

    df['props_un'] = df['props_un'].progress_apply(rem_tab)

    # избавляемся от знаков препинания

    spec_chars = string.punctuation + '«'+ '»'+ '—'+ '"'+ '"'
    print(spec_chars)

    def rem_spec_chars(x):
        x = "".join([ch for ch in x if ch not in spec_chars])
        x = "".join([x.replace('\d+', '')])
        return x

    df['props_un'] = df['props_un'].progress_apply(rem_spec_chars)

    # переводим всё в нижний регистр

    def low(x):
        x = list(x.split())
        x = [w.lower() for w in x]
        return x

    df['props_un'] = df['props_un'].progress_apply(low)

    # удаляем стоп слова

    def stop_words(x):
        new_x = []
        for w in x:
            if w not in russian_stopwords:
                new_x.append(w)
        return new_x

    df['props_un'] = df['props_un'].progress_apply(stop_words)

    # лемматизируем текст

    def lem(x):
        #x = list(x.split())
        x = [morph.parse(w)[0].normal_form for w in x]
        return x

    df['props_un'] = df['props_un'].progress_apply(lem)

In [6]:
prep(df)

100%|██████████| 3251/3251 [00:00<00:00, 239520.85it/s]


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~«»—""


100%|██████████| 3251/3251 [00:00<00:00, 16991.38it/s]
100%|██████████| 3251/3251 [00:00<00:00, 49492.69it/s]
100%|██████████| 3251/3251 [00:00<00:00, 10290.72it/s]
100%|██████████| 3251/3251 [00:24<00:00, 133.67it/s]


In [7]:
unique_words = {}
for i in tqdm(df.index):
    for j in set(df['props_un'].loc[i]):
        if len(j) == 1:
            pass
        else:
            if j in unique_words.keys():
                unique_words[j] += 1
            else:
                unique_words[j] = 1

100%|██████████| 3251/3251 [00:00<00:00, 11993.56it/s]


In [8]:
pop_words = []
for i in tqdm(unique_words.keys()):
    if unique_words[i] < 4:
        pass
    elif unique_words[i] > len(unique_words)*0.9:
        pass
    else:
        pop_words.append(i)

100%|██████████| 4160/4160 [00:00<00:00, 539393.61it/s]


In [9]:
# Максимальное количество слов 
num_words = len(pop_words)
# Максимальная длина новости
max_news_len = 67
# Количество классов новостей
nb_classes = labels

In [10]:
# разделим трейн и тест

X = df['props_un']
y = df['reference_id']

In [11]:
y = pd.get_dummies(y)

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2)

In [13]:
y_train = np.array(y_train)
y_test = np.array(y_test)

In [14]:
# делаем токенизатор на списке всех слов и сохраним его в отдельном файле

tokenizer = Tokenizer(num_words=num_words)

tokenizer.fit_on_texts(df['props_un'])

with open("./output/tokinaizer.json", "w") as outfile:
    json.dump(tokenizer.word_index, outfile)

In [15]:
# токенизируем трейн и тест

train_sequences = tokenizer.texts_to_sequences(X_train)

x_train = pad_sequences(train_sequences, maxlen=max_news_len)

test_sequences = tokenizer.texts_to_sequences(X_test)

x_test = pad_sequences(test_sequences, maxlen=max_news_len)

In [16]:
# Сверточная нейронная сеть

model_cnn = Sequential()
model_cnn.add(Embedding(num_words, 128, input_length=max_news_len))
model_cnn.add(Conv1D(1024, 5, padding='valid', activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(512, activation='relu'))
model_cnn.add(Dense(471, activation='softmax'))

In [17]:
model_cnn.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [18]:
history_cnn = model_cnn.fit(x_train, 
                            y_train, 
                            epochs=1,
                            batch_size=64,
                            validation_split=0.02)



In [19]:
with open(DIR + 'agora_hack_products/test_request.json', encoding='utf-8') as f:
   tst = json.load(f)

test = pd.DataFrame.from_dict(tst, orient='columns')

In [20]:
prep(test)

100%|██████████| 200/200 [00:00<00:00, 199919.16it/s]


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~«»—""


100%|██████████| 200/200 [00:00<00:00, 18064.10it/s]
100%|██████████| 200/200 [00:00<00:00, 98968.95it/s]
100%|██████████| 200/200 [00:00<00:00, 7322.27it/s]
100%|██████████| 200/200 [00:01<00:00, 132.53it/s]


In [21]:
test_2_sequences = tokenizer.texts_to_sequences(test['props'])

x_test_2 = pad_sequences(test_2_sequences, maxlen=max_news_len)

In [22]:
model_cnn.load_weights('./output/best_model_cnn.h5')

ImportError: `load_weights` requires h5py package when loading weights from HDF5. Try installing h5py.

In [23]:
pip uninstall h5py

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
conda install h5py