In [17]:
import os
import re
from copy import copy, deepcopy

from catboost import CatBoostRanker, Pool, cv
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.model_selection import train_test_split
from transliterate import translit, get_available_language_codes
from tqdm import tqdm
tqdm.pandas(desc='progress')
from nltk.corpus import stopwords
from pymystem3 import Mystem
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, accuracy_score
from joblib import Parallel, delayed
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import pdist, directed_hausdorff as dh, squareform
from sklearn.metrics.pairwise import euclidean_distances as ed, cosine_distances
from sklearn.ensemble import HistGradientBoostingClassifier as hgbc
import lightgbm as lgb
import pickle

# Seed constant (presumably meant for reproducible randomness); none of the
# visible cells reference it.
SEED = 616

In [2]:
# Server-side dataset directory.
pth1 = '/datasets/'
# Local (developer machine) dataset directory.
pth2 = 'C:\\Dev\\practicum\\datasets\\'

# Pick the first directory that exists. Failing loudly here is deliberate:
# otherwise the three frames would be left undefined and the cell would die
# on its last line with an unrelated NameError.
data_dir = next((pth for pth in (pth1, pth2) if os.path.exists(pth)), None)
if data_dir is None:
    raise FileNotFoundError(
        f'Dataset directory not found. Checked: {pth1!r}, {pth2!r}'
    )

# Read options shared by all three CSV files.
csv_options = {'sep': ';', 'on_bad_lines': 'skip'}

# NOTE(review): the two locations read marketing_product.csv differently
# (header row only on the server, first column as the index locally).
# That difference is preserved as-is; confirm whether it is intentional.
product_options = {'header': 0} if data_dir == pth1 else {'index_col': 0}

# A missing file inside an existing directory now surfaces as the original
# FileNotFoundError (with the offending path) instead of being printed away.
df_product = pd.read_csv(
    data_dir + 'marketing_product.csv', **csv_options, **product_options
)
df_product_dealer_key = pd.read_csv(
    data_dir + 'marketing_productdealerkey.csv', header=0, **csv_options
)
df_dealer_price = pd.read_csv(
    data_dir + 'marketing_dealerprice.csv', header=0, **csv_options
)

[df_product.shape, df_product_dealer_key.shape, df_dealer_price.shape]

[(496, 14), (1700, 4), (20416, 7)]

In [3]:
# Keep only the columns needed for matching and drop products without a 1C
# name. reset_index(drop=True) renumbers the rows directly, so the result does
# not depend on the old index being unnamed (the previous reset_index() +
# drop(['index']) pair did).
data_product = (
    df_product
    .loc[df_product['name_1c'].notna(), ['id', 'name_1c', 'recommended_price']]
    .reset_index(drop=True)
)
data_product.shape

(485, 3)

In [4]:
# Text-cleaning helper used for both catalogue and dealer product names.
def clear_text(text):
    """Normalise a product name into lowercase, space-separated word tokens.

    Steps, in order: doubled double-quotes and slashes become spaces; glued
    Latin/Cyrillic and upper/lower Cyrillic word boundaries are split with a
    space; every remaining non-word character becomes a space; digits are
    deleted; the text is lowercased and whitespace runs are collapsed.

    Parameters
    ----------
    text : str
        Raw product name.

    Returns
    -------
    str
        Cleaned name with single spaces between tokens and no leading or
        trailing whitespace.
    """
    # Replacement templates are raw strings: in a plain literal the group
    # reference starts with an invalid escape sequence that Python warns about.
    text = re.sub(r'"{2}|[/]', ' ', text)  # doubled quotes and slashes -> space
    text = re.sub(r'(?<=[а-я])[A-Z]', r' \g<0>', text)  # split lowercase Cyrillic + uppercase Latin
    text = re.sub(r'[A-Za-z](?=[а-я])', r'\g<0> ', text)  # split Latin letter + lowercase Cyrillic
    text = re.sub(r'(?<=[А-Я]{2})[а-я]', r' \g<0>', text)  # split two uppercase Cyrillic + lowercase Cyrillic
    text = re.sub(r'\W', ' ', text)  # punctuation and other non-word characters -> space
    text = re.sub(r'\d', '', text)  # digits are removed outright
    text = text.lower()
    return ' '.join(text.split())

In [5]:
# Add a normalised version of every 1C product name next to the raw one.
data_product['clear_name'] = [
    clear_text(raw_name) for raw_name in data_product['name_1c']
]
data_product.shape

(485, 4)

In [6]:
# Labels are the catalogue product ids; features are TF-IDF vectors of the
# cleaned catalogue names. The vectoriser is fitted on the catalogue only.
y_train = data_product['id']
count_tf_idf = TfidfVectorizer()
X_train_tfidf = count_tf_idf.fit_transform(data_product['clear_name'])

In [7]:
# 5-nearest-neighbour classifier over the TF-IDF catalogue vectors, using
# cosine distance and distance-based vote weights. The fit call is the cell's
# last expression, so the fitted estimator is what the cell displays.
knn_model = KNeighborsClassifier(
    metric='cosine',
    weights='distance',
    n_neighbors=5,
)
knn_model.fit(X_train_tfidf, y_train)

In [8]:
# Keep only the dealer-price columns used below.
df_dealer_price = df_dealer_price[['product_key', 'price', 'product_name']]
# Give the key table the same join-column name as the dealer-price table.
df_product_dealer_key.rename(columns={'key':'product_key'}, inplace=True)

# Attach product_id to every dealer offer, remove the helper columns, then
# discard rows that have a missing value in any remaining column.
data_dealer_price = (
    df_dealer_price
    .merge(df_product_dealer_key, how='left', on='product_key')
    .drop(columns=['product_key', 'id', 'dealer_id'])
    .dropna()
)
# product_id comes out of the left merge as float; cast it to an integer dtype.
data_dealer_price['product_id'] = data_dealer_price['product_id'].astype('int', errors='ignore')
data_dealer_price.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17567 entries, 0 to 20413
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         17567 non-null  float64
 1   product_name  17567 non-null  object 
 2   product_id    17567 non-null  int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 480.3+ KB


In [9]:
# Normalise dealer product names with the same cleaner applied to the catalogue.
data_dealer_price['clear_name'] = [
    clear_text(raw_name) for raw_name in data_dealer_price['product_name']
]
data_dealer_price.shape

(17567, 4)

In [11]:
# Vectorise the cleaned dealer names with the already-fitted TF-IDF
# vectoriser (transform only, no refit).
X_test_tfidf = count_tf_idf.transform(data_dealer_price['clear_name'])

In [12]:
# Predicted catalogue product id for every dealer offer (the classifier's
# labels are the catalogue ids). `pred` is not used by the later visible cells.
pred = knn_model.predict(X_test_tfidf)
pred

array([ 12,  45,  18, ..., 300, 416,  56], dtype=int64)

In [14]:
# For each dealer offer, the positions (rows of the training matrix) of its
# 5 nearest catalogue items; distances are not requested.
neighbors = knn_model.kneighbors(X_test_tfidf, n_neighbors=5, return_distance=False)

In [15]:
# Build the (query, candidate, hit) table in one vectorised step instead of
# appending one row at a time through the private DataFrame._append, which
# re-copies the growing frame on every call.
neighbor_idx = np.asarray(neighbors)
# True product id of each dealer offer, one per row of `neighbors`.
query_ids = data_dealer_price['product_id'].to_numpy()[:len(neighbor_idx)]
# Catalogue ids of the retrieved neighbours, same shape as `neighbors`.
base_ids = data_product['id'].to_numpy()[neighbor_idx]

neighbors_table = pd.DataFrame({
    # Each query id is repeated once per retrieved neighbour.
    'query_id': np.repeat(query_ids, neighbor_idx.shape[1]),
    'base_id': base_ids.ravel(),
    # 1 when the retrieved catalogue id equals the offer's true id, else 0.
    'target': (base_ids == query_ids[:, None]).astype(int).ravel(),
})
neighbors_table

Unnamed: 0,query_id,base_id,target
0,12,12,1
1,12,15,0
2,12,13,0
3,12,5,0
4,12,4,0
...,...,...,...
87830,405,57,0
87831,405,56,0
87832,405,58,0
87833,405,52,0


In [16]:
# Number of hit rows divided by the number of queries (rows / 5). Assuming at
# most one hit per query, this is the share of dealer offers whose true
# product id appears among their 5 retrieved neighbours.
is_hit = neighbors_table['target'] == 1
success = is_hit.sum()
n_queries = len(neighbors_table) / 5
precis_at_5 = success / n_queries
precis_at_5

0.839300962031081

Global test

In [19]:
# Persist the fitted KNN model to knn_model.pkl in the working directory.
# NOTE(review): the fitted vectoriser `count_tf_idf` is not saved here, yet the
# following cells still need it to turn text into model inputs.
with open('knn_model.pkl', 'wb') as file:
    pickle.dump(knn_model, file)

In [20]:
# Read the pickled model back into `clf`.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('knn_model.pkl', 'rb') as file:
    clf = pickle.load(file)

In [28]:
# Take the first dealer offer as a single test query and vectorise its
# cleaned name. The list is called `test_input` rather than `input` so the
# built-in input() is not shadowed for the rest of the notebook.
test_row = data_dealer_price.iloc[0]
test_input = [test_row['clear_name']]
test_row_tfidf = count_tf_idf.transform(test_input)

In [32]:
# Positions of the 5 nearest catalogue items for the single test query,
# obtained from the reloaded model.
nghbrs = clf.kneighbors(test_row_tfidf, n_neighbors=5, return_distance=False)

In [35]:
# Neighbour positions for the (only) test query.
nghbrs[0]

array([477, 167, 332, 435, 424], dtype=int64)

In [33]:
# Candidate table for the single test query, built in one step instead of
# row-by-row through the private DataFrame._append.
nghbr_idx = np.asarray(nghbrs)
# The query id is read from the row that was actually vectorised, so the
# table stays correct if `test_row` is pointed at a different offer.
test_query_id = test_row['product_id']
# Catalogue ids of the retrieved neighbours, same shape as `nghbrs`.
test_base_ids = data_product['id'].to_numpy()[nghbr_idx]

nghbrs_table = pd.DataFrame({
    'query_id': np.full(test_base_ids.size, test_query_id),
    'base_id': test_base_ids.ravel(),
    # 1 when the retrieved catalogue id equals the query's true id, else 0.
    'target': (test_base_ids == test_query_id).astype(int).ravel(),
})
nghbrs_table

Unnamed: 0,query_id,base_id,target
0,12,12,1
1,12,15,0
2,12,13,0
3,12,5,0
4,12,4,0


In [37]:
# Catalogue ids suggested for the test query, as a plain list.
list(nghbrs_table.base_id.values)

[12, 15, 13, 5, 4]

In [39]:
# Ad-hoc query for a single, hand-written product name.
product_name = 'Очиститель фасадов SALT CLEANER концентрат 1:2 / 5 л'
# Clean the name the same way the catalogue and dealer names were cleaned:
# the vectoriser was fitted on clear_text() output, not on raw names.
product_name_tfidf = count_tf_idf.transform([clear_text(product_name)])
# Alternative sample name; as the cell's last expression it is what the cell displays.
'Средство универсальное Prosept Universal Spray, 500мл'

In [42]:
# Inspect one dealer offer by position. Only the last expression of a cell is
# displayed, so the result of the first line is discarded.
data_dealer_price.product_name.iloc[15000]
data_dealer_price.iloc[15000]

price                                                       856.0
product_name    Просепт Professional Bath Acid средство для ух...
product_id                                                     51
clear_name      просепт professional bath acid средство для ух...
Name: 17406, dtype: object