In [86]:
import pandas as pd
import numpy as np
import re
import os
import json

#import nltk
#from nltk import stopwords as nltk_stopwords
#nltk.download('stopwords')
from datetime import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Directory components of the data folder; joined with os.path.join below.
path = ['Datasets']


def _read_marketing_csv(file_name):
    """Read one semicolon-separated CSV file located under ``path``."""
    return pd.read_csv(os.path.join(*path, file_name), sep=';')


# Products from the producer.
df_product = _read_marketing_csv('marketing_product.csv')

# Products from dealers.
df_dealerprice = _read_marketing_csv('marketing_dealerprice.csv')

# Read from marketing_productdealerkey.csv; its `key` and `product_id` columns
# are used further down when the evaluation frame is built.
df_productdealerkey = _read_marketing_csv('marketing_productdealerkey.csv')


In [76]:
# df_res will hold the recommendations.
# An explicit copy is taken because new columns are written into df_res further
# down; writing into a bare column subset of df_dealerprice raises
# SettingWithCopyWarning (hidden here by warnings.filterwarnings("ignore")).
df_res = df_dealerprice[['id', 'product_key']].copy()

In [4]:
def clean_texts(name):
    '''
    Normalise a product name for text matching.

    Takes a product name (string) and returns it edited as follows:
    runs of digits are separated from the surrounding text by spaces, the
    text is lower-cased, "ё" is folded to "е", every run of characters other
    than а-я, a-z, digits and whitespace is replaced by a space, and the
    substrings "prosept", "просепт" and "professional" are replaced by a space.

    Values that pd.isna reports as missing (NaN, None) yield ''.
    '''
    if pd.isna(name):
        return ''

    # Surround every run of digits with spaces so numbers stand apart from letters.
    name = ' '.join(re.split(r"([0-9]+)", name))
    # Lower case.
    name = name.lower()
    # "ё" is not inside the а-я range used below, so without this step it would
    # be treated as punctuation and a word such as "чёрный" would be cut in two.
    name = name.replace('ё', 'е')
    # Replace punctuation and any other unexpected characters with a space.
    name = re.sub(r"[^а-яa-z\d\s]+", ' ', name)
    # Remove the brand words.
    name = re.sub(r"prosept|просепт|professional", ' ', name)
    return name

In [5]:
# TF-IDF vectorizer working on characters (analyzer='char') with n-gram sizes
# 1 to 2; max_df=0.6 is handed to TfidfVectorizer as the document-frequency cap.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
    max_df=0.6,
)

In [6]:
# Every column that carries a product name is used to build the source
# product text.
columns = [
    'name',
    'ozon_name',
    'name_1c',
    'wb_name',
]

In [7]:
def t_fit_tfidf(df,func=clean_texts,df_columns=['name']):
    """
    Fit the module-level ``vectorizer`` on text assembled from columns of ``df``.

    Every column listed in ``df_columns`` is passed element-wise through
    ``func``; the cleaned columns are then concatenated row by row, in the
    given order, with a single space between neighbours.

    Parameters:
        df: DataFrame holding the columns in ``df_columns`` plus ``id`` and ``name``.
        func: callable applied to each value of the selected columns.
        df_columns: names of the text columns to combine.

    Returns:
        A tuple of the ``vectorizer.fit_transform`` result for the combined
        text and ``df[['id', 'name']]``.
    """
    cleaned_columns = [df[column_name].apply(func) for column_name in df_columns]

    combined_text = cleaned_columns[0]
    for extra_text in cleaned_columns[1:]:
        combined_text = combined_text + ' ' + extra_text

    tfidf_matrix = vectorizer.fit_transform(combined_text)
    return tfidf_matrix, df[['id', 'name']]

In [8]:
%%time
# Build the vectors from the df_product dataframe.
product_vec, df_product_tfidf = t_fit_tfidf(
    df=df_product,
    func=clean_texts,
    df_columns=columns,
)

CPU times: user 185 ms, sys: 7.01 ms, total: 192 ms
Wall time: 196 ms


In [9]:
def t_predict_tfidf(dealer_names):
    """
    Score dealer product names against the fitted product vectors.

    ``dealer_names`` is cleaned element-wise with ``clean_texts``, transformed
    by the module-level ``vectorizer`` and compared with the module-level
    ``product_vec``.

    Returns:
        The result of ``cosine_similarity(dealer_vec, product_vec)``.
    """
    cleaned_names = dealer_names.apply(clean_texts)
    dealer_vec = vectorizer.transform(cleaned_names)
    similarity = cosine_similarity(dealer_vec, product_vec)
    return similarity

In [10]:
%%time
# Compute the matrix of cosine similarities between dealer product names and
# the product vectors.
df_predict_tfidf = t_predict_tfidf(dealer_names=df_dealerprice['product_name'])

CPU times: user 2.79 s, sys: 151 ms, total: 2.94 s
Wall time: 2.98 s


In [61]:
# For every row: column indices of the N_BEST best matches, best first,
# together with the similarity values found at those positions.
N_BEST = 10
_ascending_order = np.argsort(df_predict_tfidf, axis=-1)
_top_ascending = _ascending_order[:, -N_BEST:]
indices = _top_ascending[:, ::-1]
quality = np.take_along_axis(df_predict_tfidf, indices, axis=1)

In [77]:
# For all names: store each row's candidate indices and their scores as lists.
for _column_name, _values in (('predict', indices), ('quality', quality)):
    df_res.loc[:, _column_name] = _values.tolist()

In [78]:
# Dataframe used for testing: predictions left-joined with the
# (key, product_id) pairs from df_productdealerkey.
_known_matches = df_productdealerkey[['key', 'product_id']]
_predictions = df_res[['product_key', 'predict']]
df_test = _predictions.merge(
    _known_matches,
    how='left',
    left_on=['product_key'],
    right_on=['key'],
)

In [79]:
# Rank positions 1..N_BEST, one separate list per row of df_res.
_rank_positions = list(range(1, N_BEST + 1))
df_res['queue'] = [_rank_positions.copy() for _ in range(len(df_res))]

In [80]:
# Unpack the three list columns together: one row per (dealer item, candidate).
df_res = df_res.explode(['predict', 'quality', 'queue'])
df_res = df_res.reset_index(drop=True)
# 'predict' holds row positions within df_product, so the ids are looked up by
# position (.iloc) rather than by index label (.loc); the two only coincide
# while df_product keeps its default 0..n-1 index.
tmp_df = (df_product['id']
          .iloc[df_res['predict'].to_numpy(dtype=int)]
          .reset_index(drop=True))
df_res['product_id'] = tmp_df
df_res = df_res.drop('predict', axis=1)
# Timestamp shared by every row produced in this run.
df_res['create_date'] = datetime.now()
df_res

Unnamed: 0,id,product_key,quality,queue,product_id,create_date
0,2,546227,0.808849,1,12,2023-12-07 19:59:27.616328
1,2,546227,0.717322,2,13,2023-12-07 19:59:27.616328
2,2,546227,0.681078,3,15,2023-12-07 19:59:27.616328
3,2,546227,0.674129,4,5,2023-12-07 19:59:27.616328
4,2,546227,0.656206,5,4,2023-12-07 19:59:27.616328
...,...,...,...,...,...,...
204155,20570,1077090171,0.386304,6,235,2023-12-07 19:59:27.616328
204156,20570,1077090171,0.385264,7,428,2023-12-07 19:59:27.616328
204157,20570,1077090171,0.381065,8,234,2023-12-07 19:59:27.616328
204158,20570,1077090171,0.377839,9,431,2023-12-07 19:59:27.616328


## Check accuracy

In [81]:
def get_product_ids(predict_list):
    """
    Translate row positions in ``df_product`` into product ids.

    Parameters:
        predict_list: iterable of integer row positions.

    Returns:
        A list with the ``id`` value found at each position, in the same order.
    """
    # Fetch the id column as a positional array once per call instead of
    # rebuilding a re-indexed Series for every single element.
    product_ids = df_product['id'].to_numpy()
    return [product_ids[i] for i in predict_list]

df_test['pred_product_id'] = df_test['predict'].apply(get_product_ids)

In [82]:
# Keep only rows without missing values in any column, renumber them from 0,
# then print the frame summary.
_complete_rows = df_test.dropna()
df_test = _complete_rows.reset_index(drop=True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17567 entries, 0 to 17566
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product_key      17567 non-null  object 
 1   predict          17567 non-null  object 
 2   key              17567 non-null  object 
 3   product_id       17567 non-null  float64
 4   pred_product_id  17567 non-null  object 
dtypes: float64(1), object(4)
memory usage: 686.3+ KB


In [83]:
# accuracy@k (k defaults to 5)
def t_compare(df, k=5):
    """
    Tell whether the true product id is among the first ``k`` predictions.

    Parameters:
        df: a row (or any mapping) with keys ``product_id`` and
            ``pred_product_id``; the latter is a sliceable sequence of ids.
        k: how many leading predictions to look at; defaults to 5.

    Returns:
        True when ``df['product_id']`` occurs in the first ``k`` predictions.
    """
    return df['product_id'] in df['pred_product_id'][0:k]

# accuracy@1
def t_first(df):
    """
    Tell whether the first prediction equals the true product id.

    ``df`` is a row (or any mapping) with keys ``product_id`` and
    ``pred_product_id``; the latter must contain at least one element.
    """
    actual_id = df['product_id']
    top_prediction = df['pred_product_id'][0]
    return actual_id == top_prediction

In [84]:
# Average position of the correct product in the ranked list.
def mean_reciprocal_rank(real_id, prediction_id, k=5):
    """
    MRR - mean reciprocal rank for the recommendation task.

    Parameters:
        real_id: sequence of the correct ids supplied by the customer.
        prediction_id: sequence of predictions; each item is a list of
            predicted ids, best first.
        k: number of leading predictions taken into account per item.

    Returns:
        The mean over all items of 1/rank of the correct id within the first
        ``k`` predictions, counting 0 for items where it is absent.

    The two sequences are paired by position, so ``real_id`` may be a list, an
    array or a pandas Series with any index.
    """
    reciprocal_ranks = []

    for relevant, rec in zip(real_id, prediction_id):
        # Compare as an array: a plain list compared with a scalar yields a
        # single False instead of an element-wise result.
        recs = np.asarray(rec[:k])
        hit_positions = np.where(np.atleast_1d(recs == relevant))[0]

        if hit_positions.size:
            # Positions are 0-based, ranks are 1-based.
            reciprocal_ranks.append(1 / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0)

    return np.mean(reciprocal_ranks)

In [85]:
# Score the test frame: share of rows accepted by t_compare, share accepted by
# t_first, and the mean reciprocal rank.
_eval_frame = df_test[['product_id', 'pred_product_id']]
_n_rows = len(df_test)

acc = _eval_frame.apply(t_compare, axis=1).sum() / _n_rows
print(f'Accuracy@5:{acc:.4}')

first = _eval_frame.apply(t_first, axis=1).sum() / _n_rows
print(f'Accuracy@1:{first:.4}')

mrr = mean_reciprocal_rank(df_test['product_id'], df_test['pred_product_id'])
print(f'MRR:{mrr:0.4}')

Accuracy@5:0.9321
Accuracy@1:0.7397
MRR:0.8164
