Load all data and reduce the `specifications` column

In [None]:
# Environment setup: install the third-party packages imported by later cells
# (sentence_transformers, umap, hdbscan). Versions are not pinned here.
!pip install -U sentence-transformers
!pip install umap-learn
!pip install hdbscan

In [65]:
import pandas as pd
import json
import ast
import re


In [83]:
import nltk
import string
from sentence_transformers import SentenceTransformer

In [None]:
# Load the raw smartphone dataset from a hard-coded Google Drive path.
df_all = pd.read_csv('/content/drive/MyDrive/diploma/data/all_wb_dns_smartphones.csv')
# 'Unnamed: 0' is presumably the index column saved by an earlier to_csv call — TODO confirm.
df_all = df_all.drop(columns = ['Unnamed: 0'])
df_all.head()

In [5]:
df_all.shape

(7662, 10)

In [16]:
df_all.drop_duplicates().shape

(7662, 10)

In [41]:
def process_spec(spec):
    """Reduce a serialized specification mapping to a small, sorted subset.

    Args:
        spec: String holding a Python literal of the form
            ``{block_name: [[key, value, ...], ...], ...}``.

    Returns:
        ``str()`` of a dict that contains only the lower-cased keys matching
        the attribute pattern below, with double quotes stripped from the
        lower-cased values and keys in ascending order.
    """
    parsed = ast.literal_eval(spec)

    # Flatten every block into one mapping; entries with fewer than two
    # elements are ignored and later duplicates of a key win.
    flattened = {
        entry[0].lower(): entry[1].lower()
        for section in parsed
        for entry in parsed[section]
        if len(entry) > 1
    }

    wanted = re.compile(r'(модель|объем|предмета|вес|диагональ|страна|камер|емкость аккумулятора|sim|код производителя)')
    kept = {
        name: value.replace('"', "")
        for name, value in flattened.items()
        if wanted.search(name.lower())
    }

    ordered = {name: kept[name] for name in sorted(kept)}
    return str(ordered)



def reduce_spec(df,column):
    """Replace ``df[column]`` with its reduced form and persist the frame.

    Each value of the column is passed through ``process_spec``. The input
    frame is modified in place, written to
    ``data/all_wb_dns_smartphones_reduce_spec.csv`` (with the index) and
    returned.
    """
    reduced_column = df[column].apply(process_spec)
    df[column] = reduced_column

    df.to_csv('data/all_wb_dns_smartphones_reduce_spec.csv')
    return df

# reduce_spec(df_all, 'specifications')

Создание двух новых столбцов - код производителя и модель

In [36]:
import pandas as pd
# Reload the frame from the same path that reduce_spec writes to.
# NOTE(review): reduce_spec saves with the index, so this frame presumably
# carries an extra 'Unnamed: 0' column — confirm before relying on column counts.
df_all = pd.read_csv('data/all_wb_dns_smartphones_reduce_spec.csv')

In [42]:
def get_models(df):
    """Extract the 'модель' (model) value from every row's specifications.

    Args:
        df: DataFrame with a ``specifications`` column holding the string
            form of a dict (as produced by ``process_spec``).

    Returns:
        List with one entry per row of ``df``: the model name, or ``""``
        when the specification has no 'модель' key.
    """
    models = []
    # Iterate the frame that was passed in (previously the global ``df_all``
    # was used, so the argument was silently ignored).
    for raw_spec in df['specifications']:
        spec = ast.literal_eval(raw_spec)
        models.append(spec.get('модель', ""))
    return models

def get_identifiers(df):
    """Extract the 'код производителя' (manufacturer code) for every row.

    Args:
        df: DataFrame with a ``specifications`` column holding the string
            form of a dict (as produced by ``process_spec``).

    Returns:
        List with one entry per row of ``df``: the manufacturer code with
        square brackets removed, or ``""`` when the key is absent.
    """
    identifiers = []
    # Iterate the frame that was passed in (previously the global ``df_all``
    # was used, so the argument was silently ignored).
    for raw_spec in df['specifications']:
        spec = ast.literal_eval(raw_spec)
        if 'код производителя' in spec:
            identifiers.append(re.sub(r'[\[\]]', "", spec['код производителя']))
        else:
            identifiers.append("")
    return identifiers



In [24]:
df_all.shape

(7662, 12)

Генерация пар по столбцу модель

In [56]:
def generate_pairs(df, column_name, sequence):
    """Build every unordered product pair that shares a value of ``column_name``.

    Args:
        df: DataFrame with at least ``id``, ``title`` and ``column_name`` columns.
        column_name: Column whose equal values define a group.
        sequence: Iterable of group values to generate pairs for.

    Returns:
        List of dicts with keys ``id_1``, ``title_1``, ``id_2``, ``title_2``
        and ``column_name``; one dict per pair (first row before second row
        in frame order). The number of pairs is also printed.
    """
    pairs = []
    for elem in sequence:
        group = df[df[column_name] == elem]
        ids = group['id'].tolist()
        titles = group['title'].tolist()

        for first in range(len(ids)):
            # Walk ids and titles with the same offset so that title_2 always
            # belongs to id_2 (the titles used to be sliced from index 1
            # regardless of the current row, which mislabelled groups of 3+).
            for second in range(first + 1, len(ids)):
                pairs.append({
                    'id_1': ids[first], 'title_1': titles[first],
                    'id_2': ids[second], 'title_2': titles[second],
                    column_name: elem,
                })

    print(len(pairs))
    return pairs

In [44]:
def generate_pairs_by_model(df):
    """Generate candidate pairs for models that occur exactly twice.

    Rows are grouped by ``model``; only non-empty model names whose ``sku``
    count equals 2 are kept (this limits the number of generated pairs).
    The pairs come from ``generate_pairs``, get a running ``pair_id`` and a
    ``match_type`` of -1, are written to ``data/pairs_by_model.csv`` and
    returned as a DataFrame.
    """
    counts = df.groupby(['model'], as_index = False).count()

    has_two_products = counts['sku'] == 2
    has_model_name = counts['model'] != ''
    selected = counts[has_two_products & has_model_name].sort_values(by=['sku'], ascending=False)

    model_names = selected['model'].tolist()
    df_pairs_model = pd.DataFrame(generate_pairs(df, 'model', model_names))

    df_pairs_model['pair_id'] = range(len(df_pairs_model))
    df_pairs_model['match_type'] = -1

    df_pairs_model.to_csv('data/pairs_by_model.csv', index = False)
    return df_pairs_model




Генерация пар по столбцу код производителя

In [60]:
def generate_pairs_by_identifier(df):
    """Generate candidate pairs for products sharing a manufacturer code.

    Rows are grouped by ``identifier``; non-empty identifiers whose ``sku``
    count is greater than 1 are kept, ordered by that count (descending).
    The pairs come from ``generate_pairs``, get a running ``pair_id`` and a
    ``match_type`` of -1, are written to ``data/pairs_by_identifier.csv``
    and returned as a DataFrame.
    """
    counts = df.groupby(['identifier'], as_index = False).count()

    is_repeated = counts['sku'] > 1
    has_identifier = counts['identifier'] != ''
    selected = counts[is_repeated & has_identifier].sort_values(by=['sku'], ascending=False)

    identifier_values = selected['identifier'].tolist()
    df_pairs_ident = pd.DataFrame(generate_pairs(df, 'identifier', identifier_values))

    df_pairs_ident['pair_id'] = range(len(df_pairs_ident))
    df_pairs_ident['match_type'] = -1

    df_pairs_ident.to_csv('data/pairs_by_identifier.csv', index = False)
    return df_pairs_ident

Генерация пар по модели, объему оперативной памяти, объему встроенной памяти, емкости аккумулятора и диагонали экрана

In [46]:
def get_prods_with_info(df):
    """Collect normalized key attributes for products that expose all four.

    A row is used only when its specification keys contain exactly four
    matches of the attribute pattern AND the four exact keys are present.
    The second condition prevents an ``AttributeError`` on rows where a key
    merely contains one of the names (e.g. with a unit suffix); such rows
    are skipped.

    Args:
        df: DataFrame with ``id``, ``model`` and ``specifications`` columns;
            ``specifications`` holds the string form of a dict.

    Returns:
        DataFrame with columns ``id``, ``model``, ``diag`` (first decimal
        number of the screen diagonal, or ``""`` if none is found), ``akk``,
        ``bim`` and ``ram`` (all strings taken from the specification).
    """
    diag_key = 'диагональ экрана'
    akk_key = 'емкость аккумулятора'
    bim_key = 'объем встроенной памяти (гб)'
    ram_key = 'объем оперативной памяти (гб)'
    required_keys = (diag_key, akk_key, bim_key, ram_key)
    key_pattern = r'(диагональ экрана|емкость аккумулятора|объем встроенной памяти \(гб\)|объем оперативной памяти \(гб\))'

    prods = []
    for _, row in df.iterrows():
        spec = ast.literal_eval(row['specifications'])
        keys = re.findall(key_pattern, ','.join(spec.keys()))

        if len(keys) != 4 or not all(key in spec for key in required_keys):
            continue

        # Normalize the decimal comma and drop the inch marks before
        # extracting the number.
        diag_text = spec[diag_key].replace(',', ".").split(' ')[0].replace("''", "")
        diag_match = re.search(r'\d+\.\d+', diag_text)

        prods.append({
            'id': row['id'],
            'model': row['model'],
            'diag': diag_match.group() if diag_match else "",
            'akk': spec[akk_key].split(";")[0].split(' ')[0],
            'bim': spec[bim_key].replace('гб', "").replace('gb', "").strip().split(";")[0],
            'ram': spec[ram_key].replace('гб', "").replace('gb', "").strip().split(";")[0],
        })

    return pd.DataFrame(prods)

In [47]:
def generate_pairs_by_model_bim_ram(df):
    """Pair products that agree on model, diagonal, battery, storage and RAM.

    The attributes come from ``get_prods_with_info``. Products are grouped
    by (model, diag, akk, bim, ram); only groups with exactly two ids and a
    non-empty ``diag`` and ``model`` are kept. Each such group yields one
    row: the larger id as ``id_1`` and the smaller as ``id_2``, plus
    ``match_type`` (-1) and a running ``pair_id``. The result is written to
    ``data/pairs_by_model_diag_akk_bim_ram.csv`` and returned.
    """
    df = get_prods_with_info(df)
    # Per group: smallest id, largest id and number of ids.
    agg_func_max_min = {'id': ['min', 'max', 'count']}
    # The list-valued aggregation yields two-level columns ('id', 'min'), ...
    groups_ident = df.groupby(['model','diag','akk','bim','ram'], as_index = False, group_keys=True).agg(agg_func_max_min)
    groups_ident = groups_ident[(groups_ident['diag']!="") & (groups_ident['model'] != '')& (groups_ident['id']['count'] == 2)]

    # With exactly two ids per group, min and max are the two members.
    pairs = groups_ident['id'][['min','max']]
    pairs = pairs.rename(columns={"max": "id_1", "min": "id_2"})
    # NOTE(review): -1 presumably marks the pair as not yet labelled — confirm.
    pairs['match_type'] = -1
    pairs['pair_id'] = range(pairs.shape[0])

    pairs.to_csv('data/pairs_by_model_diag_akk_bim_ram.csv',index = False)

    return pairs




Генерация пар с помощью кластеризации и близости названия (описания) и цены

In [84]:
# Fetch the NLTK stop-word corpus and keep the Russian list; process_description
# uses ``stop_words`` to filter description tokens.
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

# NOTE(review): word_tokenizer is not referenced by the functions visible in
# this notebook section (process_description splits on spaces) — confirm it is needed.
word_tokenizer = nltk.WordPunctTokenizer()

def process_data(text):
    """Normalize a short text field (title or brand).

    Lower-cases the text, turns "/" and "+" into spaces, removes the word
    'смартфон', rewrites 'гб' as 'gb' and strips surrounding whitespace.
    """
    cleaned = text.lower()
    # Replacements are applied in this exact order.
    for old, new in (("/", " "), ("+", " "), ('смартфон', ""), ('гб', 'gb')):
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()
    
def process_price(price):
    """Return the part of a price string that precedes the first newline."""
    first_line, _, _ = price.partition("\n")
    return first_line

def process_description(desc):
    """Normalize a product description into a space-joined token string.

    ``None`` yields ``""``. Otherwise the text is lower-cased, 'смартфон' is
    removed, 'гб' becomes 'gb', ", " becomes a space, hyphens and colons
    become spaces, the characters ! ? " are dropped and whitespace runs are
    collapsed. Tokens (split on single spaces) that are found in
    ``string.punctuation`` or in ``stop_words`` are discarded.
    """
    if desc is None:
        return ""

    text = desc.lower().replace('смартфон', "").replace('гб', 'gb').replace(', ', ' ')
    for pattern, substitute in ((r'[\-\:]', " "), (r'[!?"]', ""), (r'\s+', " ")):
        text = re.sub(pattern, substitute, text)

    # TODO: it would also be good to lemmatize the tokens to their normal form.
    kept_tokens = []
    for word in text.split(" "):
        # Skip punctuation tokens and stop words.
        if word in string.punctuation or word in stop_words:
            continue
        kept_tokens.append(word)
    return " ".join(kept_tokens)

def preprocess_columns(df_all):
    """Clean the text columns and drop unusable or duplicate products.

    Steps:
      1. normalize ``title`` and ``brand`` with ``process_data``;
      2. fill missing ``description`` values with "" and normalize them
         with ``process_description``;
      3. add a ``flag`` column: 'nothing' where the cleaned title is empty,
         'full' otherwise;
      4. replace empty titles with the first five words of the description;
      5. drop rows whose title is still empty and rows duplicated on
         (title, brand, price, description).

    Args:
        df_all: DataFrame with ``title``, ``brand``, ``description`` and
            ``price`` columns. Steps 1-4 modify this frame in place.

    Returns:
        The filtered, de-duplicated DataFrame.
    """
    df_all['title'] = df_all['title'].apply(process_data)
    df_all['brand'] = df_all['brand'].apply(process_data)
    df_all['description'] = df_all['description'].fillna("").apply(process_description)

    title_missing = df_all['title'] == ""
    df_all['flag'] = title_missing.map({True: 'nothing', False: 'full'})

    # Assign the masked Series back to the column. Calling
    # ``mask(..., inplace=True)`` on ``df_all['title']`` acts on a temporary
    # object and is not guaranteed to update the frame.
    fallback_title = df_all['description'].apply(lambda text: " ".join(text.split()[:5]))
    df_all['title'] = df_all['title'].mask(title_missing, fallback_title)

    df_all = df_all[df_all['title'] != ""]
    df_all = df_all.drop_duplicates(subset=['title','brand','price','description'])

    return df_all



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
def collect_pairs(product, df, pairs):
    """Append one pair per row of ``df``, each anchored on ``product``.

    Args:
        product: Mapping with ``id`` and ``title`` of the anchor product.
        df: DataFrame of partner products with ``id`` and ``title`` columns.
        pairs: List that is extended in place.

    Returns:
        The same ``pairs`` list, now including the new pair dicts
        (keys ``id_1``, ``title_1``, ``id_2``, ``title_2``).
    """
    pairs.extend(
        {
            'id_1': product['id'],
            'title_1': product['title'],
            'id_2': partner['id'],
            'title_2': partner['title'],
        }
        for _, partner in df.iterrows()
    )
    return pairs

In [86]:
def get_pairs_for_product(product, products, all_pairs):
    """Pair ``product`` with a spread of other products from its cluster.

    Every other product is scored by title similarity (cosine similarity of
    sentence embeddings) and by relative price closeness. After sorting by
    (title_sim, price_sim) descending, partners are taken from: the top 2,
    one random row from the middle 50%, the bottom 2, and 2 random rows.

    Args:
        product: Mapping with ``id``, ``title`` and ``price`` of the anchor.
        products: DataFrame of the cluster (``id``, ``title``, ``price``, ...).
            The anchor row itself is excluded.
        all_pairs: DataFrame of pairs collected so far.

    Returns:
        ``all_pairs`` with the new pairs appended and exact duplicates
        removed. Returned unchanged when the cluster has no other product.
    """
    candidates = products.drop(products[products['id'] == product['id']].index)
    if candidates.empty:
        # Nothing to compare against.
        return all_pairs

    # Encode the titles as sentence embeddings and compare them with the anchor.
    anchor_embedding = model.encode(product['title'])
    candidate_embeddings = model.encode(candidates['title'].tolist())
    similarities = cosine_similarity([anchor_embedding], candidate_embeddings)

    anchor_price = float(product['price'])
    price_sim = []
    for price in candidates['price'].tolist():
        price = float(price)
        largest = max(anchor_price, price)
        if largest == 0:
            # Avoid dividing by zero: two zero prices count as identical.
            price_sim.append(1.0 if anchor_price == price else 0.0)
        else:
            price_sim.append(1 - abs(anchor_price - price) / largest)

    candidates['price_sim'] = price_sim
    candidates['title_sim'] = similarities[0]

    # Sort by title similarity first, then by price similarity.
    candidates = candidates.sort_values(by=['title_sim', 'price_sim'], ascending=False)

    pairs = []
    # Top 2 most similar.
    pairs = collect_pairs(product, candidates.head(2), pairs)

    # One random product from the middle 50% (skipped when that slice is empty).
    start = int(candidates.shape[0] * 0.25)
    end = int(candidates.shape[0] * 0.75)
    middle = candidates[start:end]
    if not middle.empty:
        pairs = collect_pairs(product, middle.sample(n=1), pairs)

    # 2 least similar.
    pairs = collect_pairs(product, candidates.tail(2), pairs)

    # 2 random products (fewer when the cluster is smaller than that).
    random_prod = candidates.sample(n=min(2, candidates.shape[0]))
    pairs = collect_pairs(product, random_prod, pairs)

    all_pairs = pd.concat([all_pairs, pd.DataFrame(pairs)])
    all_pairs = all_pairs.drop_duplicates()
    return all_pairs

In [None]:
import umap
import hdbscan
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Shared sentence-embedding model; the clustering and pair-scoring functions
# read this module-level name when they call model.encode(...).
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

def cluster_products(df_all, random_state=None):
    """Cluster products by the sentence embedding of their titles.

    Titles are encoded with the module-level ``model``, reduced to 5
    dimensions with UMAP (cosine metric) and clustered with HDBSCAN.

    Args:
        df_all: DataFrame with ``id``, ``title``, ``brand`` and ``price``
            columns. A ``cluster_id`` column is also written onto this frame
            (side effect kept for backward compatibility).
        random_state: Optional seed forwarded to UMAP so that runs can be
            reproduced. ``None`` (default) keeps the previous unseeded behavior.

    Returns:
        DataFrame with columns ``id``, ``cluster_id``, ``title``, ``brand``
        and ``price``.
    """
    titles = df_all['title'].tolist()
    sentence_embeddings = model.encode(titles)

    # Dimensionality reduction.
    umap_embeddings = umap.UMAP(n_neighbors=15,
                                n_components=5,
                                metric='cosine',
                                random_state=random_state).fit_transform(sentence_embeddings)

    # Clustering.
    cluster = hdbscan.HDBSCAN(min_cluster_size=15, min_samples=1,
                              metric='euclidean',
                              cluster_selection_method='eom').fit(umap_embeddings)

    df_all['cluster_id'] = cluster.labels_
    docs_df = df_all[['id','cluster_id','title','brand','price']]

    return docs_df

def get_random_products_per_cluster(products, max_attempts=200):
    """Pick a few random, mutually dissimilar anchor products from a cluster.

    The target count is 2, 3, 4 or 5 depending on cluster size (<30, <40,
    <50, otherwise), capped by the number of unique titles. A sampled
    product is rejected when its title is too similar to the previously
    accepted one (cosine similarity >= 0.95 during the first 9 draws,
    >= 0.97 afterwards).

    Args:
        products: DataFrame with ``id``, ``title``, ``brand`` and ``price``.
        max_attempts: Upper bound on the number of draws. Rejected products
            stay in the pool, so without a bound the loop never ends when
            every remaining title is a near-duplicate of the last accepted one.

    Returns:
        List of dicts (``id``, ``title``, ``brand``, ``price``); may be
        shorter than the target when the attempts or the pool are exhausted.
    """
    size = products.shape[0]
    n = 2 if size < 30 else 3 if size < 40 else 4 if size < 50 else 5
    # Never ask for more anchors than there are unique titles, otherwise
    # many near-identical pairs would be produced.
    n = min(n, len(products['title'].unique()))

    random_products = []
    last_embedding = None
    count = 0
    while len(random_products) < n and count < max_attempts and not products.empty:
        count += 1
        sampled = products.sample(n = 1)
        product = {
            field: sampled[field].tolist()[0]
            for field in ('id', 'title', 'brand', 'price')
        }
        embedding = model.encode(product['title'])

        if random_products:
            # Compare with the most recently accepted product only.
            sim = cosine_similarity([last_embedding], [embedding])[0][0]
            # Later draws use a looser threshold so that clusters with almost
            # identical titles can still yield enough products.
            treshold = 0.95 if count < 10 else 0.97
            if sim >= treshold:
                continue

        random_products.append(product)
        last_embedding = embedding
        # Remove the accepted product so it cannot be drawn again.
        products = products.drop(products[products['id'] == product['id']].index)

    return random_products


def get_pairs(df):
    """Cluster the products and generate candidate pairs inside each cluster.

    Clusters with fewer than 6 or more than 150 products are skipped. For
    every remaining cluster a few anchor products are chosen and each is
    paired with other members of the cluster. The result is de-duplicated
    on (id_1, title_1, title_2), written to
    ``data/all_pairs_smartphones2.csv`` and returned.

    Args:
        df: Preprocessed product DataFrame accepted by ``cluster_products``.

    Returns:
        DataFrame with columns ``id_1``, ``title_1``, ``id_2``, ``title_2``.
    """
    docs_df = cluster_products(df)

    # Take the labels from the clustering result itself instead of assuming
    # they are 0..n-1 on the input frame. Label -1 is excluded: the original
    # loop started at 0, so it was never processed.
    cluster_ids = sorted(label for label in docs_df['cluster_id'].unique() if label != -1)
    print(len(cluster_ids))

    all_pairs = pd.DataFrame(columns = ['id_1','title_1','id_2','title_2'])
    for cluster_id in cluster_ids:
        print(f'category {cluster_id}')
        products = docs_df[docs_df['cluster_id'] == cluster_id]
        if products.shape[0] < 6 or products.shape[0] > 150:
            continue

        random_products = get_random_products_per_cluster(products)

        for product in tqdm(random_products):
            all_pairs = get_pairs_for_product(product, products, all_pairs)

    all_pairs = all_pairs.drop_duplicates(subset=['id_1','title_1','title_2'])
    all_pairs.to_csv('data/all_pairs_smartphones2.csv')
    return all_pairs




In [None]:
# Driver cell: reload the raw dataset and run the clustering-based pair generation.
# The commented-out lines below are the earlier, rule-based pipeline stages
# (spec reduction, model/identifier columns and the three pair generators).
# def main():
df_all = pd.read_csv('/content/drive/MyDrive/diploma/data/all_wb_dns_smartphones.csv')
df_all = df_all.drop(columns = ['Unnamed: 0'])
# df_all = reduce_spec(df_all, 'specifications')
# # df_all = pd.read_csv('data/all_wb_dns_smartphones_reduce_spec.csv')
# df_all['model'] = get_models(df_all)
# df_all['identifier'] = get_identifiers(df_all)

# pairs_by_model = generate_pairs_by_model(df_all)

# pairs_by_identifier = generate_pairs_by_identifier(df_all)

# pairs_by_model_bim_ram = generate_pairs_by_model_bim_ram(df_all)

df_all = preprocess_columns(df_all)
# NOTE(review): preprocess_columns already drops duplicates on the same subset,
# so this second call looks redundant — confirm before removing.
df_all = df_all.drop_duplicates(subset=['title','brand','price','description'])
all_pairs = get_pairs(df_all)
