In [1]:
import pandas as pd
import numpy as np
import io
import os
import re
import string
import warnings
import pymorphy2
from nltk.stem.snowball import SnowballStemmer
from pandarallel import pandarallel
from stop_words import get_stop_words
from tqdm import tqdm_notebook as tqdm
from tqdm.auto import tqdm as progress
from matplotlib import pyplot as plt
import psutil

Model parameters:

In [2]:
token_min = 2
rem_warnings = True

Resources management:

In [3]:
memory_usage = 0.1
kernels_usage = 0.1

Rules:

In [4]:
remove_memory = True
rem_brack_content = True 
rem_repeats = True 
seperate_digits = True 
remove_vendors = True

Supress warning:

In [5]:
if rem_warnings is True:
    warnings.filterwarnings('ignore')

Pandas progress:

In [6]:
progress.pandas()

Remove brackets content:

In [7]:
def remove_brackets(item):
    
    result = re.search(r'\([^)]*\)', item)
    p_item = item
    
    if result and re.search(',', str(result.group())):
        p_item = re.sub(r'\([^)]*\)', '', item)
        
    return p_item  

Remove vendor:

In [8]:
def remove_vendor(x):
    result = re.sub(r'(\b){}(\b)'.format(df[df['model_m_clean'] == x]['ven_m'].values[0]), '', x)
    return result 

Remove repeating tokens:

In [9]:
def remove_tokens(x):
    
    text_list = list(x.split())
    cleaned_list = []
    
    for item in text_list:
        if len(item) >= token_min and item not in cleaned_list:
            cleaned_list.append(item)   
        if len(item) < token_min:
            cleaned_list.append(item)

    return(' '.join(cleaned_list))

Read directory:

In [10]:
list_of_files = os.listdir('data/top_ym/')
files_num = len(list_of_files)

Select required attributes:

In [11]:
market_features = ['Категория', 'Производитель', 'Название модели', 'url']

Compress files to the single frame:

In [12]:
df = pd.DataFrame()

for item in tqdm(list_of_files):
    df_file = pd.read_csv(('data/top_ym/' + str(item)), delimiter=';', encoding='windows-1251', usecols=market_features)
    df = pd.concat([df, df_file], axis=0)

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




Cut url to integer id:

In [13]:
df['id'] = df['url'].apply(lambda x: re.sub('\D+', '', x)).astype('int64')

Write to csv file:

In [14]:
df.to_csv('data/market_dirty.csv', index=False)

Rename columns:

In [15]:
df = df.rename(columns={'Категория': 'class_m', 'Производитель': 'ven_m', 'Название модели': 'model_m'})

Remove records with missed values:

In [16]:
df.dropna(how='any', axis=0, inplace=True)

Drop duplicates by model name: 

In [17]:
df.drop_duplicates(inplace=True, keep='first', subset=['model_m'])

Create model_m_clean attribute:

In [18]:
df['model_m_clean'] = df['model_m']

If contents of brackets of model name contain enumerations, contents is removed:

In [19]:
if rem_brack_content is True:
    df['model_m_clean'] = df['model_m_clean'].apply(remove_brackets)  

Remove punctuation from the model name:

In [20]:
df['model_m_clean'] = df['model_m_clean'].str.replace('[{}]'.format(string.punctuation), ' ')    

Remove multiple spaces and reduce to lowercase:

In [21]:
df[['class_m', 'ven_m', 'model_m_clean']] = df[['class_m', 'ven_m', 'model_m_clean']].applymap(lambda x: re.sub(' +', ' ', x))
df[['class_m', 'ven_m', 'model_m_clean']] = df[['class_m', 'ven_m', 'model_m_clean']].applymap(lambda x: x.strip().lower())

Drop duplicates:

In [22]:
df.drop_duplicates(inplace=True, keep='first', subset=['ven_m', 'model_m_clean', 'class_m', 'id'])
df.drop_duplicates(inplace=True, keep='first', subset=['model_m_clean'])

Remove the memory size 'digit + GB' - do not influence tac:

In [23]:
if remove_memory is True:
    df['model_m_clean'] = df['model_m_clean'].apply(lambda x: re.sub(r'\d+gb', '', x))
    df['model_m_clean'] = df['model_m_clean'].apply(lambda x: re.sub(r'\d+гб', '', x))

Convert types to string:

In [24]:
df = df.astype({'class_m': 'str', 'ven_m': 'str', 'model_m': 'str', 'model_m_clean': 'str', 'id': 'int64', 'url': 'str'})

Separate numbers from non-numbers:

In [25]:
df['model_m_clean'] = df['model_m_clean'].apply(lambda x: re.sub(r'([A-Za-zА-Яа-я])(\d)', r'\1 \2', x))
df['model_m_clean'] = df['model_m_clean'].apply(lambda x: re.sub(r'(\d)([A-Za-zА-Яа-я])', r'\1 \2', x))

Remove vendors from model names:

In [26]:
workers = max(round(kernels_usage*psutil.cpu_count()), 1)
memory = max(round(memory_usage*psutil.virtual_memory().available/(1024**2)), 2048)
pandarallel.initialize(progress_bar=True, verbose=0, shm_size_mb=memory, nb_workers=workers)

if remove_vendors is True:
    df['model_m_clean'] = df['model_m_clean'].parallel_apply(remove_vendor)      



VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7091), Label(value='0 / 7091'))), …

Remove repeating tokens:

In [27]:
if rem_repeats is True: 
    df['model_m'] = df['model_m'].progress_apply(remove_tokens)

HBox(children=(IntProgress(value=0, max=42545), HTML(value='')))




Compile list of Russian stop words with length cut-off:

In [28]:
stop_words_ru = [item for item in get_stop_words('russian') if len(item) > token_min]

Remove stop words in model name:

In [29]:
for stop in tqdm(stop_words_ru):
    remove_stops = (lambda x: ' '.join([item for item in x.split() if item != stop]))
    df['model_m_clean'] = df['model_m_clean'].apply(remove_stops)

HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




Create vector of keywords based on classes:

In [30]:
keys = list(" ".join(list(set(df['class_m']))).split())
keys = [x for x in keys if len(x) > token_min]

Call functions for cleaning:

In [31]:
morph = pymorphy2.MorphAnalyzer()
stemmer = SnowballStemmer("russian")

Extract keywords paradigms:

In [32]:
keys = [morph.parse(item)[0].normal_form for item in keys]
keys = [stemmer.stem(item) for item in keys]

Leave words in model names that do not contain paradigms:

In [33]:
for key in tqdm(keys): 
    remove_keys = (lambda x: ' '.join([item for item in x.split() if (key not in item)]))
    df['model_m_clean'] = df['model_m_clean'].apply(remove_keys)

HBox(children=(IntProgress(value=0, max=143), HTML(value='')))




Remove remaining tokens that are not present in Russian Corpora:

In [34]:
morph = pymorphy2.MorphAnalyzer()
remove_nones = (lambda x: ' '.join([item for item in x.split() if morph.parse(item)[0].tag.POS is None]))
df['model_m_clean'] = df['model_m_clean'].progress_apply(remove_nones)

HBox(children=(IntProgress(value=0, max=42545), HTML(value='')))




Remove entries with missing values:

In [35]:
df = df.apply(lambda x: x.str.strip() if isinstance(x, str) else x).replace('', np.nan)
df = df.dropna(how='any', axis=0)

Drop duplicates:

In [36]:
df.drop_duplicates(inplace=True, keep='first', subset=['ven_m', 'model_m_clean', 'class_m', 'id'])
df.drop_duplicates(inplace=True, keep='first', subset=['model_m_clean'])

Change order of atributes:

In [37]:
df = df[['class_m', 'ven_m', 'model_m', 'model_m_clean', 'id', 'url']]

Rename columns:

In [38]:
column_voc = {'model_m': 'model_name_market', 'model_m_clean': 'model_name_market_clean', 'ven_m': 'vendor_market', \
              'class_m': 'category_market', 'id': 'model_id_market', 'url': 'model_url_market'}
df = df.rename(columns=column_voc)

Show header:

In [39]:
df.head()

Unnamed: 0,category_market,vendor_market,model_name_market,model_name_market_clean,model_id_market,model_url_market
0,автомагнитолы,pioneer,FH-X360UB,fh x 360 ub,10544669,https://market.yandex.ru/product/10544669
1,автомагнитолы,pioneer,AVH-X1600DVD,avh x 1600,10567251,https://market.yandex.ru/product/10567251
2,автомагнитолы,sony,DSX-A35U,dsx a 35 u,10577269,https://market.yandex.ru/product/10577269
3,автомагнитолы,pioneer,MVH-150UB,mvh 150 ub,8475132,https://market.yandex.ru/product/8475132
4,автомагнитолы,pioneer,AVH-3500DVD,avh 3500,10389517,https://market.yandex.ru/product/10389517


Show random sample:

In [40]:
df.sample(n=5)

Unnamed: 0,category_market,vendor_market,model_name_market,model_name_market_clean,model_id_market,model_url_market
8400,стационарные медиаплееры,iconbit,XDS1003DW,xds 1003 dw,10757578,https://market.yandex.ru/product/10757578
5689,фотоаппараты,samsung,WB35F,wb 35 f,10667058,https://market.yandex.ru/product/10667058
6364,сумки и рюкзаки,port designs,Avoriaz,avoriaz,2557994,https://market.yandex.ru/product/2557994
3810,ноутбуки,hp,PAVILION 17-f159nr (Core i7 4510U 2000 Mhz/17....,pavilion 17 f 159 nr core i 7 4510 u 2000 mhz ...,11161356,https://market.yandex.ru/product/11161356
845,магнитолы,sony,Sony ZS-RS70BTB,zs rs 70 btb,10747463,https://market.yandex.ru/product/10747463


Load to csv:

In [41]:
df.to_csv('data/market_clean.csv', index=False)