In [4]:
import pandas as pd
import numpy as np
import io
import os
import re
import string
import pymorphy2
from collections import OrderedDict
from nltk.stem.snowball import SnowballStemmer
from stop_words import get_stop_words
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt

Parameters:

In [5]:
min_word_length = 2

Open directory:

In [6]:
list_of_files = os.listdir('data/top_ym/')
files_num = len(list_of_files)

Choose proper features:

In [7]:
market_features = ['Категория', 'Производитель', 'Название модели']

Read files with proper attributes to the frame:

In [8]:
df = pd.DataFrame()

for item in tqdm(list_of_files):
    df_file = pd.read_csv(('data/top_ym/' + str(item)), delimiter=';', encoding='windows-1251', usecols=market_features)
    df = pd.concat([df, df_file], axis=0)

HBox(children=(IntProgress(value=0, max=313), HTML(value='')))




Write frame to csv:

In [9]:
df.to_csv('data/market_dirty.csv', index=False)

Rename features:

In [10]:
df = df.rename(columns={'Категория': 'class_m', 'Производитель': 'ven_m', 'Название модели': 'model_m'})

Number of records before cleaning:

In [11]:
dim_before = int(df.shape[0])
print('Number of records:', int(df.shape[0]))

Number of records: 3421187


Distribution of model length - number of words:

In [12]:
print('Average number of words in model:', round(df['model_m'].str.split().apply(len).mean()))
print('Std number of words in model:', round(df['model_m'].str.split().apply(len).std()))

Average number of words in model: 3
Std number of words in model: 3


Show nulls for all features - absolute values:

In [13]:
print('Nulls for all features - absolute values:', dict(round(df.isnull().sum()/len(df), 2)))

Nulls for all features - absolute values: {'class_m': 0.0, 'ven_m': 0.0, 'model_m': 0.0}


Show nulls for all features - relative values:

In [14]:
print('Nulls for all features - relative values:', dict(df.isnull().sum()))

Nulls for all features - relative values: {'class_m': 0, 'ven_m': 0, 'model_m': 0}


Collect records with missed records:

In [15]:
df.dropna(how='any', axis=0, inplace=True)

Removing punctuation from string attributes:

In [16]:
for item in tqdm(list(string.punctuation)):
    df[['class_m', 'model_m']] = df[['class_m', 'model_m']].applymap(lambda x: x.replace(item, ' '))

HBox(children=(IntProgress(value=0, max=32), HTML(value='')))




Remove multiple spaces and reduce to lowercase:

In [17]:
df = df.applymap(lambda x: re.sub(' +', ' ', x) if isinstance(x, str) else x)
df = df.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)

Duplicate reset by category-manufacturer-model:

In [18]:
df.drop_duplicates(inplace=True, keep='first', subset=['class_m', 'ven_m', 'model_m'])

Type conversion to string:

In [19]:
df = df.astype({'class_m': 'str', 'ven_m': 'str', 'model_m': 'str'})

Print the number of zeros:

In [20]:
print('Number of nulls for all features:', dict(df.isnull().sum()))

Number of nulls for all features: {'class_m': 0, 'ven_m': 0, 'model_m': 0}


Delete duplicate words in the model name:

In [21]:
df['model_m'] = df['model_m'].apply(lambda x: ' '.join(OrderedDict.fromkeys(x.split())))

We remove manufacturers from model names:

In [22]:
for item in tqdm(list(set(df['ven_m']))):
    df['model_m'] = df['model_m'].apply(lambda x: x.replace(item, ''))

HBox(children=(IntProgress(value=0, max=1649), HTML(value='')))




Compile a list of stop words in the Russian dictionary with a cut-off in length:

In [23]:
stop_words_ru = [item for item in get_stop_words('russian') if len(item) > min_word_length]

Delete the word stop in the model name:

In [24]:
for stop in tqdm(stop_words_ru):
    remove_stops = (lambda x: ' '.join([item for item in x.split() if item != stop]))
    df['model_m'] = df['model_m'].apply(remove_stops)

HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




Create a vector of keywords for category names:

In [25]:
keys = list(" ".join(list(set(df['class_m']))).split())
keys = [x for x in keys if len(x) > min_word_length]

We call classes for cleaning and stemming:

In [26]:
morph = pymorphy2.MorphAnalyzer()
stemmer = SnowballStemmer("russian")

We highlight the paradigms of keywords:

In [27]:
keys = [morph.parse(item)[0].normal_form for item in keys]
keys = [stemmer.stem(item) for item in keys]

We leave words in the model names that do not contain a paradigm:

In [28]:
for key in tqdm(keys): 
    remove_keys = (lambda x: ' '.join([item for item in x.split() if (key not in item)]))
    df['model_m'] = df['model_m'].apply(remove_keys)

HBox(children=(IntProgress(value=0, max=150), HTML(value='')))




We leave only words that are not mentioned in the Russian dictionary:

In [29]:
remove_not_nouns = (lambda x: ' '.join([item for item in x.split() if morph.parse(item)[0].tag.POS is None]))
df['model_m'] = df['model_m'].apply(remove_not_nouns)

We delete the characteristic of the memory size 'number + GB': it does not affect the tac number:

In [30]:
df['model_m'] = df['model_m'].apply(lambda x: re.sub(r'\d+gb', '', x))

Separate numbers from non-numbers:

In [31]:
df['model_m'] = df['model_m'].apply(lambda x: re.sub(r'(\D)(\d)', r'\1 \2', x))
df['model_m'] = df['model_m'].apply(lambda x: re.sub(r'(\d)(\D)', r'\1 \2', x))

Delete entries with missing values:

In [32]:
df = df.apply(lambda x: x.str.strip() if isinstance(x, str) else x).replace('', np.nan)
df = df.dropna(how='any', axis=0)

Duplicate reset:

In [33]:
df.drop_duplicates(inplace=True, keep='first', subset=['class_m', 'ven_m', 'model_m'])

Statistics on the distribution of the number of words in the model name:

In [34]:
print('Average number of words in model:', round(df['model_m'].str.split().apply(len).mean()))
print('Std number of words in model:', round(df['model_m'].str.split().apply(len).std()))

Average number of words in model: 5
Std number of words in model: 5


Number of records:

In [35]:
dim_after = int(df.shape[0])
print('Number of records:', int(df.shape[0]))

Number of records: 37003


Compression by the number of records:

In [36]:
print('Compression by the number of records:', round(dim_before/dim_after))

Compression by the number of records: 92


Frame Description:

In [37]:
df.describe()[:2][:]

Unnamed: 0,class_m,ven_m,model_m
count,37003,37003,37003
unique,76,1642,36300


Output a random sample:

In [38]:
df.sample(n=10)

Unnamed: 0,class_m,ven_m,model_m
5385,рули джойстики геймпады,sony,shock 4
6564,сканеры,canon,l 24 scanner
7869,кулеры и системы охлаждения,cooler master,a 116 dp 6 9 gdsc 0 l
3983,ноутбуки,dell,inspiron 5748 core i 5 4210 u 1700 mhz ...
8020,модули памяти,hyperx,hx 426 c 16 fb 2 k 2 16
139,автомагнитолы,mydean,7139
2053,диктофоны,olympus,vn 731 pc
6009,источники бесперебойного питания,cyberpower,ибп value 600 ei
6649,компьютерная акустика,sven,ms 110
8335,комплекты акустики,pioneer,htb 423 2 b


Write csv to the directory:

In [39]:
df.to_csv('data/market_clean.csv', index=False)