In [5]:
import pandas as pd

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
import re

In [7]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

#### Загружаем тренировочные данные

In [8]:
train = pd.read_parquet('data_fusion_train.parquet')

In [308]:
train.head()

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
1,11,6,20:34,"Молоко 3,2%,шт",2.0,8,2,78,
3,39,4,11:28,"Компот из изюма, 114 ккал",1.0,4,1,71,
4,39,4,11:28,"Макаронные изделия отварные (масло сливочное),...",1.0,4,1,71,
17,56,5,11:42,Кофе Капучино Большой Эден 18,1.0,12,1,70,
40,105,3,01:53,Хлеб на СЫВОРОТКЕ 350г,1.0,7,-1,84,


- **receipt_id** —  id чека;
- **receipt_dayofweek** — день недели;
- **receipt_time** — время создания чека;
- **item_name** — наименование товара;
- **item_quantity** — количество товара;
- **item_price** — цена товара;
- **item_nds_rate** — ставка НДС;
- **category_id** — категория товара. 

**Найдем 5 наиболее частых item_name**

In [395]:
train['item_name'].value_counts().head()

Пакет                  1266157
ЗЕЛПМ-КА32Х62Х17         59320
Доставка                 55346
Станд Картофель фри      19634
БАНАНЫ КНОПКА 18         16736
Name: item_name, dtype: int64

**Удалим данные без категории (-1)**

In [9]:
train = train[train.category_id != -1]

In [383]:
train[(train['item_name'] == 'Пакет')]['category_id'].unique()

array([203,  78,  79,  40, 103,  80, 139, 204,  84, 143,  85, 133,  30,
        77,  76,  73,  70, 114,  83, 164,  81,  11,  75, 140, 130, 150,
        38,  82, 145, 121, 117,  74, 167], dtype=int64)

**Товар "Пакет" самый популярный и попал в разные категории. Его посмотрим позже отдельно.**

In [385]:
train[(train['item_name'] == 'ЗЕЛПМ-КА32Х62Х17')]['category_id'].unique()

array([203], dtype=int64)

In [386]:
train[(train['item_name'] == 'Доставка')]['category_id'].unique()

array([204], dtype=int64)

In [396]:
train[(train['item_name'] == 'Станд Картофель фри')]['category_id'].unique()

array([69], dtype=int64)

In [397]:
train[(train['item_name'] == 'БАНАНЫ КНОПКА 18')]['category_id'].unique()

array([80], dtype=int64)

**Остальные товары из ТОП-5 в одной категории**

In [10]:
train = train[train.category_id != -1].drop_duplicates('item_name').reset_index(drop=True)

In [334]:
train = train.drop('brands', axis=1)

In [297]:
train.shape

(26094077, 8)

In [8]:
train[train['receipt_id'].duplicated()].head()

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id
2,39,4,11:28,"Макаронные изделия отварные (масло сливочное),...",1.0,4,1,71
7,129,3,15:17,Станд Картофель фри,2.0,8,6,69
8,129,3,15:17,Хот-дог Куриный СБ,1.0,9,2,69
9,129,3,15:17,Чизбургер с луком СБ,1.0,9,2,68
11,131,1,08:21,"Молоко пастерилиз. т/рекс 2,5 % 1 л. 1/8 БМК",1.0,8,2,78


#### В одном чеке несколько товаров

In [317]:
print(train['category_id'].unique())
train['category_id'].unique().shape

[ 78  71  70  84  69  68  40 203  79   7 117  80  83  53  73  81   0  12
  85  77 145   2 204 139  38  74 130  75  49  45 133  76  82  19  43  51
  61 177 118  92  36  30 167  66  52 107  37   3  72  62  50 120  42 150
  57   6 140 101 163  20 103   4  31  67  27  29 114 102 115  26  35  39
  13   9 128  60  41 138  11 100  24  96 109 106  56 143 105  90   1 108
 164  55 111  58  54  97  46 121]


(98,)

In [330]:
print(train['category_id'].unique())
train['category_id'].unique().shape

[ 78  71  70  84  69  68  40 203  79   7 117  80  83  53  73  81   0  12
  85  77 145   2 204 139  38  74 130  75  49  45 133  76  82  19  43  51
  61 177 118  92  36  30 167  66  52 107  37   3  72  62  50 120  42 150
  57   6 140 101 163  20   4  31  67  27  29 114 102 115  26  35  39 103
  13   9 128  60  41 138  11 100  24  96 109 106  56 105  90   1 108 164
  55 111  58  54  97  46]


(96,)

In [300]:
train.isnull().sum()

receipt_id           0
receipt_dayofweek    0
receipt_time         0
item_name            0
item_quantity        0
item_price           0
item_nds_rate        0
category_id          0
dtype: int64

In [318]:
train.duplicated().sum()

34917

In [313]:
gr_train = train[train['item_name'].duplicated()].groupby('item_name')

In [321]:
gr_train['category_id'].unique()

item_name
                                                    [71, 84, 204, 74]
                                КАРТОФЕЛЬ ВЕС                    [80]
                             МИСКА                              [130]
                             НОСКИ                               [61]
                            БАНАНЫ кг                            [80]
                                                          ...        
яйцо куриное С1 10 шт бокс ИП Мусиенко                           [78]
яйцо под майонезом 90г  1 порц.                                  [71]
якобс монарх 1.8gr                                               [85]
ярина плюс таб п/об пленочной 3мг+0.03мг+0.451мг                 [38]
№1 с люля-кебаб (фри) 1  порц                                    [71]
Name: category_id, Length: 40093, dtype: object

In [11]:
train.groupby('category_id')['category_id'].count()

category_id
0      2356
1        28
2       316
3       109
4       225
       ... 
164      74
167      91
177     119
203      59
204    1146
Name: category_id, Length: 96, dtype: int64

In [12]:
train.groupby('category_id')['category_id'].count().max()

7094

In [13]:
train.groupby('category_id')['category_id'].count().min()

13

In [82]:
train[20:30]

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id
20,491,1,17:36,Вафли с топленым молоком вес. 1кг Тортугалия,0.354,11,1,84
21,558,1,13:55,Сметана Кубанский Молочник 20% 180гр шт,1.0,7,6,78
22,607,6,10:35,Баклажаны 1кг,0.173,8,2,80
23,615,3,20:35,НЕКТАР МУЛЬТИФРУК КД,1.0,4,2,83
24,615,3,20:35,СМЕТАНА 20% 300Г,1.0,8,2,78
25,629,3,13:18,Брюки трик. женские,1.0,14,1,53
26,677,1,09:39,РОЖОК ЛАК.КОЛИБР100Г,1.0,6,2,73
27,677,1,09:39,ЧИПСЫ КАРТОФЕЛЬНЫЕ,0.087,13,2,81
28,699,4,20:20,КОКА-КОЛА СТД. СКОМБО,2.0,8,1,83
29,706,0,00:32,Кф.Золотой Степ 50г с орехом,3.0,5,1,84


In [113]:
train[train['category_id'] == 85].head(10)

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
43,940,1,09:02,200Г ЧАЙ ГРИНФИЛД ИНГЛИШ ЭДИШН,1.0,12,1,85,гринфилд
171,3240,2,16:28,Кофе растворимый Маккофе Original 3в1 20г,2.0,4,1,85,
241,4531,6,13:10,Каркаде,2.0,6,-1,85,
338,6618,3,12:40,"Нап.коф. Cappuccino DI TORINO 5шт*25,5г",1.0,9,1,85,
372,6939,5,23:04,ЧАЙ ЧЕР. ЭРЛ ГРЕЙ БОЛ,1.0,9,1,85,
433,8190,6,09:29,ЦИКОРИЙ НАТ.СУБЛ.85Г,1.0,13,1,85,
534,10468,0,13:57,"Чай черн. Принцесса ГИТА 1,2гХ100пак",1.0,9,-1,85,принцесса гита
604,12412,6,10:52,"1,8Г*25 ЧАЙ ГРИНФ. ПИЧ МЭЛЛОУ",1.0,9,1,85,
676,14031,2,15:33,Кофе NESCAFE CLASSIK Стекло 47.5г,1.0,8,-1,85,nescafe
760,16294,0,09:15,Чай,3.0,6,6,85,


In [177]:
train[train['category_id'] == 6].head(20)

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
777,16835,0,15:30,КП/СТЕКЛООЧИСТИТЕЛЬ,1.0,10,1,6,
1141,25540,1,20:34,ЩЕТКА СТЕКЛООЧИС BOSCH ECO 60,1.0,13,1,6,eco
2160,55113,3,11:50,Держатель для тел в салон авто ИМП,1.0,12,1,6,
3765,109746,1,14:39,КД/ЗИМ. СТЕКЛООЧ.-30,1.0,12,1,6,
4503,141325,3,19:49,"Скребок 52,5 см для очистки от снега ИМП",1.0,9,1,6,
5594,193154,3,16:49,Щетка стеклоочистителя ИМП,2.0,10,1,6,
6066,216579,2,13:03,Коврик антискользящ.д/авто ИМП,1.0,8,1,6,
13319,859665,0,17:25,ЩЕТКА СТ/ОЧИСТИТЕЛЯ BOSCH C50 50СМ,1.0,13,1,6,bosch
15068,1146037,6,14:45,ШТЕКЕР В ПРИКУРИВАТЕЛЬ ( ПРЕДОХРАНИТЕЛЬ) ПРОВО...,1.0,10,1,6,
16155,1360372,0,16:24,РАМКА ПОД НОМЕР КАМУФЛЯЖ ОСКОЛКИ,1.0,10,1,6,


**Проанализаруем товар с названием "Пакет"**

In [6]:
df_paket = train[(train['item_name'] == 'Пакет')]

In [7]:
df_paket.shape

(1266157, 9)

In [8]:
df_paket['category_id'].unique()

array([203,  78,  79,  40, 103,  80, 139, 204,  84, 143,  85, 133,  30,
        77,  76,  73,  70, 114,  83, 164,  81,  11,  75, 140, 130, 150,
        38,  82, 145, 121, 117,  74, 167], dtype=int64)

In [9]:
df_paket.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1266157 entries, 71 to 45729277
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   receipt_id         1266157 non-null  int64  
 1   receipt_dayofweek  1266157 non-null  int32  
 2   receipt_time       1266157 non-null  object 
 3   item_name          1266157 non-null  object 
 4   item_quantity      1266157 non-null  float64
 5   item_price         1266157 non-null  int64  
 6   item_nds_rate      1266157 non-null  int32  
 7   category_id        1266157 non-null  int64  
 8   brands             0 non-null        object 
dtypes: float64(1), int32(2), int64(3), object(3)
memory usage: 86.9+ MB


In [10]:
df_paket.corr()

Unnamed: 0,receipt_id,receipt_dayofweek,item_quantity,item_price,item_nds_rate,category_id
receipt_id,1.0,0.000589,-0.001175,-0.000277,0.001594,0.000965
receipt_dayofweek,0.000589,1.0,0.006355,0.020689,-0.00337,-0.004242
item_quantity,-0.001175,0.006355,1.0,0.028493,-0.014266,-0.02152
item_price,-0.000277,0.020689,0.028493,1.0,-0.011571,-0.550791
item_nds_rate,0.001594,-0.00337,-0.014266,-0.011571,1.0,-0.011448
category_id,0.000965,-0.004242,-0.02152,-0.550791,-0.011448,1.0


**Возьмем только признаки item_price, item_quantity и item_nds_rate**

In [11]:
df_paket['item_price'].describe()

count    1.266157e+06
mean     2.620295e+00
std      1.527792e+00
min      0.000000e+00
25%      2.000000e+00
50%      3.000000e+00
75%      3.000000e+00
max      2.900000e+01
Name: item_price, dtype: float64

In [12]:
df_paket['item_nds_rate'].describe()

count    1.266157e+06
mean     1.535099e+00
std      1.832467e+00
min     -1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      6.000000e+00
Name: item_nds_rate, dtype: float64

In [13]:
df_paket['item_quantity'].describe()

count    1.266157e+06
mean     1.267545e+00
std      3.587968e+00
min      1.000000e-03
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      1.998000e+03
Name: item_quantity, dtype: float64

In [14]:
df_paket['category_id'].value_counts().head()

203    1241807
79        5346
78        3738
80        3670
84        3399
Name: category_id, dtype: int64

**Чаще всего товар "Пакет" относится к 203-й категории**

In [15]:
df_paket.duplicated().sum()

24871

In [16]:
df_paket = df_paket.drop_duplicates()

In [17]:
df_paket.duplicated().sum()

0

In [18]:
df_paket.shape

(1241286, 9)

In [19]:
X_paket = df_paket[['item_price', 'item_quantity']].to_numpy()
y_paket = df_paket['category_id'].to_numpy()

In [20]:
from sklearn.svm import LinearSVC

In [21]:
from sklearn.metrics import f1_score, make_scorer
import numpy as np

def f1_weighted(y, p):
    resulted_f1 = []
    for c in y.unique():
        f1 = f1_score(y == c, p == c)
        resulted_f1.append(f1 * class_weights[c])
        
    return np.sum(resulted_f1) 

f1_weighted_sc = make_scorer(f1_weighted)

In [None]:
clf = LinearSVC()
scores = cross_val_score(clf, X_paket, y_paket, cv=3, scoring=f1_weighted_sc)



In [None]:
np.mean(scores), np.std(scores)

**Нормализуем данные**

In [None]:
mean = df_paket['item_price'].mean()
std  = df_paket['item_price'].std()

In [427]:
df_paket['item_price'] = (df_paket['item_price'] - df_paket['item_price'].mean()) / df_paket['item_price'].std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [369]:
train[train['receipt_id'] == 42361]

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id
19557,42361,2,10:40,Лук репчатый нефасованный 1кг,1.131,5,2,80
19559,42361,2,10:40,Пакет,1.0,10,2,78
19560,42361,2,10:40,"Хлебушек ржаной ""Бабулин"" 300г",1.0,7,2,84


In [370]:
df_pivot = df_paket.pivot_table(index=['category_id', 'item_price'])

In [371]:
df_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,item_nds_rate,item_quantity,receipt_dayofweek,receipt_id
category_id,item_price,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11,3,6.0,2.250000,1.500000,4.102204e+06
11,4,6.0,23.333333,2.666667,6.303433e+06
30,7,6.0,1.000000,3.666667,3.610264e+06
38,5,6.0,3.000000,5.000000,9.728760e+05
38,9,2.5,1.000000,3.000000,3.516204e+06
...,...,...,...,...,...
204,3,6.0,1.000000,3.344828,5.080993e+06
204,6,-1.0,1.000000,2.500000,3.982236e+06
204,12,6.0,1.000000,3.000000,5.524249e+06
204,14,-1.0,1.000000,3.333333,4.542218e+06


In [20]:
train[(train['item_name'].str.contains('колодки', case=False)) & (train['category_id'] != 4)].head(10)

Unnamed: 0,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
46851,4839232,1,15:22,Колодки,1.0,17,6,115,


In [17]:
# Список для замены слов/символов с целью нормализации текста. 1-е значение - что меняем, 2-е - на что меняем
replace_words = [[',', '.']]

In [18]:
for word in replace_words:
    train['item_name'] = train['item_name'].str.replace(word[0], word[1], case = False)

In [None]:
stop = stopwords.words('russian')
tfidf = TfidfVectorizer(stop_words=stop, min_df=5)
tfidf.fit_transform(train.item_name)

In [None]:
X_words = list(tfidf.vocabulary_.keys())

In [None]:
# Функция преобразования текста в список слов, полученных способом вычисления расстояния Левенштейна
def new_text_lev(text):
    result = ""
    for word in identity_tokenizer(text.lower()):
        if word != "":
            new_word = process.extractOne(word, X_words)
            if new_word[1] > 90: # Задаем порог для включения слова
                result = result + " " + new_word[0]
    return result

In [None]:
train['new_item_name'] = train['item_name'].apply(new_text_lev)

In [None]:
import pickle

In [None]:
with open("train_ext", "wb") as f:
    pickle.dump(train, f)

In [None]:
# Что означает *?

In [None]:
tr = tfidf.transform(['пиво'])

In [None]:
tfidf.vocabulary_['пиво']

In [None]:
print(tr)

In [None]:
# Создадим функцию, которая выберает все покупки одной категории и возвращает вектор слов
# Не считать разделителями такие символы как (',', '/', )

In [None]:
# Функция получает номер категории и возвращает словарь слов, которые попадали в эту категрию
def get_words_cat(cat):
    df_cat = train[train['category_id'] == cat]
    tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, min_df=5)
    tfidf.fit_transform(df_cat.item_name)
    return tfidf.vocabulary_

In [None]:
# Функция получает номер категории и возвращает словарь слов из других категорий кроме этой
def get_words_nocat(cat):
    df_cat = train[train['category_id'] != cat]
    tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, min_df=5)
    tfidf.fit_transform(df_cat.item_name)
    return tfidf.vocabulary_

In [None]:
def dict_compare(d1, d2):
    d1_keys = set(d1.keys())
    d2_keys = set(d2.keys())
    uniq_key = d1_keys - d2_keys
    return uniq_key

In [None]:
# сравниваем словари и ищем совпадения. Интересуют совпадающие сокращения
words = get_words_cat(0)
nowords = get_words_nocat(0)

In [None]:
uniq_keys = dict_compare(words, nowords)

In [None]:
len(uniq_keys)

In [None]:
for pattern in uniq_keys:
    for cur_key in words.keys():
        if (re.match(pattern, cur_key)) and (pattern != cur_key):
            print(pattern, cur_key)   

In [None]:
# Список для замены слов в тексте. Индекс 0 - что заменить, 1 - на что заменить
replace_words_cat2 = [['com ', 'compakt'], ['папирос ', 'папиросы '], ['сиг. ', 'сигареты'], ['сигареты', 'сигареты '],
    ['durex', 'Дюрекс'], ['CONTEX', 'Контекс'], ['Classic', 'КЛАССИК'], ['пурпур ', 'пурпурный '], [',', '.'],
    ['сигар ', 'сигареты '], ['сиг-ты ', 'сигареты '], ['0,47 ', '0,47л '], ['нефильтр. ', 'нефильтрованное '], ['коньяке ', 'коньяк '],
    ['игр. ', 'игристое '], ['игр ', 'игристое '], ['п/сл. ', 'полусладкое '], ['п/сл ', 'полусладкое '], ['п/слад ', 'полусладкое '],
    ['крас ', 'красное '], ['фильт ', 'фильтрованное '], ['светл. ', 'светлое '], ['светл ', 'светлое ']]

In [None]:
import nltk
nltk.download('stopwords')

In [6]:
tfidf = TfidfVectorizer(min_df=5)
X_train = tfidf.fit_transform(train.item_name)

In [20]:
len(tfidf.vocabulary_)

6474

In [None]:
clf = LogisticRegression(max_iter=400)
cross_val_score(clf, X_train, train.category_id, cv=3, scoring='f1_weighted')

In [None]:
X_train.shape

min_df=5 удучшило качество модели по сравнению с max_features=1000

In [None]:
tfidf = TfidfVectorizer(min_df=5, tokenizer=identity_tokenizer)
X_train = tfidf.fit_transform(train.item_name)

In [None]:
clf = LogisticRegression(max_iter=400)
cross_val_score(clf, X_train, train.category_id, cv=3, scoring='f1_weighted')

tokenizer=identity_tokenizer ухудшило модель

In [None]:
X_train.shape

In [None]:
import re

In [None]:
len(tfidf.vocabulary_)

In [None]:
#Проверим fuzzy на наших данных
train_fuzzy = train[:-5]['item_name'].tolist()

In [None]:
len(train_fuzzy)

In [None]:
test_fuzzy = train[-3:-2]['item_name'].tolist()[0]
print(test_fuzzy)
new_sent = []
for word in identity_tokenizer(test_fuzzy.lower()):
    if word in tfidf.vocabulary_:
        new_sent.append(word)
    else:
        res = process.extractOne(word, X_words)
        if res[1] > 90:
            new_sent.append(res[0])
print(new_sent)

In [None]:
process.extract(test_fuzzy, train_fuzzy, limit=10)

In [None]:
print(test_fuzzy)
identity_tokenizer(test_fuzzy.lower())

In [None]:
from Levenshtein import editops

In [None]:
!pip install python-Levenshtein 

In [None]:
clf.fit(X_train, train.category_id)

In [None]:
import pickle
pickle.dump(tfidf, open('tfidf', 'wb'))
pickle.dump(clf, open('clf_task1', 'wb'))

In [None]:
tfidf = TfidfVectorizer(min_df = 3)

In [None]:
tfidf.fit(train.item_name)

In [None]:
len(tfidf.vocabulary_)

In [None]:
X_words = list(tfidf.vocabulary_.keys())

In [None]:
# Функция преобразования текста в список слов, полученных способом вычисления расстояния Левенштейна
def new_text_lev(text):
    result = ""
    for word in identity_tokenizer(text.lower()):
        #word = translit(word, 'ru')
        if word in X_words:
            result = result + " " + word
        else:
            if word != "":
                new_word = process.extractOne(word, X_words)
                if new_word[1] > 90: # Задаем порог для включения слова
                    result = result + " " + new_word[0]
    return result

In [None]:
train['new_item_name'] = train['item_name'].apply(new_text_lev)

In [None]:
train['item_name'][10:20]

In [None]:
train['item_name'][10:20].apply(new_text_lev)

In [None]:
identity_tokenizer(str(train['item_name'][:1]))

In [None]:
(train['item_name'][:1])

In [56]:
import numpy as np

In [None]:
# Функция для разделения текста
def identity_tokenizer(text):
    return re.findall('[А-яA-z]+', text)

In [69]:
stop = stopwords.words('russian')
tfidf = TfidfVectorizer(min_df = 6, ngram_range=(1,3))
X_train = tfidf.fit_transform(train.item_name)

In [70]:
len(tfidf.vocabulary_)

8146

In [71]:
clf = LogisticRegression(max_iter=400)
np.array(cross_val_score(clf, X_train, train.category_id, cv=3, scoring='f1_weighted')).mean()

0.7131785515282553

In [None]:
train.head()

In [None]:
from transliterate import translit

In [None]:
process.extractOne(translit('plus', 'ru'), ['пл'])

In [None]:
translit('plus', 'ru')

In [8]:
clf = LogisticRegression(max_iter=400)

In [27]:
X_train = tfidf.transform(train['item_name'][:45000])
y_train = train['category_id'][:45000]

In [28]:
X_test = tfidf.transform(train['item_name'][45000:])
y_test = train['category_id'][45000:]

In [29]:
clf.fit(X_train, y_train)

LogisticRegression(max_iter=400)

In [30]:
y_pred = clf.predict(X_test)

In [37]:
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.layers import Dense, Embedding, LSTM

In [38]:
max_words = len(tfidf.vocabulary_)
num_classes = 96
maxSequenceLength = len(tfidf.vocabulary_)

In [44]:
len(tfidf.vocabulary_)

6474

In [52]:
from keras.layers import Embedding
from keras import preprocessing

In [43]:
embedding_layer = Embedding(10000, 64)

In [45]:
max_features = 10000
maxlen = 5

In [76]:
model = Sequential()
model.add(Embedding(10000, 8, input_length=maxlen))
model.add(Flatten())
model.add(Dense(96, activation='softmax'))
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [7]:
from sklearn.neural_network import MLPClassifier

In [8]:
mlp = MLPClassifier(solver='lbfgs', random_state=0, max_iter=1000, hidden_layer_sizes=[1000, 100])

In [10]:
mlp.fit(X_train, train.category_id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


MLPClassifier(hidden_layer_sizes=[1000, 100], max_iter=1000, random_state=0,
              solver='lbfgs')

In [11]:
import pickle

In [12]:
import pickle
pickle.dump(tfidf, open('tfidf', 'wb'))
pickle.dump(mlp, open('mlp_task1', 'wb'))

In [82]:
cross_val_score(mlp, X_train, train.category_id, cv=3, scoring='f1_weighted')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


array([0.75982965, 0.74151029, 0.69955346])

In [79]:
np.array([0.76546632, 0.7459006 , 0.71880272]).mean()

0.74338988

In [83]:
np.array([0.75982965, 0.74151029, 0.69955346]).mean()

0.7336311333333333

In [74]:
import numpy as np

In [99]:
import re

In [247]:
ini_str = "тестСлова АВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH"

In [291]:
def repl_case(text):
    text = re.sub(r'([а-я](?=[А-Я])|[А-Я](?=[А-Я][а-я]))', r'\1 ', text)
    text = re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', text)
    return text

In [188]:
ini_str = repl_case(ini_str)
ini_str

'тест Слова'

In [251]:
re.split('[А-я][^А-Я]*', ini_str)

['', '', '', '', '', '']

In [274]:
re.sub(r"(\w)([А-Я])", r"\1 \2", ini_str)

'тест Слова А ВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH'

In [293]:
train['item_name'] = train['item_name'].apply(repl_case)

In [294]:
train['item_name'].head(10)

0                                       Молоко 3,2%,шт
1                            Компот из изюма, 114 ккал
2    Макаронные изделия отварные (масло сливочное),...
3                        Кофе Капучино Большой Эден 18
4                               Хлеб на СЫВОРОТКЕ 350г
5                       Сосиска в тесте с сыром 1шт ГЕ
6      Ланч Баскет 5 за 300: 2 шт ОРИГ Стрипсы кур фил
7                                  Станд Картофель фри
8                                   Хот-дог Куриный СБ
9                                 Чизбургер с луком СБ
Name: item_name, dtype: object

In [257]:
re.sub(r'([а-я](?=[А-Я])|[А-Я](?=[А-Я][а-я]))', r' ', ini_str)

'тес Слова АВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH'

In [273]:
re.sub(r'([а-я](?=[А-Я])|[А-Я](?=[А-Я][а-я]))', r' ', ini_str)

'тес Слова АВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH'

In [255]:
''.join(' ' + char if char.isupper() else char.strip() for char in ini_str).strip()

'тест Слова А В Сjhjhjh Vgfgf43.466dfd/465gghj Gfgg J H G H'

In [280]:
re.sub(r"(\w)([А-Я])", r"\1 \2", ini_str)

'тест Слова А ВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH'

In [281]:
re.sub(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', "SimpleHTTPServer")

'Simple HTTP Server'

In [287]:
re.sub(r'([а-я](?=[А-Я])|[А-Я](?=[А-Я][а-я]))', r'\1 ', ini_str)

'тест Слова АВС jhjhjhVgfgf 43.466 dfd/465g ghjGfgg JHGH'

In [17]:
train.shape

(6483420, 9)

In [10]:
ad_df = pd.read_excel('cat0.xlsx')

In [12]:
ad_df.head()

Unnamed: 0,item_name,category_id
0,Пиво PAULANER Hefe-Weissbier безалкогольное ж/...,0
1,Пиво безалкогольное CLAUSTHALER. бутылка 0.33 л,0
2,Пиво MAISEL'S WEISSE безалкогольное ст/б. 0.5л,0
3,Пиво CLAUSTHALER безалкогольное в жестяной бан...,0
4,Пиво БАЛТИКА №0 безалкогольное ж/б. 0.45л,0


In [16]:
train = pd.concat([train, ad_df])

In [1]:
from nltk.stem import SnowballStemmer 
russian_stemmer = SnowballStemmer('russian')

In [34]:
%%time

russian_stemmer.stem('Шампанское полусладкое 0.75литров 10% Сметаной')

Wall time: 0 ns


'шампанское полусладкое 0.75литров 10% сметан'

In [16]:
from pymystem3 import Mystem
m = Mystem()

Installing mystem to C:\Users\sypachev/.local/bin\mystem.exe from http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip


In [64]:
%%time

text = "Шампанское полусладкое 0.75литров 10% Сметаной"

for word in text.split():
    print(m.lemmatize(word))

['шампанское', '\n']
['полусладкий', '\n']
['0.75', 'литр', '\n']
['10', '%\n']
['сметана', '\n']
Wall time: 8.65 s


In [66]:
%%time

print(m.lemmatize(text))

['шампанское', ' ', 'полусладкий', ' ', '0.75', 'литр', ' ', '10', '% ', 'сметана', '\n']
Wall time: 1.81 s


In [40]:
def stemm(text):
    result = ""
    for word in identity_tokenizer(text):
        result += " " + russian_stemmer.stem(word)
    return result

In [43]:
import re

In [44]:
def identity_tokenizer(text):
    return re.findall('[А-яA-z]+', text)

In [46]:
%%time

stemm("Шампанское полусладкое 0.75литров 10% Сметаной")

Wall time: 0 ns


' шампанск полусладк литр сметан'

In [71]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sypachev\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [72]:
import nltk
from nltk.stem import WordNetLemmatizer

In [73]:
lemmatizer = WordNetLemmatizer()

In [76]:
%%time

lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])

Wall time: 0 ns


In [77]:
lemmatized_output

'Шампанское полусладкое 0.75литров 10% Сметаной'