# Text to feature conversion.

Execution flow plan:

1. Loading the extended train and test datasets.
2. Converting text to features
3. Saving train and test encoded datasets.

## Loading:

In [1]:
import pandas as  pd

In [2]:
import os

# Directory (relative to the current working directory) that holds the datasets.
DATA_DIR = os.path.join("..", "data")

# Pickled "extended" train / test datasets read by this notebook.
TRAIN_EXT_PATH = os.path.join(DATA_DIR, "train_extended.pkl")
TEST_EXT_PATH = os.path.join(DATA_DIR, "test_extended.pkl")

In [3]:
# Load the extended training set from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source.
train = pd.read_pickle(TRAIN_EXT_PATH)
# Bare last expression so the notebook renders the (truncated) dataframe.
train

Unnamed: 0,product_id,category_id,sale,shop_id,shop_title,rating,title,description,attributes
0,325286,12171,False,9031,Aksik,5.00000,Зарядный кабель Borofone BX1 Lightning для айф...,"<p><span style=""background-color: transparent;...","[Длина: 1м, Разъем: Lightning, Подерживает быс..."
1,888134,14233,False,18305,Sela,5.00000,Трусы Sela,Трусы-слипы из эластичного бесшовного трикотаж...,[]
2,1267173,13429,False,16357,ЮНЛАНДИЯ канцтовары,5.00000,"Гуашь ""ЮНЫЙ ВОЛШЕБНИК"", 12 цветов по 35 мл, БО...",<p>Гуашь высшего качества ЮНЛАНДИЯ поможет соз...,[]
3,1416943,2789,False,34666,вася-nicotine,4.00000,Колба для кальяна Крафт (разные цвета),"<p><span style=""color: rgb(149, 151, 153);"">Ун...","[Материал: стекло, Внутренний диаметр: 45 мм ,..."
4,1058275,12834,False,26389,Lim Market,4.60000,"Пижама женская, однотонная с шортами",<p>Лёгкая ткань! Комфортная посадка! Идеальная...,[]
...,...,...,...,...,...,...,...,...,...
91111,114402,14922,False,4955,СТЕКЛОФФ ПРО,3.62069,Прочное стекло 2D на Samsung Galaxy J5 Prime,"<h2 class=""ql-align-justify"">Защитное стекло 2...",[В магазине СТЕКЛОФФ есть стекла всех моделей ...
91112,1594500,13028,False,19626,Hobby room,5.00000,"Алмазная мозаика ""Ромашки"" 40*50см на подрамнике","<p class=""ql-align-justify"">Алмазная мозаика -...","[Алмазная мозаика на подрамнике, Тип выкладки:..."
91113,790493,13407,False,22291,Море открыток,5.00000,"Открытка ""Вместе навсегда"" в крафтовом конверт...",<p>Открытка 10*15 в крафтовом конверте.</p>,[]
91114,114509,12100,False,2985,Oppa Market,5.00000,Пульт K10B-C1 для Rolsen,<p>Подходит к аппаратуре Ролсен:</p><p>Rolsen ...,[Пульт для телевизоров Rolsen]


In [4]:
# Load the extended test set from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source.
test = pd.read_pickle(TEST_EXT_PATH)
# Bare last expression so the notebook renders the (truncated) dataframe.
test

Unnamed: 0,product_id,sale,shop_id,shop_title,rating,title,description,attributes
0,1997646,False,22758,Sky_Electronics,5.000000,"Светодиодная лента Smart led Strip Light, с пу...","<p>Светодиодная лента LED, 5 м, RGB (Цветная) ...","[Легкость управления с пульта, а так же смартф..."
1,927375,False,17729,Di-Di Market,4.405941,Стекло ПЛЕНКА керамик матовое Honor 50 lite 10...,"<p><span style=""color: rgb(63, 62, 62);"">Защит...",[Honor 50 ПЛЕНКА!!! КРАЯ КЛЕЮТСЯ МОГУТ НЕ ПРО...
2,1921513,False,54327,VisionStore,4.000000,"Проводные наушники с микрофоном jack 3.5, IOS,...",<p>Наушники проводные с микрофоном отличное ка...,[Возможность использования как гарнитуры для П...
3,1668662,False,15000,FORNAILS,5.000000,"Декоративная табличка ""Правила кухни"", подстав...","<p>Декоративная табличка ""Правила кухни"" созда...",[]
4,1467778,False,39600,МОЯ КУХНЯ,5.000000,"Подставка под ложку керамическая, подложка ""Кл...",<p>Подложка керамическая с рисунком в подарочн...,"[Керамика., Размер: 255*90*30мм; ]"
...,...,...,...,...,...,...,...,...
16855,1914264,False,8598,Zoorbox,4.913043,Жесткий диск внутренний SSD KingDian 2.5 Inch ...,"<p><img src=""https://ke-images.servicecdn.ru/c...","[Интерфейс SSD SATA3 6.0 Гбит/с, Форм-фактор 2..."
16856,1310569,False,27474,RenRin,5.000000,Браслет оберег на руку/красная нить от сглаза/...,"<p>Красная нить - оберег, обладающий большой с...","[Насыщенный красный цвет, Хорошее плетение, Ун..."
16857,978095,False,23395,Хобби и Ты,5.000000,Кабошон бантик в упаковке 2 шт,"<p>Кабошон бантик, желтого цвета. Используется...","[Размер: 1.8 см., Пластик., В упаковке 2 шт.]"
16858,797547,False,16764,100000 мелочей,5.000000,"Полка для ванной угловая, 20,5 х 20,5 х 6,5 см...","<p>Полка для ванной угловая, 20,5×20,5×6,5 см,...","[Индивидуальная упаковка Без упаковки, Размер ..."


## Encoding text to numerical features:
Using a bag-of-words approach: each row's text is cleaned, tokenized and then hashed into a fixed-size numerical vector.

In [5]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize


def _cell_to_text(value) -> str:
    """Return the text stored in one dataframe cell as a single string.

    Strings pass through unchanged, iterables of fragments (e.g. a list of
    strings) are joined with a space, missing values (None / NaN) become an
    empty string, and anything else is converted with ``str``.
    """
    if isinstance(value, str):
        return value
    if hasattr(value, "__iter__"):
        # a space separator keeps the last word of one fragment from fusing
        # with the first word of the next one
        return " ".join(str(item) for item in value)
    if pd.isna(value):
        return ""
    return str(value)


def text_processing(df: pd.DataFrame, columns: list[str]) -> list[str]:
    """ Converts the text columns of a dataframe into rows of cleaned tokens.

        For every row and every requested column the text is:
        1. lower-cased;
        2. stripped of latin letters, digits and punctuation (regexp);
        3. tokenized;
        4. filtered (russian stop words and tokens shorter than 3 symbols
           are dropped);
        5. lemmatized token by token.
        The surviving tokens of all columns are then joined with spaces into
        one string per dataframe row.

        A progress message is printed after every 5000 processed rows.

        Parameters
        ----------
        df: pd.DataFrame
            dataframe that has text-value columns.
        columns: list[str]
            names of the columns which contain text.

        Returns
        -------
        tokenized_text: list[str]
            one space-separated string of processed tokens per dataframe row,
            in the original row order.
    """

    # Everything matched by this pattern (latin letters, digits, punctuation)
    # is replaced by a space. A raw string is used so that "\]" and "\-"
    # reach the regexp engine without being invalid string escapes.
    general_pattern = re.compile(
        r"""[A-Za-z0-9!#$%&'()*+,./:;<=>?@[\]^_`{|}~—"\-]+"""
    )
    whitespace_pattern = re.compile(r"\s+")

    # a set gives constant-time membership checks in the token loop
    stopwords_ru = frozenset(stopwords.words("russian"))

    # NOTE(review): WordNetLemmatizer is an English lemmatizer, and latin
    # letters are removed above, so it most likely returns the remaining
    # tokens unchanged. Kept to preserve the existing output - consider a
    # Russian morphological analyzer instead.
    lemmer = WordNetLemmatizer()

    # Pull every column out once; indexing plain lists is much cheaper than
    # building a Series for each row through .iloc.
    column_values = [df[col].tolist() for col in columns]

    tokenized_text = []

    # go through each row of dataframe and convert it to lemmatized tokens
    for i in range(len(df)):
        tokenized_row = []

        for values in column_values:
            part = _cell_to_text(values[i]).lower()
            # applying general pattern
            part = general_pattern.sub(" ", part)
            # substituting multiple spaces with single space
            part = whitespace_pattern.sub(" ", part)

            for token in word_tokenize(part, language="russian"):
                if len(token) >= 3 and token not in stopwords_ru:
                    tokenized_row.append(lemmer.lemmatize(token))

        # status prints
        if (i + 1) % 5000 == 0:
            print(f"{i+1} rows has been tokenized and lemmatized")
        # adding processed row to the resulting list.
        tokenized_text.append(" ".join(tokenized_row))

    return tokenized_text

In [6]:
from sklearn.feature_extraction.text import HashingVectorizer
# Convert the lists of tokenized rows into numerical values with a hashing vectorizer.

# Text columns that are turned into features.
TEXT_COLUMNS = ["title", "description"]

train_corpus = text_processing(train, TEXT_COLUMNS)
test_corpus = text_processing(test, TEXT_COLUMNS)

# lowercase=False: text_processing has already lower-cased the text.
# alternate_sign=False: see the scikit-learn HashingVectorizer documentation.
hv = HashingVectorizer(n_features=1500, lowercase=False, alternate_sign=False)

hv_train = hv.fit_transform(train_corpus)
hv_test = hv.transform(test_corpus)

5000 rows has been tokenized and lemmatized
10000 rows has been tokenized and lemmatized
15000 rows has been tokenized and lemmatized
20000 rows has been tokenized and lemmatized
25000 rows has been tokenized and lemmatized
30000 rows has been tokenized and lemmatized
35000 rows has been tokenized and lemmatized
40000 rows has been tokenized and lemmatized
45000 rows has been tokenized and lemmatized
50000 rows has been tokenized and lemmatized
55000 rows has been tokenized and lemmatized
60000 rows has been tokenized and lemmatized
65000 rows has been tokenized and lemmatized
70000 rows has been tokenized and lemmatized
75000 rows has been tokenized and lemmatized
80000 rows has been tokenized and lemmatized
85000 rows has been tokenized and lemmatized
90000 rows has been tokenized and lemmatized
5000 rows has been tokenized and lemmatized
10000 rows has been tokenized and lemmatized
15000 rows has been tokenized and lemmatized


**Check that the encoded matrices have the expected shapes (one row per sample, 1500 feature columns):**

In [7]:
# Show the sparse-matrix summary of the hashed training features:
# one row per processed training row, 1500 columns (n_features).
hv_train

<91116x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 4075608 stored elements in Compressed Sparse Row format>

In [8]:
# Show the sparse-matrix summary of the hashed test features:
# one row per processed test row, 1500 columns (n_features).
hv_test

<16860x1500 sparse matrix of type '<class 'numpy.float64'>'
	with 764746 stored elements in Compressed Sparse Row format>

## Saving matrices with numerical features:

In [9]:
import numpy as np

# Destination files for the encoded (hashed) feature matrices.
ENCODED_DATA_DIR = os.path.join("..", "data")
TRAIN_ENC_PATH = os.path.join(ENCODED_DATA_DIR, "train_encoded.npy")
TEST_ENC_PATH = os.path.join(ENCODED_DATA_DIR, "test_encoded.npy")

# Each sparse matrix is converted to a dense array before being written,
# so the saved files (and the temporary arrays) are full-size dense data.
for enc_path, sparse_features in ((TRAIN_ENC_PATH, hv_train), (TEST_ENC_PATH, hv_test)):
    np.save(enc_path, sparse_features.toarray())