In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
from functions.preprocessing import get_train_val_data_for_catboost
from functions.fit_eval_funcs import train_and_validate_catboost
import pandas as pd
import warnings
# Suppress every warning raised from here on: no category or module filter is given,
# so the 'ignore' action applies to all of them until the warning filters are reset.
# NOTE(review): this also hides deprecation and data-quality warnings from pandas and
# the project modules — consider narrowing it to the specific noisy category.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/statixx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
# Spreadsheet with the training data; the path is relative, so it is resolved
# against the kernel's current working directory.
path = 'data/TenderHack_Москва_train_data.xls'
# NOTE(review): the preview of `data` shows an 'Unnamed: 0' column, presumably a row
# index that was saved into the sheet — confirm that preprocessing ignores it.
data = pd.read_excel(path)

In [6]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,id,Статус,Наименование КС,ОКПД 2,КПГЗ,Регион,НМЦК,Итоговая цена,Дата,Участники,Ставки,ИНН
0,0,Завершена,СТУЛЬЯ УЧЕНИЧЕСКИЕ,,01.06.01.03.01;01.06.01.03.01,Москва,596790.0,593806.05,2021-06-30 11:20:05.720,1,1,d8912494ba2edd3bfeba55206a78a0f5
1,1,Не состоялась,МЕДИЦИНСКИЕ РАСХОДНЫЕ МАТЕРИАЛЫ,,01.02.10.50.33;01.02.10.43.05.01,Москва,4964.9,0.0,2022-10-04 09:16:04.770,0,0,237a5c57a66f02f8deb152e21f33863a
2,3,Завершена,Видеокарта Palit PCI-E,,01.13.17.08,Москва,462000.0,311850.0,2021-07-01 13:23:09.177,6,65,040f1570117a744d529f4675f827a50f
3,4,Завершена,"ТОВАРЫ ИНФОРМАЦИОННО-ТЕХНОЛОГИЧЕСКИЕ, СРЕДСТВА...",,01.13.11.03.02;01.13.04.04.06.05;01.10.04.04.0...,Москва,505555.0,460042.9,2021-03-16 10:42:20.810,6,18,c00c03dca0274fe43fc34e974434a927
4,5,Завершена,Пинцет острый,,01.02.10.06.48.04,Москва,2000.0,1980.0,2022-05-26 11:04:42.597,2,2,cc997efa7a6742b6119a3c253a084e80


## Get data (completed sessions only, with datetime features)
1. Keep only the sessions that ended normally (status `Завершена`)
2. Add datetime features to the feature dataframe

In [7]:
# Build the train/validation split for the baseline experiments.
# The three returned objects each expose `.train` and `.valid`:
# the feature set plus the two targets (price drawdown, number of competitors).
# status_columns=['Завершена'] presumably restricts the data to completed
# sessions — verify against functions.preprocessing.
features, drawdown, num_competitors = get_train_val_data_for_catboost(
    data,
    status_columns=['Завершена'],
    use_date_features=True,
)

[INFO] X y split...
[INFO] Done...


## Simple model using datetime features and other categorical features

### Validation MAE on the 1st target: price drawdown in percent

In [8]:
# Target: price drawdown. Objective and tracked metric: MAE.
# The call receives the train/valid feature splits followed by the
# train/valid drawdown targets; verbose=0 silences per-iteration logging.
drawdown_model = train_and_validate_catboost(
    features.train,
    features.valid,
    drawdown.train,
    drawdown.valid,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


            R2        MAE
Score  0.11111  12.720013


In [68]:
# Target: price drawdown. Objective and tracked metric: RMSE.
# Note: the score table printed for this cell still reports R2 and MAE, not RMSE.
# This rebinds `drawdown_model`, replacing the model fitted with the MAE objective.
drawdown_model = train_and_validate_catboost(
    features.train,
    features.valid,
    drawdown.train,
    drawdown.valid,
    iterations=5000,
    loss_function='RMSE',
    custom_metric="RMSE",
    verbose=0,
)

             R2        MAE
Score  0.208469  13.067395


### Validation MAE on the 2nd target: number of competitors

In [98]:
# Target: number of competitors. Objective and tracked metric: MAE.
# Same feature splits as the drawdown runs; only the target pair changes.
num_comp_model = train_and_validate_catboost(
    features.train,
    features.valid,
    num_competitors.train,
    num_competitors.valid,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


             R2       MAE
Score  0.255137  1.868958


In [99]:
# Target: number of competitors. Objective and tracked metric: RMSE.
# Note: the score table printed for this cell still reports R2 and MAE, not RMSE.
# This rebinds `num_comp_model`, replacing the model fitted with the MAE objective.
num_comp_model = train_and_validate_catboost(
    features.train,
    features.valid,
    num_competitors.train,
    num_competitors.valid,
    iterations=5000,
    loss_function='RMSE',
    custom_metric="RMSE",
    verbose=0,
)

             R2       MAE
Score  0.310553  1.884676


## Text processing: 
Pass text features such as 'Наименование КС' to CatBoost.

### Validation MAE on the 1st target: price drawdown in percent

In [39]:
# Target: price drawdown. Objective and tracked metric: MAE.
# Differs from the baseline MAE run only by use_text_features=True.
drawdown_model = train_and_validate_catboost(
    features.train,
    features.valid,
    drawdown.train,
    drawdown.valid,
    use_text_features=True,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


             R2        MAE
Score  0.110359  12.890812


### Validation MAE on the 2nd target: number of competitors

In [106]:
# Target: number of competitors. Objective and tracked metric: MAE.
# Differs from the baseline MAE run only by use_text_features=True.
num_comp_model = train_and_validate_catboost(
    features.train,
    features.valid,
    num_competitors.train,
    num_competitors.valid,
    use_text_features=True,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


             R2       MAE
Score  0.277157  1.861879


## TEXT VECTORS:
Convert the classifier codes to words, then transform the word columns ('Наименование КС' and the code words) into embedding vectors using gensim.

In [30]:
# Rebuild the train/validation split with vectorize_features=True in addition to
# the date features.
# NOTE(review): this rebinds `features`, `drawdown` and `num_competitors`, the same
# names the earlier experiment cells read. Any earlier cell re-run after this one
# will silently use the vectorized split — consider distinct names for this split.
features, drawdown, num_competitors = get_train_val_data_for_catboost(
                                            data, 
                                            status_columns=['Завершена'],
                                            vectorize_features=True,
                                            use_date_features=True)

[INFO] Loading classifier database...
[INFO] Starting code to words process...
[INFO] Transform words to vectors...
[INFO] Unite vectors...
[INFO] X y split...
[INFO] Done...


### Validation MAE on the 1st target: price drawdown in percent

In [36]:
# Target: price drawdown. Objective and tracked metric: MAE.
# Uses whatever split `features` / `drawdown` are currently bound to.
drawdown_model = train_and_validate_catboost(
    features.train,
    features.valid,
    drawdown.train,
    drawdown.valid,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


             R2       MAE
Score  0.133225  12.75187


In [37]:
# Target: price drawdown. Objective and tracked metric: RMSE.
# Note: the score table printed for this cell still reports R2 and MAE, not RMSE.
drawdown_model = train_and_validate_catboost(
    features.train,
    features.valid,
    drawdown.train,
    drawdown.valid,
    iterations=5000,
    loss_function='RMSE',
    custom_metric="RMSE",
    verbose=0,
)

             R2       MAE
Score  0.257744  12.59445


### Validation MAE on the 2nd target: number of competitors

In [35]:
# Target: number of competitors. Objective and tracked metric: MAE.
# Uses whatever split `features` / `num_competitors` are currently bound to.
num_comp_model = train_and_validate_catboost(
    features.train,
    features.valid,
    num_competitors.train,
    num_competitors.valid,
    iterations=5000,
    loss_function='MAE',
    custom_metric="MAE",
    verbose=0,
)

Default metric period is 5 because MAE is/are not implemented for GPU


             R2       MAE
Score  0.284966  1.844291


In [34]:
# Target: number of competitors. Objective and tracked metric: RMSE.
# Note: the score table printed for this cell still reports R2 and MAE, not RMSE.
num_comp_model = train_and_validate_catboost(
    features.train,
    features.valid,
    num_competitors.train,
    num_competitors.valid,
    iterations=5000,
    loss_function='RMSE',
    custom_metric="RMSE",
    verbose=0,
)

             R2       MAE
Score  0.343018  1.851319
