In [10]:
from pathlib import Path
import pandas

data_dir = Path("..") / 'data'
train_data_file = pandas.read_csv(data_dir / "train.csv")
test_data_file = pandas.read_csv(data_dir / 'test.csv')

In [11]:
from sentence_transformers import SentenceTransformer
import numpy
import re


def name_refactor(string: str):
    string = re.sub(r"\[.*?\]", "", string)
    string = re.sub(r"\(.*?\)", "", string)
    string = re.sub(r"[^A-ZА-Яa-zа-я0-9 ]", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.lower()
    return string


def split_slash(category) :
    components = category.split('/')
    return list(filter(lambda it: it, components))


def fill_category(cat_list):
    if len(cat_list) == 0:
        return ['неизвестно', 'неизвестно', 'неизвестно']
    if len(cat_list) == 1:
        return cat_list + ['неизвестно', 'неизвестно']
    elif len(cat_list) == 2:
        return cat_list + ['неизвестно']
    return cat_list


def worker(df: pandas.DataFrame, train: bool) -> pandas.DataFrame():
    df['Rating'] = df['Rating'].str.replace(',', '.').astype(float)
    df['Brand'] = df.Brand.astype(str)

    if train:
        df = df[~((df['Days in stock'] <= 0) & (df['Sales'] > 0))]
        df = df[~(df['Days with sales'] > df['Days in stock'])]

    df['Final sale'] = numpy.round(100.0 - 100.0 * df['Final price'] / df['Base price'])

    df['clean_name'] = df['Name'].apply(name_refactor)
    df['clean_name_len'] = df['clean_name'].apply(len)
    df['clean_brand'] = df['Brand'].apply(name_refactor)
    df['clean_name_brand'] = df['clean_name'] + " " + df['clean_brand']

    df['item_popularity'] = df['Days with sales'] / 31
    df['item_availability'] = df['Days in stock'] / 31

    df['sales_ratio'] = df['Days with sales'] / df['Days in stock']
    df['sales_ratio'] = df['sales_ratio'].fillna(value=0)
    df['sales_ratio'].replace([numpy.inf], 1.0, inplace=True)

    category_components: pandas.Series = df.apply(lambda row: split_slash(row['Category']), axis=1)
    category_components = category_components.apply(fill_category)
    df['cat_1'] = category_components.apply(lambda x: x[0])
    df['cat_2'] = category_components.apply(lambda x: x[1])
    df['cat_3'] = category_components.apply(lambda x: x[2])

    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

    embs = model.encode(sentences=df.clean_name_brand.to_list(), device='cpu')
    df['name_brand_emb'] = embs.tolist()
    df['cat_1_emb'] = model.encode(df.cat_1.to_list()).tolist()
    df['cat_2_emb'] = model.encode(df.cat_2.to_list()).tolist()
    df['cat_3_emb'] = model.encode(df.cat_3.to_list()).tolist()
    df = df.drop(columns=[
        'Name',
        'cat_1',
        'cat_2',
        'cat_3',
        'Brand',
        'Seller',
        'clean_name_len',
        'clean_name',
        'clean_brand',
        'clean_name_brand',
        'Color',
        'full_category',
        'Category',
        'Average price',
        'Min price',
        'Max price',
        'Base price',
        'Basic Sale',
        'Basic Sale Price'])
    return df

In [12]:
train_df = worker(train_data_file, train=True)

In [13]:
test_df = worker(test_data_file, train=False)
test_Id = test_df.Id.to_list()

In [14]:
test_df = test_df.drop(columns=['Id'])

In [17]:
import catboost as cb

train_data = cb.Pool(train_df.drop(columns='Sales'), label=train_df.Sales,
                     embedding_features=['name_brand_emb', 'cat_1_emb', 'cat_2_emb', 'cat_3_emb'])

In [18]:
test_data = cb.Pool(test_df,
                    embedding_features=['name_brand_emb', 'cat_1_emb', 'cat_2_emb', 'cat_3_emb'])

In [19]:
catboost_reg = cb.CatBoostRegressor(loss_function='MAE',
                                    task_type='CPU',
                                    learning_rate=0.01,
                                    iterations=5000)

In [20]:
catboost_reg.fit(train_data)

0:	learn: 25.2894803	total: 55.5ms	remaining: 4m 37s
1:	learn: 25.1834389	total: 60.3ms	remaining: 2m 30s
2:	learn: 25.1147619	total: 64.3ms	remaining: 1m 47s
3:	learn: 25.0120388	total: 68.6ms	remaining: 1m 25s
4:	learn: 24.9253649	total: 72.8ms	remaining: 1m 12s
5:	learn: 24.8556653	total: 76.9ms	remaining: 1m 3s
6:	learn: 24.7455119	total: 80.7ms	remaining: 57.6s
7:	learn: 24.6419299	total: 84.4ms	remaining: 52.7s
8:	learn: 24.5686166	total: 88.2ms	remaining: 48.9s
9:	learn: 24.5061014	total: 91.9ms	remaining: 45.8s
10:	learn: 24.4461383	total: 95.9ms	remaining: 43.5s
11:	learn: 24.3333918	total: 99.6ms	remaining: 41.4s
12:	learn: 24.2417420	total: 103ms	remaining: 39.5s
13:	learn: 24.1394259	total: 107ms	remaining: 38.1s
14:	learn: 24.0262646	total: 111ms	remaining: 37s
15:	learn: 23.9307833	total: 116ms	remaining: 36s
16:	learn: 23.8671629	total: 120ms	remaining: 35.3s
17:	learn: 23.8085527	total: 125ms	remaining: 34.7s
18:	learn: 23.7188183	total: 129ms	remaining: 33.8s
19:	learn

<catboost.core.CatBoostRegressor at 0x7fc48a502be0>

In [21]:
test_pred = catboost_reg.predict(test_data)
train_pred = catboost_reg.predict(train_data)

In [22]:
submit_df = pandas.DataFrame({'Id': test_Id, 'Expected': numpy.round(test_pred)})

In [23]:
submit_df.to_csv('submission.csv', index=False)