In [28]:
from pathlib import Path

import pandas as pd

data_dir = Path("..") / 'data'
train_csv = pd.read_csv(data_dir / "train.csv")
test_df = pd.read_csv(data_dir / 'test.csv')

In [29]:
train_csv['Seller'] = train_csv.Seller.fillna(value='неизвестно')

In [30]:
from sentence_transformers import SentenceTransformer
import numpy as np

import re


def string_work(string: str):
    string = re.sub(r"\[.*?\]", "", string)
    string = re.sub(r"\(.*?\)", "", string)
    string = re.sub(r"[^A-ZА-Яa-zа-я0-9 ]", "", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.lower()
    return string


def worker(df: pd.DataFrame, train: bool) -> pd.DataFrame():
    df['Rating'] = df['Rating'].str.replace(',', '.').astype(float)
    df['Brand'] = df.Brand.astype(str)
    df['Seller'] = df.Seller.fillna(value='неизвестно')

    if train:
        df = df[~((df['Days in stock'] <= 0) & (df['Sales'] > 0))]
        df = df[~(df['Days with sales'] > df['Days in stock'])]

    df['Final sale'] = np.round(100.0 - 100.0 * df['Final price'] / df['Base price'])

    df['clean_name'] = df['Name'].apply(string_work)
    df['clean_name_len'] = df['clean_name'].apply(len)
    df['clean_brand'] = df['Brand'].apply(string_work)
    df['clean_name_brand'] = df['clean_name'] + " " + df['clean_brand']
    df['clean_seller'] = df['Seller'].apply(string_work)

    df['item_popularity'] = df['Days with sales'] / 31
    df['item_availability'] = df['Days in stock'] / 31

    df['sales_ratio'] = df['Days with sales'] / df['Days in stock']
    df['sales_ratio'] = df['sales_ratio'].fillna(value=0)
    df['sales_ratio'].replace([np.inf], 1.0, inplace=True)

    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

    df["Category"] = df["Category"].apply(lambda x:x.replace("/"," ").strip())

    df['seller_emb'] = model.encode(df.clean_seller.to_list(), device='cpu').tolist()
    df['name_brand_emb'] = model.encode(df.clean_name_brand.to_list(), device='cpu').tolist()
    df['cat_emb'] = model.encode(df["Category"].to_list(), device='cpu').tolist()
    df = df.drop(columns=[
        'Name',
        'Brand',
        'Seller',
        'clean_seller',
        'clean_name_len',
        'clean_name',
        'clean_brand',
        'clean_name_brand',
        'Color',
        'full_category',
        'Category',
        'Average price',
        'Min price',
        'Max price',
        'Base price',
        'Basic Sale',
        'Basic Sale Price'])
    return df

In [31]:
train_csv = worker(train_csv, train=True)

In [32]:
test_df = worker(test_df, train=False)
test_Id = test_df.Id.to_list()

In [33]:
test_df = test_df.drop(columns=['Id'])

In [36]:
import catboost as cb

train_data = cb.Pool(train_csv.drop(columns='Sales'), label=train_csv.Sales,
                     embedding_features=['name_brand_emb', 'cat_emb', 'seller_emb'])

In [37]:

test_data = cb.Pool(test_df,
                    embedding_features=['name_brand_emb', 'cat_emb', 'seller_emb'])

In [38]:
catboost_reg = cb.CatBoostRegressor(loss_function='MAPE',
                                    task_type='CPU',
                                    learning_rate=0.015,
                                    depth=12,
                                    iterations=1200)

In [39]:
catboost_reg.fit(train_data, early_stopping_rounds=10)

0:	learn: 0.7425608	total: 120ms	remaining: 2m 23s
1:	learn: 0.7333517	total: 201ms	remaining: 2m
2:	learn: 0.7247292	total: 280ms	remaining: 1m 51s
3:	learn: 0.7164749	total: 347ms	remaining: 1m 43s
4:	learn: 0.7079905	total: 360ms	remaining: 1m 25s
5:	learn: 0.6996329	total: 428ms	remaining: 1m 25s
6:	learn: 0.6915014	total: 493ms	remaining: 1m 24s
7:	learn: 0.6827571	total: 554ms	remaining: 1m 22s
8:	learn: 0.6742958	total: 623ms	remaining: 1m 22s
9:	learn: 0.6657616	total: 686ms	remaining: 1m 21s
10:	learn: 0.6579012	total: 759ms	remaining: 1m 22s
11:	learn: 0.6499779	total: 835ms	remaining: 1m 22s
12:	learn: 0.6421394	total: 902ms	remaining: 1m 22s
13:	learn: 0.6347691	total: 973ms	remaining: 1m 22s
14:	learn: 0.6279630	total: 1.04s	remaining: 1m 21s
15:	learn: 0.6208978	total: 1.1s	remaining: 1m 21s
16:	learn: 0.6148583	total: 1.16s	remaining: 1m 20s
17:	learn: 0.6081709	total: 1.23s	remaining: 1m 20s
18:	learn: 0.6017266	total: 1.26s	remaining: 1m 18s
19:	learn: 0.5953412	total:

<catboost.core.CatBoostRegressor at 0x7f1350d17a60>

In [42]:
test_pred = catboost_reg.predict(test_data)
train_pred = catboost_reg.predict(train_data)

In [43]:
submit_df = pd.DataFrame({'Id': test_Id, 'Expected': np.round(test_pred)})

In [45]:
submit_df.to_csv('and_what_and.csv', index=False)