In [None]:
import requests
import os
from io import BytesIO
import shutil
# Fetch every competition data file into ./data.
# exist_ok=True keeps the cell safe to re-run and avoids a check-then-create race.
os.makedirs("data", exist_ok=True)

files = ["CHMF Акции.csv",
         "MAGN Акции.csv",
         "NLMK Акции.csv",
         "test.xlsx",
         "train.xlsx",
         "Грузоперевозки.xlsx",
         "Данные рынка стройматериалов.xlsx",
         "Индекс LME.xlsx",
         "Макропоказатели.xlsx",
         "Показатели рынка металла.xlsx",
         "Топливо.xlsx",
         "Цены на сырье.xlsx"]

url = "https://raw.githubusercontent.com/samoletpanfilov/reinforcement_task/master/data/"
for f in files:
    # Download first and fail loudly on an HTTP error: otherwise an error page
    # would be written to disk under a .xlsx/.csv name and only break later,
    # inside the pandas readers.
    response = requests.get(url + f, timeout=60)
    response.raise_for_status()
    # The file is opened only after a successful download, so a failed request
    # never leaves an empty or truncated file behind.
    with open("data/" + f, 'wb') as out_file:
        out_file.write(response.content)

In [None]:
!pip install catboost



In [None]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm
import os

# Notebook-wide settings.
horizon = 10             # models are built for shifts 1 .. horizon-1; also the number of leading rows trimmed
month_length = 30        # EMA windows run from 3 up to 2*month_length - 1
inf = 10**9              # starting value for "best score so far" searches
test_size = int(0.2 * 400)  # hold-out rows taken from the end; 400 is size of dataset

In [None]:
def load_merge_data() :
    """Read all source tables from ``data/`` and outer-join them on ``timestamp``.

    Each table's date column is renamed to ``timestamp``; the rebar price column
    of ``train.xlsx`` is renamed to ``target`` and the freight index column to
    ``transfer_cost``. The ``timestamp`` columns of the three stock CSV files are
    converted with ``pd.to_datetime``; the Excel tables are merged as loaded.

    Returns:
        pandas.DataFrame: the outer join of all tables, ordered by ``timestamp``
        with a fresh 0..n-1 index. Rows that exist only in an auxiliary table
        have NaN in ``target``.
    """
    df_train = pd.read_excel(
        "data/train.xlsx").rename(columns={"dt": "timestamp", "Цена на арматуру": "target"})
    df_chmf = pd.read_csv(
        "data/CHMF Акции.csv").rename(columns={"Date": "timestamp"})
    df_magn = pd.read_csv(
        "data/MAGN Акции.csv").rename(columns={"Дата": "timestamp"})
    df_nlmk = pd.read_csv(
        "data/NLMK Акции.csv").rename(columns={"Date": "timestamp"})
    df_transfer = pd.read_excel("data/Грузоперевозки.xlsx").rename(
        columns={"dt": "timestamp", "Индекс стоимости грузоперевозок": "transfer_cost"})
    df_market = pd.read_excel(
        "data/Данные рынка стройматериалов.xlsx").rename(columns={"dt": "timestamp"})
    df_lme = pd.read_excel(
        "data/Индекс LME.xlsx").rename(columns={"дата": "timestamp"})
    df_macro = pd.read_excel(
        "data/Макропоказатели.xlsx").rename(columns={"dt": "timestamp"})
    df_fuel = pd.read_excel(
        "data/Топливо.xlsx").rename(columns={"dt": "timestamp"})
    df_raw_prices = pd.read_excel(
        "data/Цены на сырье.xlsx").rename(columns={"dt": "timestamp"})

    # The CSV readers return dates as text; make them comparable with the other tables.
    for stock_frame in (df_chmf, df_magn, df_nlmk):
        stock_frame["timestamp"] = pd.to_datetime(stock_frame["timestamp"])

    # Merge the dataframes (same order as before, so column suffixes are unchanged)
    merged_df = df_train
    for other in (df_chmf, df_magn, df_nlmk, df_transfer, df_market,
                  df_lme, df_macro, df_fuel, df_raw_prices):
        merged_df = pd.merge(merged_df, other, on='timestamp', how='outer')

    # sort_values returns a new frame; the result used to be thrown away, so the
    # ordering depended entirely on how merge happened to arrange the keys.
    # Downstream code relies on row order (shift / ewm), so make it explicit.
    merged_df = merged_df.sort_values("timestamp", kind="stable").reset_index(drop=True)

    return merged_df

In [None]:
def _extract_number(value):
    """Return the first number found in ``str(value)`` as text, or None when there is none."""
    # The alternation is grouped so that the optional sign applies to integers
    # as well; the ungrouped pattern turned "-5" into "5".
    found = re.search(r"[-+]?(?:\d*\.\d+|\d+)", str(value))
    return found.group(0) if found else None


def prepare_data(merged_df):
    """Add EMA features, keep rows with a known target and make all columns numeric.

    Steps:
      1. Add ``EMA<w>`` columns (exponential moving average of ``target``) for
         every window ``w`` in ``range(3, 2*month_length)``. They are computed on
         the full frame, i.e. before rows without a target are removed.
      2. Drop rows whose ``target`` is NaN.
      3. Convert every column except ``timestamp`` to float. Native integer and
         float columns are cast directly; for any other column the first number
         found in the text of each value is used (NaN when there is none).
      4. Convert ``timestamp`` with ``pd.to_datetime``.
      5. Drop columns with more than 50% NaN and write their names, one per
         line, to ``models/columns_with_high_nan.txt``.

    Args:
        merged_df: frame with at least ``timestamp`` and a numeric ``target``
            column. It is not modified.

    Returns:
        pandas.DataFrame: a new, cleaned frame.
    """
    # Work on a private copy so the caller's frame does not silently gain columns.
    merged_df = merged_df.copy()

    for window in range(3, 2*month_length): #target over a rolling window
        merged_df['EMA'+str(window)] = merged_df['target'].ewm(alpha=2 / (window + 1), adjust=False).mean()

    # .copy() so the column assignments below write to an independent frame
    # rather than to a possible view of the filtered selection.
    merged_df = merged_df.dropna(subset=["target"]).copy()

    for column in merged_df.columns:
        if column == "timestamp":
            continue
        series = merged_df[column]
        is_plain_number = isinstance(series.dtype, np.dtype) and series.dtype.kind in "iuf"
        if not is_plain_number:
            # Text / mixed column: pull the first number out of each cell.
            series = series.apply(_extract_number)
        merged_df[column] = series.astype(float)

    # Convert the "timestamp" column to datetime, if needed
    merged_df["timestamp"] = pd.to_datetime(merged_df["timestamp"])

    nan_counts = merged_df.isna().sum()
    columns_with_high_nan = nan_counts[nan_counts > merged_df.shape[0] * 0.5].index

    # Do not depend on another cell having created the output directory.
    os.makedirs("models", exist_ok=True)
    with open("models/columns_with_high_nan.txt", "w", encoding='utf8') as file:
        file.write("\n".join(columns_with_high_nan))

    merged_df = merged_df.drop(columns=columns_with_high_nan)

    #TODO:fillna

    return merged_df

In [None]:
def make_data_blocks(df, shift, window):
    """Build a lagged feature table for predicting the price change over ``shift`` rows.

    The new ``target`` is ``(target[t] - target[t-shift]) / 100`` passed through
    SoftSign (``x / (|x| + 1)``). For every column of ``df`` except
    ``timestamp`` and every lag ``i`` in ``range(shift, shift + window)`` one
    feature named ``<column><i>`` is produced:

      * lag ``i == shift``: the raw value ``column[t-shift]``;
      * lag ``i > shift``: the difference ``column[t-i] - column[t-shift]``.

    Names built this way can coincide (column ``EMA5`` at lag 12 and column
    ``EMA51`` at lag 2 are both ``EMA512``). Previously the later feature
    silently overwrote the earlier one. Now the first keeps the plain name and
    a later clash gets an underscore before the lag (``EMA51_2``), so no
    feature is lost and every non-clashing name is unchanged.

    Args:
        df: prepared frame containing ``timestamp`` and ``target``.
        shift: distance, in rows, between the newest usable data and the predicted row.
        window: number of consecutive lags to include.

    Returns:
        pandas.DataFrame: ``timestamp``, ``target`` and the lag features, with the
        first ``horizon`` rows (module-level constant) removed. Lags deeper than
        ``horizon`` still leave NaN in the first remaining rows.
    """
    upgrade_df = df[["timestamp"]].copy()
    rga, rgb = shift, shift+window

    x = np.array(df["target"]-df["target"].shift(rga)) / 100 # TODO
    x = x / (np.abs(x)+1) #SoftSign
    upgrade_df["target"] = x

    # Collect all features first and attach them in one concat instead of
    # inserting hundreds of columns one at a time.
    features = {}
    for col in df.columns:
        if col == "timestamp":
            continue
        anchor = df[col].shift(rga)
        for i in range(rga, rgb):
            separator = ""
            name = col + str(i)
            while name in features:  # resolve a name clash instead of overwriting
                separator += "_"
                name = col + separator + str(i)
            features[name] = anchor if i == rga else df[col].shift(i) - anchor

    upgrade_df = pd.concat([upgrade_df, pd.DataFrame(features, index=df.index)], axis=1)

    upgrade_df = upgrade_df[horizon:]
    return upgrade_df

In [None]:


def train_test_split(df):
    """Split ``df`` into (train, test), where test is the last ``test_size`` rows.

    Both parts are independent copies with their index reset to 0..n-1.
    """
    boundary = -test_size
    train_part = df[:boundary].copy()
    test_part = df[boundary:].copy()
    return (train_part.reset_index(drop=True), test_part.reset_index(drop=True))

In [None]:
class model:
    """A single CatBoost regressor bundled with the data prepared for it.

    Different (horizon, window) pairs need differently shaped feature tables,
    so each instance loads, prepares and splits its own copy of the data.

    Attributes set by ``__init__``:
        train_x, train_y: features and target used by ``fit``.
        test_x, test_y: features and target used by ``predict`` / ``true``.
        name: file path used by ``save`` and ``load``.
        model: the underlying ``CatBoostRegressor``.
    """

    def __init__(self, horizon, window, name, is_full_data_train = False):
        """Build the feature table for ``horizon``/``window`` and split it.

        With ``is_full_data_train=True`` the whole table serves as both the
        train and the test set; otherwise ``train_test_split`` is applied.
        """
        blocks = make_data_blocks(prepare_data(load_merge_data()), horizon, window)
        if not is_full_data_train:
            train_part, test_part = train_test_split(blocks)
        else:
            train_part = test_part = blocks

        non_features = ['target', "timestamp"]
        self.train_x = train_part.drop(columns=non_features)
        self.train_y = train_part['target']
        self.test_x = test_part.drop(columns=non_features)
        self.test_y = test_part['target']
        self.name = name
        self.model = CatBoostRegressor(verbose=0)

    def save(self):
        """Write the regressor to the path stored in ``name``."""
        self.model.save_model(self.name)

    def load(self):
        """Load the regressor from the path stored in ``name``."""
        self.model.load_model(self.name)

    def fit(self):
        """Train a fresh regressor on the train split, replacing the current one."""
        regressor = CatBoostRegressor(verbose=0)
        self.model = regressor
        regressor.fit(self.train_x , self.train_y)

    def predict(self):
        """Return the regressor's predictions for the test features."""
        return self.model.predict(self.test_x)

    def true(self):
        """Return the true target values of the test split."""
        return self.test_y

In [None]:
def find_negative_index(lst):
    """Return the index of the first negative item, or ``len(lst)`` if there is none."""
    negative_positions = (pos for pos, value in enumerate(lst) if value < 0)
    first = next(negative_positions, None)
    if first is None:
        return len(lst)
    return first

def make_orders_on_time_segment(models):
    """Turn the models' forecasts into a purchase volume for each of ``test_size`` rows.

    On a decision row the forecasts of all models for that row are scanned in
    order; the volume is one more than the position of the first negative
    forecast (capped by the rows left). The rows covered by that volume get 0.

    Args:
        models: objects whose ``predict()`` returns one value per row.

    Returns:
        list[int]: one volume per row, ``test_size`` entries.
    """
    forecasts_by_model = [model.predict() for model in models]
    orders = []
    rows_covered = 0
    for day_ind in range(test_size):
        if rows_covered != 0:
            orders.append(0)
        else:
            todays_forecasts = [forecast[day_ind] for forecast in forecasts_by_model]
            rows_covered = find_negative_index(todays_forecasts) + 1
            orders.append(min(rows_covered, test_size - day_ind))
        rows_covered -= 1
    return orders

In [None]:
def decision_prices(test):
    """Compute the total purchase cost produced by a sequence of tender decisions.

    Args:
        test: frame with columns ``dt`` (dates), ``target`` (price) and
            ``Объем`` (number of weeks a tender started on that date lasts).

    Returns:
        The sum, over every Monday between the first and last ``dt``, of the
        price fixed by the tender active in that week.
    """
    indexed = test.set_index('dt')
    prices = indexed['target']
    volumes = indexed['Объем']
    mondays = pd.date_range(indexed.index.min(), indexed.index.max(), freq='W-MON')

    cost = 0
    weeks_left = 0
    for report_date in mondays:
        if weeks_left == 0:  # Time for a new tender
            locked_price = prices.loc[report_date]
            weeks_left = int(volumes.loc[report_date])
        cost = cost + locked_price
        weeks_left = weeks_left - 1
    return cost  # Return the cost over the period

In [None]:
# Output directory for saved models and the dropped-column list.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs("models", exist_ok=True)

In [None]:
# Pick the lag-window size on a hold-out set: for each candidate window, train
# one model per shift 1..horizon-1, turn their forecasts into tender volumes
# and score those volumes with decision_prices (lower cost is better).
df = prepare_data(load_merge_data())

_, df_test = train_test_split(df)
# .copy(): the order column is added to this frame inside the loop below, so it
# must be an independent frame rather than a selection of another one.
df_test = df_test[['timestamp', 'target']].copy()
# MetaModel.fit later reads this trimmed frame through the global name `df`
# and assigns a column to it, hence the copy here as well.
df = df[horizon:].copy()

best_window = 0
best_score = inf
for window in [4,6,8,10,12]:
    print("training for block size of "+str(window))
    models = []
    for i in range(1, horizon):
        models.append(model(i, window, "models/cb_model_" + str(i)+".cbm", is_full_data_train = False))

    for _model in tqdm(models):
        _model.fit()

    res = make_orders_on_time_segment(models)
    print(*res)
    df_test['Объем'] = res
    score = decision_prices(df_test.rename(columns={"timestamp": "dt"}))
    print("score for size of " + str(window) + " is " + str(score))
    if best_score > score:
        best_score = score
        best_window = window

print("best window size is " + str(best_window) + " with score of " + str(best_score))

training for block size of 4


100%|██████████| 9/9 [02:02<00:00, 13.59s/it]


1 1 2 0 4 0 0 0 1 2 0 1 2 0 4 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 3 0 0 1 1 1 1 1 1 1 1 1 1 3 0 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 0 0 1 2 0
score for size of 4 is 4847200.0
training for block size of 6


100%|██████████| 9/9 [03:04<00:00, 20.51s/it]


1 1 1 2 0 2 0 1 1 2 0 1 2 0 4 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 3 0 0 1 1 1 1 1 1 1 1 1 3 0 0 4 0 0 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 4 0 0 0
score for size of 6 is 4829300.0
training for block size of 8


100%|██████████| 9/9 [04:08<00:00, 27.60s/it]


1 1 1 2 0 1 1 2 0 1 1 1 2 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 1 1 1 1 1 1 1 1 1 1 2 0 4 0 0 0 3 0 0 1 2 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 1 1 2 0
score for size of 8 is 4844200.0
training for block size of 10


100%|██████████| 9/9 [05:10<00:00, 34.53s/it]


1 1 1 2 0 1 1 2 0 1 1 1 2 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 3 0 0 1 1 1 1 1 1 1 1 1 3 0 0 4 0 0 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 1 2 0
score for size of 10 is 4823400.0
training for block size of 12


100%|██████████| 9/9 [06:14<00:00, 41.57s/it]


1 1 1 2 0 1 1 1 1 1 1 1 1 2 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 2 0 1 1 1 1 1 1 1 1 1 1 1 3 0 0 3 0 0 3 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 0 2 0 2 0
score for size of 12 is 4858700.0
best window size is 10 with score of 4823400.0


In [None]:
class MetaModel:
    """Ensemble of per-shift CatBoost models that outputs tender volumes.

    One ``model`` is created for every shift ``1 .. horizon-1``, each trained on
    the full data set. ``predict`` looks at the first ``cnt_models`` forecasts
    for a row, blends them with the shortest-shift forecast using
    ``risk_value``, and buys for as many rows as there are leading non-negative
    blended forecasts, plus one.

    Attributes:
        cb_models: list of ``model`` instances, ordered by shift.
        risk_value: blend weight of the first model's forecast (tuned by ``fit``).
        cnt_models: how many of ``cb_models`` take part in a decision (tuned by ``fit``).
        name: sub-directory of ``models/`` used by ``save`` / ``load``.
    """

    def __init__(self, window, name):
        self.cb_models = []
        self.risk_value = 0 #this value makes model to make more risky decisions and notice momentum changes
        self.cnt_models = 0
        self.name = name

        # makedirs also creates "models" itself and tolerates an existing directory.
        os.makedirs("models/"+ name, exist_ok=True)
        for i in range(1, horizon):
            self.cb_models.append(model(i, window, "models/"+ name + "/cb_model_" + str(i)+".cbm", is_full_data_train = True))

    def save(self):
        """Save every CatBoost model and write ``risk_value`` / ``cnt_models`` to ``<name>.mmm``."""
        for _model in self.cb_models:
            _model.save()

        # write risk_value in file
        with open("models/"+self.name+"/"+ self.name+".mmm", "w", encoding='utf8') as file:
            file.write(str(self.risk_value)+"\n")
            file.write(str(self.cnt_models)+"\n")

    def load(self):
        """Load every CatBoost model and restore ``risk_value`` / ``cnt_models`` from ``<name>.mmm``."""
        for _model in self.cb_models:
            _model.load()
        with open("models/"+ self.name+"/"+self.name+".mmm", "r", encoding='utf8') as file:
            f = file.read().splitlines()
            self.risk_value, self.cnt_models  = float(f[0]), int(f[1])

    def fit(self, step = 0.05, data = None):
        """Train all models, then tune ``cnt_models`` and ``risk_value`` by total cost.

        Args:
            step: grid step for ``risk_value``, searched over ``np.arange(0.1, 0.9, step)``.
            data: frame with ``timestamp`` and ``target`` columns and one row per
                prediction, used for scoring. Defaults to the notebook-level
                ``df``. The frame is not modified.
        """
        for _model in tqdm(self.cb_models):
            _model.fit()

        if data is None:
            data = df  # notebook-level prepared frame (kept as the default for compatibility)

        # The forecasts do not depend on cnt_models / risk_value, so compute
        # them once instead of re-running every model for every candidate.
        predictions = self._collect_predictions()

        def current_score():
            orders = self._orders_from(predictions)
            scored = data.assign(**{'Объем': orders}).rename(columns={"timestamp": "dt"})
            return decision_prices(scored)

        best_score = inf
        final_cnt_models = 0
        # +1: the upper bound used to be len(cb_models) exclusive, so using all
        # models was never tried.
        for candidate in range(1, len(self.cb_models) + 1):
            self.cnt_models = candidate
            score = current_score()
            if(score<best_score):
                best_score = score
                final_cnt_models = candidate
        self.cnt_models = final_cnt_models
        print("best cnt_models is " + str(self.cnt_models) )

        best_score = inf
        final_risk_value = 0
        for candidate in np.arange(0.1, 0.9, step):
            self.risk_value = candidate
            score = current_score()
            if(score<best_score):
                best_score = score
                final_risk_value = candidate
        self.risk_value = final_risk_value
        print("best risk_value is " + str(self.risk_value) )

    def __find_suitable_index(self, lst):
        """Blend ``lst[1:]`` with ``lst[0]`` in place and return the first negative index (or ``len(lst)``)."""
        for i in range(1, len(lst)):
            lst[i]= (lst[0]*self.risk_value + lst[i]*(1-self.risk_value))
        for i, num in enumerate(lst):
            if num < 0:
                return i
        return len(lst)

    def _collect_predictions(self):
        """Return one forecast sequence per model, in ``cb_models`` order."""
        return [_model.predict() for _model in self.cb_models]

    def _orders_from(self, models_predictions):
        """Convert per-model forecasts into a volume for every row."""
        if not models_predictions:
            return []
        result = []
        step = 0
        test_size = len(models_predictions[0])
        for day_ind in range(test_size):
            if step == 0:
                segment = [models_predictions[i][day_ind] for i in range(self.cnt_models)]
                step = self.__find_suitable_index(segment) + 1
                result.append(min(step, test_size - day_ind))
            else:
                result.append(0)
            step-=1
        return result

    def predict(self):
        """Return the tender volume for every row of the models' test data."""
        return self._orders_from(self._collect_predictions())

In [None]:
# Build the ensemble with the window size chosen by the search above; "test1"
# is the sub-directory of models/ that the ensemble uses for its files.
final_model = MetaModel(best_window, "test1")
# Trains every per-shift model, then tunes cnt_models and risk_value.
final_model.fit()

100%|██████████| 9/9 [05:20<00:00, 35.64s/it]


best cnt_models is 7
best risk_value is 0.6500000000000001


In [None]:
# Save each CatBoost model plus the .mmm file holding risk_value and cnt_models.
final_model.save()