In [None]:
%pylab inline

# Switch all figures to the "bmh" style sheet.
# NOTE(review): `plt` is never imported explicitly in this cell; it presumably
# comes from the `%pylab inline` magic above — confirm before removing that magic.
plt.style.use("bmh")

import numpy as np
import pandas as pd

import tqdm
import pathlib
# catboost.version == 0.17.5
from catboost import cv, Pool, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor

from tsfresh.feature_extraction import ComprehensiveFCParameters,EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_extraction import extract_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
import tsfresh
from  few import FEW
from sklearn.linear_model import HuberRegressor, LinearRegression, Lasso, Lars

from sklearn.preprocessing import Normalizer, MinMaxScaler


# Folder that holds the input CSV files; the final predictions are written here too.
DATA_DIR = pathlib.Path("data/")
# Presumably a random seed. NOTE(review): no visible cell reads RS (the LinearSVR
# at the end uses random_state=1) — confirm whether it is still needed.
RS = 289475

# Загрузка данных

In [None]:
# Read the four input tables; each one is indexed by its parsed "date" column.
train_target, test_target, daily, weekly = (
    pd.read_csv(DATA_DIR / file_name, index_col="date", parse_dates=["date"])
    for file_name in (
        "pet_target_train.csv",
        "pet_test_timestamps.csv",
        "pet_daily.csv",
        "pet_weekly.csv",
    )
)

In [None]:

# Derived price features for the tsfresh stage.
#
# Everything is computed on working copies: the original version mutated
# `daily` / `weekly` in place and dropped the raw OHLC columns, so running the
# cell a second time failed with a KeyError. `daily` and `weekly` now stay
# exactly as loaded; `dfts`, `wfts` and `fts` are the same as before.
daily_feat = daily.copy()
for name in ['brent', 'USDCNY']:
    ohlc_cols = [name + '_close', name + '_open', name + '_max', name + '_min']
    close_col, open_col, max_col, min_col = ohlc_cols

    # The closing price becomes the base series for this instrument.
    daily_feat[name] = daily_feat[close_col]
    # Intraday ratios and the spread of the four quotes of the same row.
    daily_feat[name + '_diff1'] = daily_feat[close_col] / daily_feat[open_col]
    daily_feat[name + '_diff2'] = daily_feat[max_col] / daily_feat[min_col]
    daily_feat[name + '_std'] = daily_feat[ohlc_cols].std(axis=1)
    # Ratio of the close to its value `lag` rows earlier (rows, not calendar days).
    for lag in (2, 10, 30, 60):
        daily_feat[name + '_delta_' + str(lag)] = daily_feat[name] / daily_feat[name].shift(lag)

    # The raw quotes are not fed into feature generation.
    daily_feat = daily_feat.drop(columns=ohlc_cols)

weekly_feat = weekly.copy()
for col in ['paraxylene_CHN_USD', 'paraxylene_RT_USD', 'paraxylene_SEA_USD', 'pta_NEA_USD', 'ethylene_glycol_EU_EUR', 'ethylene_glycol_CHN_USD']:
    # Ratio to the value 1, 2, 4 and 8 rows back; the suffix is that lag times 7.
    for rows_back in (1, 2, 4, 8):
        weekly_feat[col + '_delta_' + str(7 * rows_back)] = weekly_feat[col] / weekly_feat[col].shift(rows_back)

# Cross-series price ratios.
for ratio_name, numerator, denominator in (
    ('paraxylene_diff1', 'paraxylene_CHN_USD', 'paraxylene_RT_USD'),
    ('paraxylene_diff2', 'paraxylene_CHN_USD', 'paraxylene_SEA_USD'),
    ('paraxylene_diff3', 'paraxylene_RT_USD', 'paraxylene_SEA_USD'),
    ('ethylene_glycol_diff1', 'ethylene_glycol_EU_EUR', 'ethylene_glycol_CHN_USD'),
):
    weekly_feat[ratio_name] = weekly_feat[numerator] / weekly_feat[denominator]


# Put both tables on a calendar-day grid: the daily table via a per-day mean
# (days without rows stay NaN), the weekly table forward-filled between rows.
dfts = daily_feat.resample("D").mean()
wfts = weekly_feat.resample("D").ffill()
fts = dfts.join(wfts)

In [None]:
# How the pipeline works:
# of all dates only those with day == 9 are kept and features are generated for
# them; the day is then changed from the 9th to the 1st so the rows can be
# joined with the target. The join is done with .shift(1).

def data_to_imp_columns(data, fdr_level = 5):
    """Return the names of the columns of ``data`` that tsfresh keeps as relevant.

    The rows of ``data`` are relabelled to the first day of their month, shifted
    down by one row and joined onto the module-level ``train_target`` frame.
    Gaps are forward-filled, then filled with the column mean; columns that are
    still incomplete are dropped before ``tsfresh.select_features`` is run
    against the ``pet`` target.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature frame whose index elements expose ``.year`` and ``.month``.
        The frame is not modified (the previous version overwrote its index).
    fdr_level : float, default 5
        Passed straight to ``tsfresh.select_features``.
        NOTE(review): 5 lies outside the 0-1 range tsfresh documents for an FDR
        level; presumably chosen to make the filter very permissive — confirm.

    Returns
    -------
    list
        Column labels retained by tsfresh.
    """
    # Re-index a copy so the caller's frame keeps its own index.
    aligned = data.copy()
    aligned.index = data.index.map(
        lambda stamp: pd.Timestamp(year=stamp.year, month=stamp.month, day=1)
    )

    joined = train_target.join(aligned.shift(1))
    x, y = joined.drop(columns=['pet']), joined['pet']

    # .ffill() replaces the deprecated fillna(method='pad'); x.mean() yields the
    # same per-column fill values as the mean of the joined frame did.
    usable = x.ffill().fillna(x.mean()).dropna(axis=1)
    selected = tsfresh.select_features(usable, y, fdr_level=fdr_level)
    return list(selected.columns)

# Every column of the merged daily/weekly frame is processed as its own series.
columns = fts.columns
settings = ComprehensiveFCParameters()

all_data = []
for col in columns:
    for shift in [5, 10, 20, 30, 50, 70, 100, 182, 365, 730]:
        # Rolling windows over the series, limited by max_timeshift=shift.
        df_shift, y = make_forecasting_frame(
            fts[col], kind=col, max_timeshift=shift, rolling_direction=1
        )
        # Keep only the windows whose id is dated the 9th of a month.
        id_day = df_shift['id'].map(lambda window_id: pd.to_datetime(window_id).day)
        df_shift = df_shift[id_day == 9]
        # Generate the tsfresh features for the remaining windows.
        extracted_data = extract_features(
            df_shift, column_value='value', column_id='id', default_fc_parameters=settings
        )
        # Tag each feature name with the window length and the source column.
        extracted_data = extracted_data.rename(
            columns=lambda feature: '{}-{}D-{}'.format(feature, shift, col)
        )
        # Basic feature selection: discard clearly useless features, constants, etc.
        kept = data_to_imp_columns(extracted_data, 5)
        extracted_data = extracted_data[kept]
        all_data.append(extracted_data)


# Collect all generated feature blocks side by side.
data = pd.concat(all_data, axis=1)
# Relabel each row from the 9th to the 1st of its month so it can be joined with the target.
data.index = data.index.map(
    lambda stamp: pd.Timestamp(year=stamp.year, month=stamp.month, day=1)
)

# Add an empty 2019-07-01 row so that shift(1) moves the previous row's values onto that date.
data.loc[pd.Timestamp('2019-07-01')] = None

# Join with a one-row lag.
lagged = data.shift(1)
train = train_target.join(lagged)
test = test_target.join(lagged)

In [None]:
# Reload the source tables; this time no columns are removed.
train_target, test_target, daily, weekly = (
    pd.read_csv(DATA_DIR / file_name, index_col="date", parse_dates=["date"])
    for file_name in (
        "pet_target_train.csv",
        "pet_test_timestamps.csv",
        "pet_daily.csv",
        "pet_weekly.csv",
    )
)

for name in ['brent', 'USDCNY']:
    close_col, open_col, max_col, min_col = (
        name + part for part in ('_close', '_open', '_max', '_min')
    )
    # Base series is the close; then intraday ratios and the spread of the four quotes.
    daily[name] = daily[close_col]
    daily[name + '_diff1'] = daily[close_col] / daily[open_col]
    daily[name + '_diff2'] = daily[max_col] / daily[min_col]
    daily[name + '_std'] = daily[[close_col, open_col, max_col, min_col]].std(axis=1)
    # Ratio of the close to its value `lag` rows earlier.
    for lag in (2, 10, 30, 60):
        daily[name + '_delta_' + str(lag)] = daily[name] / daily[name].shift(lag)


for col in ['paraxylene_CHN_USD', 'paraxylene_RT_USD', 'paraxylene_SEA_USD', 'pta_NEA_USD', 'ethylene_glycol_EU_EUR', 'ethylene_glycol_CHN_USD']:
    # Ratio to the value 1, 2, 4 and 8 rows back; the suffix is that lag times 7.
    for rows_back in (1, 2, 4, 8):
        weekly[col + '_delta_' + str(7 * rows_back)] = weekly[col] / weekly[col].shift(rows_back)

# Cross-series price ratios.
for ratio_name, numerator, denominator in (
    ('paraxylene_diff1', 'paraxylene_CHN_USD', 'paraxylene_RT_USD'),
    ('paraxylene_diff2', 'paraxylene_CHN_USD', 'paraxylene_SEA_USD'),
    ('paraxylene_diff3', 'paraxylene_RT_USD', 'paraxylene_SEA_USD'),
    ('ethylene_glycol_diff1', 'ethylene_glycol_EU_EUR', 'ethylene_glycol_CHN_USD'),
):
    weekly[ratio_name] = weekly[numerator] / weekly[denominator]


# Both tables go onto a calendar-day grid, forward-filling between existing rows.
dfts = daily.resample("D").ffill()
wfts = weekly.resample("D").ffill()
fts = dfts.join(wfts)

In [None]:
# Append the source series themselves to the generated features, aligned in the
# same way as the generated ones.

# Rows dated the 9th of each month ...
on_ninth = fts.index.day == 9
data = fts[on_ninth]
# ... relabelled to the 1st of that month.
data.index = data.index.map(
    lambda stamp: pd.Timestamp(year=stamp.year, month=stamp.month, day=1)
)

# Empty 2019-07-01 row so that shift(1) has a slot to move the previous row into.
data.loc[pd.Timestamp('2019-07-01')] = None

lagged_raw = data.shift(1)
test = test.join(lagged_raw)
train = train.join(lagged_raw)

# Split the training frame into the feature matrix and the `pet` target.
x = train.drop(columns=['pet'])
y = train['pet']

In [None]:
# Clean the data: the model is linear, so the matrices must be free of NaN/inf.

def _sanitize_features(frame, fill_values):
    """Return a finite float64 copy of ``frame``.

    Gaps are forward-filled along the index, remaining NaNs take the per-column
    value from ``fill_values``, anything still missing becomes 0, and every
    value is limited to the float32 range.

    The sign is preserved: the previous code turned ``-inf`` (and finite values
    below ``-float32 max``) into ``+float32 max``.
    """
    limit = float(np.finfo(np.float32).max)
    # .ffill() replaces the deprecated fillna(method='pad').
    filled = frame.ffill().fillna(fill_values)
    # nan_to_num maps NaN -> 0 and +/-inf -> +/-float64 max; clip brings those
    # (and any other out-of-range value) back to +/-float32 max.
    values = np.nan_to_num(np.asarray(filled, dtype=np.float64))
    return pd.DataFrame(np.clip(values, -limit, limit), columns=filled.columns, index=filled.index)


info = pd.DataFrame()

columns = x.columns
# Fill values come from the training data only and are reused for the test set.
train_means = x.mean()

cleaned_x = _sanitize_features(x[columns], train_means)
cleaned_test_x = _sanitize_features(test[columns], train_means)

all_columns = cleaned_test_x.columns

In [None]:
# Keep handles on the full cleaned matrices; `cleaned_x` / `cleaned_test_x`
# are rebound to a column subset in the next cell.
start_cleaned_x, start_cleaned_test_x = cleaned_x, cleaned_test_x

In [None]:
# Final feature selection on the full cleaned training matrix, with fdr_level 1e-7.
relevant = tsfresh.select_features(start_cleaned_x, y, fdr_level=1e-7)
tsfresh_columns = list(relevant.columns)

# Restrict train and test to the same selected columns.
cleaned_x = start_cleaned_x.loc[:, tsfresh_columns]
cleaned_test_x = start_cleaned_test_x.loc[:, tsfresh_columns]

In [None]:
from sklearn.preprocessing import Normalizer, MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.svm import LinearSVR

# Linear support-vector regressor on min-max scaled features.
model = LinearSVR(C=0.032, random_state=1)
scaler = MinMaxScaler()

# The scaler is fitted on the training matrix only and reused for the test matrix.
train_scaled = scaler.fit_transform(cleaned_x)
model.fit(train_scaled, y)

test_scaled = scaler.transform(cleaned_test_x)
ts_preds = model.predict(test_scaled)

# Write the predictions as a single "pet" column indexed by the test dates.
ts_preds = pd.DataFrame(ts_preds, columns=["pet"], index=cleaned_test_x.index)
ts_preds.to_csv(DATA_DIR / "pet.csv")