In [1]:
from isswrapper.util.helpers import read_parquet_into_dataframe
import pandas as pd
import numpy as np
import os
import datetime 
from ydata_profiling import ProfileReport
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from adtk.detector import VolatilityShiftAD, QuantileAD, PersistAD
import itertools
from pnd_moex.general.plots import anomaly_plot
from pnd_moex.general.general import anomaly_detect, anomaly_news_markup_func
from pnd_moex.util.other import find_all_sequences

  # NOTE: a dangling `@nb.jit` decorator used to sit here. It decorated nothing and
  # `nb` is never imported in this cell, so it could only break parsing of the cell.


In [2]:
# Locate the datasets folder: it sits next to the current working directory,
# i.e. <parent of cwd>/datasets.
current_path = os.getcwd()
project_path = os.path.dirname(current_path)
datasets_folder_path = os.path.join(project_path, 'datasets')

# News table.
news_parquet_path = os.path.join(datasets_folder_path, "pnd_token_date.parquet")
pnd_token_date_df = read_parquet_into_dataframe(news_parquet_path)

# Time series of all securities, minus the columns that are dropped up front.
ts_parquet_path = os.path.join(datasets_folder_path, "time_series_securities_pnd.parquet")
dropped_columns = ["BOARDID", "SHORTNAME", "CURRENCYID", "TRADINGSESSION", "WAVAL"]
ts_df = read_parquet_into_dataframe(ts_parquet_path).drop(columns=dropped_columns)

# Show the frame size, then a random five-row preview as the cell output.
print(ts_df.shape)
ts_df.sample(5)

(134205, 19)


Unnamed: 0,TRADEDATE,SECID,NUMTRADES,VALUE,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE,VOLUME,MARKETPRICE2,MARKETPRICE3,ADMITTEDQUOTE,MP2VALTRD,MARKETPRICE3TRADESVALUE,ADMITTEDVALUE,TRENDCLSPR,currencyid
261,2021-07-02,IDVP,0.0,0.0,,,,64000.0,,,0.0,,62000.0,64000.0,0.0,802000.0,0.0,,RUB
598,2016-10-25,VSYDP,0.0,0.0,,,,8300.0,,,0.0,,8300.0,8300.0,0.0,508100.0,0.0,,RUB
166,2022-08-11,UWGN,4108.0,21732517.3,48.9,48.4,54.4,54.1,51.3,54.1,423242.0,51.3,51.3,54.1,21732517.3,21732517.3,21732517.3,11.55,RUB
980,2018-05-02,VRSBP,0.0,0.0,,,,20.8,,,0.0,,19.2,20.8,0.0,504380.0,0.0,-100.0,RUB
1886,2021-11-30,LVHK,156.0,875530.0,16.1,15.61,16.59,15.7,16.04,15.7,54600.0,16.04,16.04,15.7,875530.0,875530.0,875530.0,-2.85,RUB


In [3]:
# Pick one security at random among the tokens listed in the news table.
# (To inspect a specific one instead, set e.g. token = "BSPBP".)
known_tokens = pnd_token_date_df["token"].unique()
token = np.random.choice(known_tokens)

# Rows of that security, ordered by trade date and indexed by the parsed date.
sec_df = ts_df.loc[ts_df["SECID"] == token].sort_values("TRADEDATE")
sec_df.index = pd.to_datetime(sec_df["TRADEDATE"])

# Run anomaly_detect on the CLOSE series.
a_df = anomaly_detect(sec_df["CLOSE"])

# News dates for the token: "p_date" of the first matching row, converted to a list.
token_mask = pnd_token_date_df["token"] == token
n_list = pnd_token_date_df.loc[token_mask, "p_date"].iloc[0].tolist()



In [4]:
# Mark the data of every security named in the news table, then stack the results.
# A plain `for` loop is kept deliberately: the example plot that follows reads the
# `token` and `n_list` values left bound by the last iteration.
processed_df_list = []
for token in pnd_token_date_df["token"].unique():
    # Rows of this security, ordered by trade date and indexed by the parsed date.
    sec_df = ts_df.loc[ts_df["SECID"] == token].sort_values("TRADEDATE")
    sec_df.index = pd.to_datetime(sec_df["TRADEDATE"])

    # Run anomaly_detect on the CLOSE series.
    a_df = anomaly_detect(sec_df["CLOSE"])

    # News dates for the token: "p_date" of the first matching row, as a list.
    n_list = pnd_token_date_df.loc[pnd_token_date_df["token"] == token, "p_date"].iloc[0].tolist()

    # Combine the security data, the "80over3" anomaly column and the news dates.
    marked = anomaly_news_markup_func(sec_df, a_df["80over3"], n_list)
    processed_df_list.append(marked)

# One frame holding the marked rows of all securities.
processed_df = pd.concat(processed_df_list)

In [5]:
# Example: visualise the marks of one security with anomaly_plot.
# `token` and `n_list` are whatever the preceding cells left bound.
is_selected = processed_df["SECID"] == token
marked_df = processed_df.loc[is_selected]
mark_dummies = pd.get_dummies(marked_df["mark"])
anomaly_plot(marked_df, mark_dummies, n_list)

In [6]:
# Display the column labels of ts_df.
ts_df.columns

Index(['TRADEDATE', 'SECID', 'NUMTRADES', 'VALUE', 'OPEN', 'LOW', 'HIGH',
       'LEGALCLOSEPRICE', 'WAPRICE', 'CLOSE', 'VOLUME', 'MARKETPRICE2',
       'MARKETPRICE3', 'ADMITTEDQUOTE', 'MP2VALTRD', 'MARKETPRICE3TRADESVALUE',
       'ADMITTEDVALUE', 'TRENDCLSPR', 'currencyid'],
      dtype='object')

The next step is to generate and select features. For this purpose I'll use the tsfresh library.
tsfresh does not accept missing values in its input, so the NaNs have to be filled first.
There are many ways to do this.


In [45]:
import tsfresh

In [52]:
# Count the missing (NaN) values in each column of ts_df.
ts_df.isna().sum()

TRADEDATE                      0
SECID                          0
NUMTRADES                      0
VALUE                          0
OPEN                       26130
LOW                        26130
HIGH                       26130
LEGALCLOSEPRICE              181
WAPRICE                    25561
CLOSE                      26130
VOLUME                        83
MARKETPRICE2               92028
MARKETPRICE3               11697
ADMITTEDQUOTE              27712
MP2VALTRD                     83
MARKETPRICE3TRADESVALUE       83
ADMITTEDVALUE               9687
TRENDCLSPR                 21501
currencyid                     0
dtype: int64

In [53]:
# Ask tsfresh whether ts_df is free of NaNs. The check raises ValueError naming the
# offending columns. That message is the point of this cell, so catch the error and
# print it instead of letting it abort a "Restart & Run All".
try:
    tsfresh.utilities.dataframe_functions.check_for_nans_in_columns(ts_df)
except ValueError as nan_error:
    print(f"ValueError: {nan_error}")

ValueError: Columns ['OPEN', 'LOW', 'HIGH', 'LEGALCLOSEPRICE', 'WAPRICE', 'CLOSE', 'VOLUME', 'MARKETPRICE2', 'MARKETPRICE3', 'ADMITTEDQUOTE', 'MP2VALTRD', 'MARKETPRICE3TRADESVALUE', 'ADMITTEDVALUE', 'TRENDCLSPR'] of DataFrame must not contain NaN values

In [46]:
# Extract tsfresh features from processed_df (without its "mark" column), using SECID
# as the id column, TRADEDATE as the sort column and the MinimalFCParameters settings;
# then show the names of the generated feature columns.
# NOTE(review): the recorded run of this cell fails with
# "Column must not contain NaN values: SECID", i.e. processed_df holds rows with a
# missing SECID — presumably introduced by anomaly_news_markup_func; TODO confirm and
# clean/fill the NaNs before calling extract_features.
# NOTE(review): the recorded execution count of this cell ([46]) is lower than that of
# the cells before it ([52], [53]) — the cells appear to have been run out of order.
df_features = tsfresh.extract_features(processed_df.drop(columns=["mark"]), column_id="SECID", column_sort="TRADEDATE", default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters())
df_features.columns

ValueError: Column must not contain NaN values: SECID