In [1]:
from isswrapper.util.helpers import read_parquet_into_dataframe
import pandas as pd
import numpy as np
import os
from pnd_moex.general.plots import anomaly_plot
from pnd_moex.general.general import anomaly_detect, anomaly_news_markup_func
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Locate the `datasets` folder under the parent of the current working directory.
current_path = os.getcwd()
project_path = os.path.dirname(current_path)
datasets_folder_path = os.path.join(project_path, 'datasets')

# Table holding the p_date entries of every token (the "news" used for markup).
pnd_token_date_path = os.path.join(datasets_folder_path, "pnd_token_date.parquet")
pnd_token_date_df = read_parquet_into_dataframe(pnd_token_date_path)

# Time series of all securities.
time_series_path = os.path.join(datasets_folder_path, "time_series_securities_pnd.parquet")
ts_df = read_parquet_into_dataframe(time_series_path)

# Remove the static and uninformative columns (in place).
static_columns = ["BOARDID", "SHORTNAME", "CURRENCYID", "TRADINGSESSION", "WAVAL"]
ts_df.drop(columns=static_columns, inplace=True)

print(ts_df.shape)
ts_df.sample(5)

(135031, 19)


Unnamed: 0,TRADEDATE,SECID,NUMTRADES,VALUE,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE,VOLUME,MARKETPRICE2,MARKETPRICE3,ADMITTEDQUOTE,MP2VALTRD,MARKETPRICE3TRADESVALUE,ADMITTEDVALUE,TRENDCLSPR,currencyid
1492,2020-05-13,LNZLP,74.0,446925.0,3725.0,3700.0,3780.0,3710.0,3725.0,3710.0,120.0,,3720.0,3710.0,0.0,502535.0,0.0,-0.13,RUB
2318,2023-08-17,TGKBP,639.0,8549798.0,0.01516,0.01474,0.01533,0.01516,0.01502,0.01516,569100000.0,0.01502,0.01502,,8549798.0,8549798.0,,0.33,RUB
1852,2021-10-12,KUZB,1123.0,7550895.3,0.02315,0.023,0.02496,0.02399,0.02405,0.02399,313920000.0,0.02405,0.02405,0.02399,7550895.3,7550895.3,7550895.3,3.36,RUB
869,2017-11-21,VJGZP,2.0,8530.0,433.0,420.0,433.0,420.0,427.0,420.0,20.0,425.0,425.0,420.0,1304840.0,1304840.0,1304840.0,-1.64,RUB
1735,2021-04-29,KCHE,6.0,56515.0,0.209,0.208,0.21,0.208,0.2095,0.208,270000.0,,0.2085,0.208,0.0,582025.0,0.0,-0.24,RUB


In [3]:
def process_time_series(
    ts_df: pd.DataFrame,
    pnd_df: pd.DataFrame,
    date_col: str = "TRADEDATE",
    token_col: str = "SECID",
    anomaly_detection_col: str = "CLOSE",
    anomaly_method: str = "80over3",
    mark_days_before: int = 10,
    mark_days_after: int = 3,
    scaling_opt: callable = None,
    ):
    """
    Process time series data with anomaly detection, markup, and optional scaling.

    Every token is handled separately: its rows are sorted by date, anomalies are
    detected on one column, the rows are marked up against the token's ``p_date``
    entries, numeric columns are scaled, and rows without a ``mark`` or
    ``new_index`` are dropped. ``new_index`` values are then shifted by a running
    offset so that they do not collide between tokens in the concatenated result.

    :param ts_df: Time series DataFrame containing the rows of all tokens.
    :type ts_df: pd.DataFrame
    :param pnd_df: DataFrame with pump and dump information, including ``token`` and ``p_date`` columns.
    :type pnd_df: pd.DataFrame
    :param date_col: Name of the date column, defaults to "TRADEDATE".
    :type date_col: str, optional
    :param token_col: Name of the token column, defaults to "SECID".
    :type token_col: str, optional
    :param anomaly_detection_col: Column used for anomaly detection, defaults to "CLOSE".
    :type anomaly_detection_col: str, optional
    :param anomaly_method: Anomaly detection method name, available options are ["3over20", "80over3", "quantile"], defaults to "80over3".
    :type anomaly_method: str, optional
    :param mark_days_before: Forwarded to ``anomaly_news_markup_func`` as ``days_before``, defaults to 10.
    :type mark_days_before: int, optional
    :param mark_days_after: Forwarded to ``anomaly_news_markup_func`` as ``days_after``, defaults to 3.
    :type mark_days_after: int, optional
    :param scaling_opt: Callable applied to the numeric columns of each token's frame; its result is
        assigned back to those columns. When None, the columns are converted to percentage changes
        and min-max scaled per token. Defaults to None.
    :type scaling_opt: callable, optional
    :return: Processed DataFrame (all tokens concatenated).
    :rtype: pd.DataFrame
    :raises IndexError: If a token present in ``ts_df`` has no row in ``pnd_df``.
    :raises ValueError: If ``ts_df`` contains no tokens (``pd.concat`` gets an empty list).
    """

    processed_df_list = []
    tokens = ts_df[token_col].unique().tolist()
    # Offset added to each token's ``new_index`` so segment ids stay unique overall.
    max_idx = 0

    for token in tokens:
        # take one token, in chronological order
        sec_df = ts_df[ts_df[token_col] == token]
        sec_df = sec_df.sort_values(date_col)

        # make proper (datetime) index; the date column itself is kept
        sec_df.index = pd.to_datetime(sec_df[date_col])

        # detect anomalies (one column per detection method is expected)
        a_df = anomaly_detect(sec_df[anomaly_detection_col])

        # news dates for the markup function; `.iloc[0]` raises IndexError when the
        # token is missing from pnd_df
        # NOTE(review): assumes p_date cells are array-like objects exposing .tolist()
        n_list = pnd_df[pnd_df["token"] == token]["p_date"].iloc[0].tolist()

        # creating markup
        marked = anomaly_news_markup_func(
            sec_df,
            a_df[anomaly_method],
            n_list,
            na_mark=np.nan,
            days_before=mark_days_before,
            days_after=mark_days_after,
            additional_indexing=True
            )

        # numeric feature columns (the markup columns are never scaled)
        num_cols = marked.drop(columns=["mark", "new_index"]).select_dtypes(np.number).columns

        if scaling_opt is None:
            # default: percentage change, then per-token min-max scaling
            marked[num_cols] = marked[num_cols].pct_change()
            # pct_change yields +/-inf after a zero value; treat those as missing
            marked.replace([np.inf, -np.inf], np.nan, inplace=True)
            scaler = MinMaxScaler()
            marked[num_cols] = scaler.fit_transform(marked[num_cols])
        else:
            marked[num_cols] = scaling_opt(marked[num_cols])

        # keep only rows that received both a segment id and a mark
        marked.dropna(subset=["new_index", "mark"], inplace=True)

        # Only advance the offset when this token contributed rows: the max of an
        # empty column is NaN, which would otherwise turn the offset (and thus the
        # new_index of every following token) into NaN.
        if not marked.empty:
            marked["new_index"] = marked["new_index"] + max_idx
            max_idx = marked["new_index"].max() + 1

        processed_df_list.append(marked)

    processed_df = pd.concat(processed_df_list)

    return processed_df

Before creating the markup, there is one important step left to take: creating rolling statistics to help us identify patterns more effectively.

Let's delve into this in more detail. If we closely examine pump cases, we may notice that certain values start to increase while the closing price remains static. Let's refer to this process as a preparation phase. It's evident that the trading volume will increase rapidly during this period. Even if the bad actors are subtle, the volume will still show some growth, albeit slowly. What can we do with this information? Simply put, we can calculate the rolling average and rolling sum of the volume, the number of trades, and perhaps other relevant features.

What else can we discover by analyzing our data? That's a bit more challenging to answer. If we had forum message data, it would be a valuable feature since forum activity regarding a suspected pumped token typically increases before the pump. We can also incorporate token news information. There are many different factors to consider.

Since we currently lack forum data, we can map news onto our dataset. We'll create a new column that represents the number of news articles on a given day. While this sounds promising, there's a caveat. We exclude every anomaly that is not connected to changes in risk factors. Consequently, we exclude every increase that occurs for significant reasons. Without marked instances of real price increases, we cannot be sure if this approach will make sense. It might even make things worse, so let's postpone it for now.

In [13]:
ts_df.columns

Index(['TRADEDATE', 'SECID', 'NUMTRADES', 'VALUE', 'OPEN', 'LOW', 'HIGH',
       'LEGALCLOSEPRICE', 'WAPRICE', 'CLOSE', 'VOLUME', 'MARKETPRICE2',
       'MARKETPRICE3', 'ADMITTEDQUOTE', 'MP2VALTRD', 'MARKETPRICE3TRADESVALUE',
       'ADMITTEDVALUE', 'TRENDCLSPR', 'currencyid'],
      dtype='object')

In [None]:
"TRADEDATE"
"SECID"
"NUMTRADES"
"VALUE"
"OPEN"
"LOW"
"HIGH"
"LEGALCLOSEPRICE"
"WAPRICE"
"CLOSE"
"VOLUME"
"MARKETPRICE2"
"MARKETPRICE3"
"ADMITTEDQUOTE"
"MP2VALTRD"
"MARKETPRICE3TRADESVALUE"
"ADMITTEDVALUE"
"TRENDCLSPR"
"currencyid"


In [4]:
# Dropping highly correlated columns.
# ts_df is mutated in place, so re-running this cell would raise KeyError for the
# columns that are already gone; errors="ignore" keeps the cell idempotent.
ts_df.drop(
    columns=["OPEN", "LOW", "HIGH", "LEGALCLOSEPRICE", "WAPRICE", "MARKETPRICE2", "MARKETPRICE3", "MP2VALTRD"],
    inplace=True,
    errors="ignore",
)

In [5]:
import matplotlib as mpl 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
mpl.style.use("ggplot")

In [3]:
# Pick one token to inspect. A seeded generator makes the choice (and every plot
# derived from it) reproducible on Restart & Run All; change TOKEN_SEED to look at
# a different token.
TOKEN_SEED = 42
token = np.random.default_rng(TOKEN_SEED).choice(pnd_token_date_df["token"].unique())
# All tokens present in the pump-and-dump table (displayed as the cell output).
pnd_token_date_df["token"].unique()

array(['ABRD', 'ALBK', 'ASSB', 'BELU', 'BLNG', 'BSPBP', 'CHKZ', 'DASB',
       'DVEC', 'DZRD', 'ELTZ', 'FXRW', 'FXWO', 'GECO', 'GTLC', 'GTRK',
       'IDVP', 'IGST', 'IGSTP', 'ISKJ', 'KCHE', 'KCHEP', 'KRKOP', 'KROT',
       'KROTP', 'KTSB', 'KTSBP', 'KUBE', 'KUZB', 'LENT', 'LNZL', 'LNZLP',
       'LPSB', 'LVHK', 'MERF', 'MGVM', 'MISB', 'MISBP', 'MOBB', 'MRKS',
       'MSST', 'MSTT', 'NKHP', 'NNSBP', 'ORUP', 'PAZA', 'RDRB', 'RKKE',
       'RLMNP', 'ROSB', 'ROST', 'RTSB', 'RTSBP', 'RU000A101NK4', 'RUSI',
       'SVAV', 'SVET', 'TGKBP', 'TGKN', 'TNSE', 'UCSS', 'UKUZ', 'UNKL',
       'UWGN', 'VGSBP', 'VJGZ', 'VJGZP', 'VRSB', 'VRSBP', 'VSYD', 'VSYDP',
       'YAKG'], dtype=object)

In [7]:
# To pin a specific security instead of the previously chosen one, assign it
# before this point, e.g. token = "BSPBP".

# Rows of the selected token, in chronological order.
token_mask = ts_df["SECID"] == token
sec_df = ts_df.loc[token_mask].sort_values("TRADEDATE")
# Index by trade date (the TRADEDATE column itself is kept).
sec_df.index = pd.to_datetime(sec_df["TRADEDATE"])

# Anomaly detection on the traded volume.
a_df = anomaly_detect(sec_df["VOLUME"])
# p_date entries ("news") stored for this token.
n_list = pnd_token_date_df.loc[pnd_token_date_df["token"] == token, "p_date"].iloc[0].tolist()


In [8]:
sec_df.columns

Index(['TRADEDATE', 'SECID', 'NUMTRADES', 'VALUE', 'CLOSE', 'VOLUME',
       'ADMITTEDQUOTE', 'MARKETPRICE3TRADESVALUE', 'ADMITTEDVALUE',
       'TRENDCLSPR', 'currencyid'],
      dtype='object')

In [9]:
# Rolling-window statistics of a single column for the selected token.
column_rolling_aggregate = "VOLUME"
days = 30
long_window = datetime.timedelta(days=days)
short_window = datetime.timedelta(days=3)

# From here on sec_df only holds the aggregated column plus the derived statistics.
sec_df = pd.DataFrame(sec_df[column_rolling_aggregate])

# Sum and mean of the raw column over the long window.
raw_rolling = sec_df[column_rolling_aggregate].rolling(long_window, min_periods=1)
sec_df["rolling_sum"] = raw_rolling.sum()
sec_df["rolling_avg"] = raw_rolling.mean()

# Shape statistics are taken over the rolling average, not over the raw column.
avg_rolling = sec_df["rolling_avg"].rolling(long_window, min_periods=1)
sec_df["rolling_skewness"] = avg_rolling.skew()
sec_df["rolling_kurtosis"] = avg_rolling.kurt()
sec_df["rolling_std"] = avg_rolling.std()

# Short-window smoothing of the skewness.
sec_df["avg_skewness"] = sec_df["rolling_skewness"].rolling(short_window, min_periods=1).mean()

# Short-window extremes of the smoothed skewness and their spread.
skewness_rolling = sec_df["avg_skewness"].rolling(short_window, min_periods=1)
sec_df["rolling_max"] = skewness_rolling.max()
sec_df["rolling_min"] = skewness_rolling.min()

sec_df["rolling_range"] = sec_df["rolling_max"] - sec_df["rolling_min"]
sec_df["weekday"] = sec_df.index.weekday

In [19]:
a_df.columns

Index(['3over20', '80over3', 'quantile'], dtype='object')

In [10]:
# Line plot of every column of sec_df against its index.
fig = px.line(sec_df)
# Add one vertical line per entry of n_list (the token's p_date values).
for news in n_list:
    fig.add_vline(x = news)
fig.show()

In [22]:

n_df = process_time_series(ts_df, pnd_token_date_df)
n_df.shape


All-NaN slice encountered


All-NaN slice encountered


All-NaN slice encountered


All-NaN slice encountered


All-NaN slice encountered


All-NaN slice encountered



(73413, 13)

In [28]:
# Plot CLOSE of one randomly chosen processed token, one colour per new_index value.
token = np.random.choice(n_df["SECID"].unique())
token_df = n_df[n_df["SECID"] == token]
px.line(token_df, x=token_df.index, y="CLOSE", color="new_index")


In [18]:
token = 'TGKBP'
token

'TGKBP'

In [4]:
# Example: build the markup for a single token (anomaly_plot is used to visualise the marks).

# Rows of the token, in chronological order, indexed by trade date.
token_mask = ts_df["SECID"] == token
sec_df = ts_df.loc[token_mask].sort_values("TRADEDATE")
sec_df.index = pd.to_datetime(sec_df["TRADEDATE"])

# Anomaly detection on the closing price and the token's p_date entries.
a_df = anomaly_detect(sec_df["CLOSE"])
n_list = pnd_token_date_df.loc[pnd_token_date_df["token"] == token, "p_date"].iloc[0].tolist()

# Markup with the "80over3" method; unmarked rows get NaN and new_index is added.
marked_df = anomaly_news_markup_func(
    sec_df,
    a_df["80over3"],
    n_list,
    na_mark=np.nan,
    additional_indexing=True,
)

In [11]:
marked_df["new_index"].max()

68.0

In [10]:
px.line(marked_df[marked_df["SECID"]==token], x=marked_df[marked_df["SECID"]==token].index, y="CLOSE", color="new_index")


In [44]:
# NOTE(review): `tmp_df` is not defined in any visible cell of this notebook, so this
# cell fails on a fresh kernel. Judging by the describe() output below it presumably
# holds the price columns (OPEN ... CLOSE) of one token — TODO confirm and define it.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform returns a plain array, so the frame is rebuilt with the original
# column names; the original index is not carried over.
new_tmp_df = pd.DataFrame(scaler.fit_transform(tmp_df), columns = tmp_df.columns)
new_tmp_df.describe()

Unnamed: 0,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE
count,629.0,629.0,629.0,629.0,629.0,629.0
mean,0.26474,0.370271,0.344231,0.394996,0.333961,0.389572
std,0.074477,0.074585,0.088799,0.073426,0.071961,0.07426
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.237886,0.344677,0.310927,0.367823,0.306394,0.361073
50%,0.257338,0.361728,0.335071,0.386998,0.325749,0.381496
75%,0.276142,0.384789,0.354802,0.402979,0.343937,0.3984
max,1.0,1.0,1.0,1.0,1.0,1.0


In [51]:
# Attach the markup columns positionally: .tolist() discards marked_df's index, so
# this relies on new_tmp_df and marked_df having the same number of rows in the
# same order.
new_tmp_df["n_idx"] = marked_df["new_index"].tolist()
new_tmp_df["mark"] = marked_df["mark"].tolist()

In [52]:
new_tmp_df.describe()

Unnamed: 0,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE,n_idx,mark
count,629.0,629.0,629.0,629.0,629.0,629.0,501.0,501.0
mean,0.26474,0.370271,0.344231,0.394996,0.333961,0.389572,6.864271,0.025948
std,0.074477,0.074585,0.088799,0.073426,0.071961,0.07426,4.15422,0.159139
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.237886,0.344677,0.310927,0.367823,0.306394,0.361073,3.0,0.0
50%,0.257338,0.361728,0.335071,0.386998,0.325749,0.381496,7.0,0.0
75%,0.276142,0.384789,0.354802,0.402979,0.343937,0.3984,9.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,14.0,1.0


In [53]:
new_tmp_df.dropna(subset=["n_idx"]).head()

Unnamed: 0,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE,n_idx,mark
62,0.391158,0.530033,0.501167,0.539564,0.485925,0.535431,0.0,0.0
63,0.390784,0.529562,0.500702,0.539137,0.485477,0.535,0.0,0.0
64,0.390802,0.529585,0.500725,0.539158,0.485499,0.535021,0.0,0.0
65,0.391124,0.529989,0.501124,0.539524,0.485884,0.535391,0.0,0.0
73,0.390775,0.529551,0.500692,0.539127,0.485815,0.53499,1.0,0.0


In [58]:
refined = new_tmp_df.dropna(subset=["n_idx"])

In [62]:
refined.iloc[:10].select_dtypes(np.number).columns

Index(['OPEN', 'LOW', 'HIGH', 'LEGALCLOSEPRICE', 'WAPRICE', 'CLOSE', 'n_idx',
       'mark'],
      dtype='object')

In [33]:
# px.line(marked_df, x=marked_df.index, y="CLOSE", color="new_index")
# some questionable visuals for new indexes


In [12]:
# Run the markup pipeline for one randomly chosen token and inspect the result.
token = np.random.choice(ts_df["SECID"].unique())

# Rows of that token, in chronological order, indexed by trade date.
token_mask = ts_df["SECID"] == token
sec_df = ts_df.loc[token_mask].sort_values("TRADEDATE")
sec_df.index = pd.to_datetime(sec_df["TRADEDATE"])

# Anomaly detection on the closing price.
a_df = anomaly_detect(sec_df["CLOSE"])

# p_date entries ("news") stored for this token.
n_list = pnd_token_date_df.loc[pnd_token_date_df["token"] == token, "p_date"].iloc[0].tolist()

# Markup and new indexing with the "80over3" method.
marked_df = anomaly_news_markup_func(
    sec_df,
    a_df["80over3"],
    n_list,
    na_mark=np.nan,
    additional_indexing=True,
)

# Scaling (pct_change followed by MinMaxScaler) and NaN removal are not applied in
# this cell, so the frame below still contains raw values and missing entries.
print(marked_df.shape)
print("NaN values",marked_df.isna().sum().sum())
marked_df.head()



(1497, 21)
NaN values 3568


Unnamed: 0_level_0,TRADEDATE,SECID,NUMTRADES,VALUE,OPEN,LOW,HIGH,LEGALCLOSEPRICE,WAPRICE,CLOSE,...,MARKETPRICE2,MARKETPRICE3,ADMITTEDQUOTE,MP2VALTRD,MARKETPRICE3TRADESVALUE,ADMITTEDVALUE,TRENDCLSPR,currencyid,mark,new_index
TRADEDATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-12,2013-12-12,MGVM,787.0,1257798.3,0.7,0.7,0.88,0.836,0.83,0.838,...,0.83,0.83,0.83,1257798.3,1257798.3,1257798.3,-7.51,RUB,0.0,0.0
2013-12-13,2013-12-13,MGVM,517.0,733907.5,0.85,0.779,0.994,0.782,0.803,0.787,...,0.803,0.803,0.803,733907.5,733907.5,733907.5,-6.09,RUB,0.0,0.0
2013-12-16,2013-12-16,MGVM,543.0,855014.8,0.793,0.723,0.794,0.77,0.766,0.77,...,0.766,0.766,0.766,855014.8,855014.8,855014.8,-2.16,RUB,0.0,0.0
2013-12-17,2013-12-17,MGVM,300.0,244650.6,0.79,0.769,0.808,0.787,0.789,0.787,...,,0.773,,0.0,501941.8,0.0,2.21,RUB,0.0,0.0
2013-12-18,2013-12-18,MGVM,359.0,445425.7,0.778,0.728,0.794,0.756,0.756,0.752,...,,0.759,,0.0,501160.3,0.0,-4.45,RUB,0.0,0.0


In [13]:

anomaly_plot(marked_df, pd.get_dummies(marked_df["mark"]), n_list)


In [None]:
# Drop some highly correlated columns.
# errors="ignore" makes this a no-op when the columns were already removed from
# ts_df (it is mutated in place), instead of raising KeyError.
ts_df.drop(
    columns=["OPEN", "LOW", "HIGH", "LEGALCLOSEPRICE", "WAPRICE", "MARKETPRICE2", "MARKETPRICE3", "MP2VALTRD"],
    inplace=True,
    errors="ignore",
)


The next step is to generate and select features. For this purpose I'll use the tsfresh library.
tsfresh does not accept NaN values, so we need to fill them first.
There are many ways to do this.


In [45]:
import tsfresh

In [52]:
ts_df.isna().sum()

TRADEDATE                      0
SECID                          0
NUMTRADES                      0
VALUE                          0
OPEN                       26130
LOW                        26130
HIGH                       26130
LEGALCLOSEPRICE              181
WAPRICE                    25561
CLOSE                      26130
VOLUME                        83
MARKETPRICE2               92028
MARKETPRICE3               11697
ADMITTEDQUOTE              27712
MP2VALTRD                     83
MARKETPRICE3TRADESVALUE       83
ADMITTEDVALUE               9687
TRENDCLSPR                 21501
currencyid                     0
dtype: int64

In [53]:
tsfresh.utilities.dataframe_functions.check_for_nans_in_columns(ts_df)

ValueError: Columns ['OPEN', 'LOW', 'HIGH', 'LEGALCLOSEPRICE', 'WAPRICE', 'CLOSE', 'VOLUME', 'MARKETPRICE2', 'MARKETPRICE3', 'ADMITTEDQUOTE', 'MP2VALTRD', 'MARKETPRICE3TRADESVALUE', 'ADMITTEDVALUE', 'TRENDCLSPR'] of DataFrame must not contain NaN values

In [46]:
# Extract the minimal tsfresh feature set per SECID, ordered by TRADEDATE, with the
# "mark" column excluded from the input.
# NOTE(review): `processed_df` is not assigned at notebook level in any visible cell
# (it is a local inside process_time_series, whose result is stored as `n_df`), so
# this relies on hidden kernel state — TODO confirm which frame is intended.
# The recorded ValueError shows that the frame used here had NaN in SECID.
df_features = tsfresh.extract_features(processed_df.drop(columns=["mark"]), column_id="SECID", column_sort="TRADEDATE", default_fc_parameters=tsfresh.feature_extraction.MinimalFCParameters())
df_features.columns

ValueError: Column must not contain NaN values: SECID