In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress every Python warning for the rest of the session. No category or
# module filter is given, so this also hides warnings raised by pandas and
# scikit-learn in the cells below.
warnings.filterwarnings("ignore")


Standardize each stock's historical data and handle extreme values according to the 3$\sigma$ rule.

**3$\sigma$ Criterion**

A value is considered an outlier when it lies more than 3$\sigma$ (three standard deviations) from the mean of the measured values. For normally distributed data, the probability of such a value is less than 0.3%.

> ***I use a for loop to process each stock, which takes some time. If you have a more efficient method, please let me know in the comment section!!!🥰***

In [None]:
# Load the price table; the cells below rely on its SecuritiesCode, Date,
# Open, High, Low, Close and Volume columns.
prices = pd.read_csv(
    filepath_or_buffer='../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
)

In [None]:
def calculateOutlier(column):
    """Clip values lying more than 3 standard deviations from the column mean.

    Parameters
    ----------
    column : pandas.Series
        Numeric values of a single column.

    Returns
    -------
    pandas.Series
        A new Series in which every value is limited to the interval
        ``[mean - 3*std, mean + 3*std]``. The input Series is not modified.
        When the standard deviation is undefined (NaN), ``column`` itself is
        returned unchanged.
    """
    mu = column.mean()
    std = column.std()
    # std is NaN for fewer than two non-missing values; no bounds exist then.
    if pd.isnull(std):
        return column
    # Series.clip is not in-place: its result must be returned, otherwise the
    # caller receives the original, unclipped data.
    return column.clip(lower=mu - 3*std, upper=mu + 3*std)

In [None]:
# Columns that are standardized and then passed through calculateOutlier,
# separately for every security.
feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

# Collect one processed frame per security and concatenate once at the end;
# calling pd.concat inside the loop re-copies all accumulated rows each time.
processed = []

# sort=False yields the securities in order of first appearance in `prices`.
for code, group in prices.groupby('SecuritiesCode', sort=False):
    # Work on an explicit copy so the assignments below never write into a
    # slice of `prices`.
    std_data = group.copy()
    ss = StandardScaler()
    std_data[feature_cols] = ss.fit_transform(std_data[feature_cols])
    # DataFrame.apply returns a new frame instead of modifying in place, so
    # store whatever calculateOutlier returns for each column.
    std_data[feature_cols] = std_data[feature_cols].apply(calculateOutlier, axis=0)
    processed.append(std_data)

# pd.concat raises on an empty list; fall back to an empty frame with the
# same columns when `prices` has no rows.
if processed:
    price_std = pd.concat(processed)
else:
    price_std = pd.DataFrame(columns=prices.columns)
df = price_std

In [None]:
# Order the rows by date first, then by security code within each date.
sort_keys = ['Date', 'SecuritiesCode']
df = df.sort_values(by=sort_keys)

In [None]:
# Display the sorted frame as the cell output.
df

In [None]:
# Write the result to the working directory. The index argument is left at
# its default, so the frame's index is written as the first column.
df.to_csv(path_or_buf='./stock_prices_std_outlier.csv')