In [11]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

def remove_outliers(dataframe, k=1.5):
    """
    Replace per-column outliers with NaN using the median absolute deviation (MAD).

    For every column, the median and the MAD (the median of the absolute
    deviations from that median) are computed. A value is kept only when it
    lies inside the closed interval ``[median - k * MAD, median + k * MAD]``.

    Parameters:
    dataframe -- input pandas DataFrame; it is not modified.
    k -- multiplier applied to the MAD when building the bounds, default 1.5.

    Returns:
    A new DataFrame of the same shape in which outliers have been replaced
    by NaN. Values that were already NaN remain NaN.

    Note:
    When a column's MAD is 0 the bounds collapse onto the median, so every
    value that differs from the median is replaced by NaN.
    """
    # Column-wise centre and spread.
    medians = dataframe.median()
    median_abs_deviation = (dataframe - medians).abs().median()

    # Inclusive acceptance interval for each column.
    lower_bound = medians - k * median_abs_deviation
    upper_bound = medians + k * median_abs_deviation

    # A single `where` is sufficient: it keeps in-range values and writes NaN
    # everywhere else (including positions that were NaN to begin with, since
    # comparisons against NaN are False). A follow-up `mask` on the complementary
    # condition would only re-blank cells that are already NaN.
    within_bounds = (dataframe >= lower_bound) & (dataframe <= upper_bound)
    return dataframe.where(within_bounds)

def standardize_dataframe(dataframe):
    """
    Z-score every column of a DataFrame.

    Each column has its own mean subtracted and is then divided by its own
    standard deviation (as returned by ``DataFrame.std``). This rescales the
    data; it does not change the shape of each column's distribution.

    Parameters:
    dataframe -- input pandas DataFrame.

    Returns:
    A new DataFrame holding the standardized values.
    """
    # Shift each column so that it is centred on zero.
    centered = dataframe - dataframe.mean()

    # Rescale by the column-wise standard deviation.
    column_std = dataframe.std()
    return centered / column_std

In [12]:
# Load beta.csv from the working directory, using the first CSV column as the row index.
# NOTE(review): the recorded output shows a date-like index named `Trddt`; it is
# not parsed as datetimes here (no parse_dates) — confirm whether downstream code needs that.
beta = pd.read_csv('beta.csv', index_col=0)

# Sanity check: print (rows, columns) of the loaded frame.
print(beta.shape)

# Replace per-column outliers with NaN (default k=1.5); `beta` itself is kept as loaded.
beta_no_outliers  = remove_outliers(beta)

# Standardize each column of the outlier-filtered frame.
beta_no_outliers_standardized = standardize_dataframe(beta_no_outliers)

# NOTE(review): print() gives a truncated plain-text dump of the whole frame;
# a `.head()` as the cell's last expression would give a compact rich display.
print(beta_no_outliers_standardized)


(965, 5034)
                   1   2         4         5         6         7         8  \
Trddt                                                                        
2019-03-22  1.661889 NaN -0.512521       NaN  1.785918  0.977134       NaN   
2019-03-25  1.656973 NaN -0.497318       NaN  1.798646  0.983009       NaN   
2019-03-26  1.368641 NaN -0.349093       NaN  1.210477  0.746972       NaN   
2019-03-27  1.332573 NaN -0.328641       NaN  1.219240  0.955044       NaN   
2019-03-28  1.317305 NaN -0.329080       NaN  1.220413  0.946321       NaN   
...              ...  ..       ...       ...       ...       ...       ...   
2023-03-07  0.919556 NaN -1.679529 -0.768330       NaN  1.167576  0.483327   
2023-03-08  0.920200 NaN -1.810762 -0.879154       NaN  1.170088  0.536475   
2023-03-09  0.697465 NaN -1.864818 -0.846820       NaN  0.366008 -0.056247   
2023-03-10  0.954354 NaN -1.788898  0.066991       NaN  0.489819  0.201808   
2023-03-13  1.018115 NaN -1.806163  0.079963       N