In [1]:
import pandas as pd
from pyod.models.ecod import ECOD

In [2]:
# Load the full CSV in one pass; the first CSV column becomes the row index.
# low_memory=False makes pandas read the file as a whole instead of in internal chunks.
data_raw = pd.read_csv(
    r"C:\Users\paulo\PycharmProjects\ETL-CVM\data\cvm_fi_2000_2022.csv",
    index_col=0,
    low_memory=False,
)
data_raw.shape

(57626454, 9)

In [3]:
# Keep only rows where every value column is non-negative and NR_COTST is strictly positive.
# The individual conditions are joined with " and " into a single query expression.
data_raw = data_raw.query(
    " and ".join(
        [
            "VL_TOTAL >= 0",
            "VL_QUOTA >= 0",
            "VL_PATRIM_LIQ >= 0",
            "NR_COTST > 0",
            "CAPTC_DIA >= 0",
            "RESG_DIA >= 0",
        ]
    )
)
data_raw.shape

(56860405, 9)

In [4]:
# Independent (deep) copy, so rows can later be removed from data_raw_filter
# while data_raw stays intact for before/after comparison.
data_raw_filter = data_raw.copy(deep=True)

In [5]:
# Summary statistics of the working copy before any outlier removal (baseline for comparison).
data_raw_filter.describe()

Unnamed: 0,VL_TOTAL,VL_QUOTA,VL_PATRIM_LIQ,CAPTC_DIA,RESG_DIA,NR_COTST
count,56860400.0,56860400.0,56860400.0,56860400.0,56860400.0,56860400.0
mean,355354900.0,208875.8,347616000.0,2178688.0,2124899.0,1151.052
std,30001380000.0,1486288000.0,2333124000.0,60244320.0,58379070.0,166378.0
min,0.0,0.0,0.0,0.0,0.0,1.0
25%,12658510.0,1.277843,12671110.0,0.0,0.0,1.0
50%,37203290.0,2.702882,37213130.0,0.0,0.0,2.0
75%,136915100.0,25.1503,136682800.0,0.0,0.0,17.0
max,215700300000000.0,11206620000000.0,3439999000000.0,111144800000.0,45973240000.0,1246559000.0


In [6]:
# Training frame for the detectors: remove the TP_FUNDO, CNPJ_FUNDO and DT_COMPTC
# columns, then discard any row that still has a missing value.
data_train_base = (
    data_raw
    .drop(["TP_FUNDO", "CNPJ_FUNDO", "DT_COMPTC"], axis=1)
    .dropna()
)

In [7]:
# (rows, columns) of the training frame after dropping the three columns and NaN rows.
data_train_base.shape

(56860405, 6)

In [8]:
# Detectors to run, keyed by a display label.
# contamination=0.005 asks ECOD to flag roughly 0.5% of the fitted rows as outliers.
classifiers = {"ECOD_0.5%": ECOD(contamination=0.005)}

In [9]:
# Remove univariate outliers column by column with every configured detector.
#
# For each detector the training frame starts again from data_train_base. Columns are
# processed sequentially, so each fit only sees rows that survived the previous columns.
# Every row flagged in any column is also removed from data_raw_filter.
#
# The frames are reassigned instead of being mutated with inplace=True, and labels that
# are already absent from data_raw_filter are ignored. This keeps the cell safe to
# re-run and lets `classifiers` hold more than one detector without raising KeyError.
for clf_name, clf in classifiers.items():
    data_train = data_train_base.copy()
    print(f"Initial data shape {data_train.shape}")
    for column in data_train.columns:
        print(f"Removing outliers from: {column}")
        # Fit on the single column, shaped as an (n_rows, 1) array.
        x_train = data_train[column].values.reshape(-1, 1)
        clf.fit(x_train)
        # labels_ holds one entry per fitted row; the value 1 marks an outlier.
        y_pred = clf.labels_
        # Compute the flagged index labels once and reuse them for both frames.
        outlier_index = data_train.index[y_pred == 1]
        # errors="ignore": these labels may already be gone (re-run or earlier detector).
        data_raw_filter = data_raw_filter.drop(outlier_index, errors="ignore")
        data_train = data_train.drop(outlier_index)
        print(f"removed {y_pred.sum()} rows")
        print(f"Data shape after {column} {data_train.shape}")


Initial data shape (56860405, 6)
Removing outliers from: VL_TOTAL
removed 284302 rows
Data shape after VL_TOTAL (56576103, 6)
Removing outliers from: VL_QUOTA
removed 282881 rows
Data shape after VL_QUOTA (56293222, 6)
Removing outliers from: VL_PATRIM_LIQ
removed 281467 rows
Data shape after VL_PATRIM_LIQ (56011755, 6)
Removing outliers from: CAPTC_DIA
removed 280059 rows
Data shape after CAPTC_DIA (55731696, 6)
Removing outliers from: RESG_DIA
removed 278659 rows
Data shape after RESG_DIA (55453037, 6)
Removing outliers from: NR_COTST
removed 277255 rows
Data shape after NR_COTST (55175782, 6)


In [10]:
# Shape of data_raw, which the outlier-removal loop does not modify (reference for comparison).
data_raw.shape

(56860405, 9)

In [11]:
# Shape of the filtered frame after the outlier-removal loop.
data_raw_filter.shape

(55175782, 9)

In [12]:
# Summary statistics of data_raw (no outlier removal applied), for side-by-side comparison.
data_raw.describe()

Unnamed: 0,VL_TOTAL,VL_QUOTA,VL_PATRIM_LIQ,CAPTC_DIA,RESG_DIA,NR_COTST
count,56860400.0,56860400.0,56860400.0,56860400.0,56860400.0,56860400.0
mean,355354900.0,208875.8,347616000.0,2178688.0,2124899.0,1151.052
std,30001380000.0,1486288000.0,2333124000.0,60244320.0,58379070.0,166378.0
min,0.0,0.0,0.0,0.0,0.0,1.0
25%,12658510.0,1.277843,12671110.0,0.0,0.0,1.0
50%,37203290.0,2.702882,37213130.0,0.0,0.0,2.0
75%,136915100.0,25.1503,136682800.0,0.0,0.0,17.0
max,215700300000000.0,11206620000000.0,3439999000000.0,111144800000.0,45973240000.0,1246559000.0


In [13]:
# Summary statistics of the filtered frame after the outlier-removal loop.
data_raw_filter.describe()

Unnamed: 0,VL_TOTAL,VL_QUOTA,VL_PATRIM_LIQ,CAPTC_DIA,RESG_DIA,NR_COTST
count,55175780.0,55175780.0,55175780.0,55175780.0,55175780.0,55175780.0
mean,189558100.0,6985.081,189101700.0,358866.9,292855.8,325.9201
std,534263700.0,1100524.0,532411900.0,2208511.0,1607225.0,1751.831
min,53830.12,0.3639899,0.42,0.0,0.0,1.0
25%,12562010.0,1.279173,12570930.0,0.0,0.0,1.0
50%,35871360.0,2.684378,35876920.0,0.0,0.0,2.0
75%,125247800.0,26.06957,125044100.0,0.0,0.0,14.0
max,16377100000.0,6377437000.0,7252777000.0,42927780.0,25063240.0,26084.0


In [15]:
# Destination of the filtered dataset. A plain raw string is sufficient here because
# the path contains no placeholders to interpolate.
path_to_save = r"C:\Users\paulo\PycharmProjects\ETL-CVM\data\cvm_fi_2000_2022_(ECOD_0.5%)_without_outliers_all_columns.parquet"

# Persist the filtered frame (all nine columns) as Parquet.
data_raw_filter.to_parquet(path_to_save)