In [1]:
import math
from pathlib import Path

import pandas as pd
from joblib import dump, load
from sklearn.preprocessing import StandardScaler

from src.data_loader import DataLoader


In [8]:
# On-disk locations of everything this notebook caches or exports.
SCALER_PATH, TRAIN_STATS_PATH, EVAL_STATS_PATH = (
    Path("out/StandardScaler.joblib"),  # fitted StandardScaler, reused across runs
    Path("out/train_stats.csv"),  # destination of the df_train export
    Path("out/eval_stats.csv"),  # destination of the df_eval export
)

In [3]:
# Fit the feature scaler once and cache it on disk; later runs load the cached
# copy instead of streaming the training set again.
if not SCALER_PATH.exists():
    # Create the output directory before the fit so a missing folder fails
    # immediately rather than after every training batch has been processed.
    SCALER_PATH.parent.mkdir(parents=True, exist_ok=True)

    scaler = StandardScaler()
    for measurements in DataLoader.get_train_iter(batch_size=100):
        for ind, measurement in enumerate(measurements):
            # 1-based counter, consistent with the progress lines printed by
            # the statistics cells further down.
            print(f"\r{ind + 1:4} / {len(measurements)}", end="")
            # The scaler is updated one recording at a time.
            scaler.partial_fit(measurement.data)

    # Write to the very path that the existence check and the load branch use,
    # so changing SCALER_PATH cannot desynchronise the cache.
    dump(scaler, SCALER_PATH)
else:
    # NOTE(review): joblib.load unpickles the file — only load caches that this
    # notebook itself produced.
    scaler = load(SCALER_PATH)


In [None]:
# Build one feature row per training recording: the recording is scaled with
# the fitted scaler, summarised with DataFrame.describe(), and flattened into
# "<column>_<statistic>" entries plus the recording's age.
statistics = []
batch_size = 100
# Ceiling rather than round(): a trailing partial batch is still a batch, and
# round() under-counts it whenever the remainder is below half a batch.
# NOTE(review): assumes get_train_iter yields the final, smaller batch —
# confirm against DataLoader.
no_batches = math.ceil(DataLoader.train_size / batch_size)
# Pass the variable, not a second literal, so the displayed batch total and
# the real batching cannot drift apart.
for batch_no, measurements in enumerate(DataLoader.get_train_iter(batch_size=batch_size)):
    print(f"\rBatch: {batch_no + 1:2} / {no_batches}")
    for ind, measurement in enumerate(measurements):
        print(f"\r{ind + 1:4} / {len(measurements)}", end="")
        scaled = scaler.transform(measurement.data)
        # "count" is dropped from the describe() output; only the remaining
        # summary rows become features.
        stats = pd.DataFrame(scaled, columns=measurement.data.columns).describe().drop("count", axis=0)

        row = {}
        for column in stats.columns:
            # Strip the "EEG " marker from the column name for shorter keys.
            col = column.replace("EEG ", "")
            for index in stats.index:
                row[f"{col}_{index}"] = stats.loc[index, column]
        row["age"] = measurement.age

        statistics.append(row)



In [None]:
# Collect the per-recording rows into a frame, then display it with its
# columns in sorted order. The sorted view is display-only: reindex returns a
# new frame, so df_train itself keeps the original column order.
df_train = pd.DataFrame(data=statistics)
df_train.reindex(columns=sorted(df_train.columns))


In [None]:
# Correlation of every statistic column with age, largest value first.
(
    df_train
    .drop(columns="age")
    .apply(lambda feature: feature.corr(df_train["age"]))
    .sort_values(ascending=False)
)

In [None]:
# Persist the training statistics; the row index is not written to the file.
df_train.to_csv(path_or_buf=TRAIN_STATS_PATH, index=False)

In [5]:
# Build one feature row per evaluation recording: scale it with the fitted
# scaler, summarise it with describe() (minus "count"), and flatten the result
# into "<column>_<statistic>" entries plus the recording's age.
statistics = []
measurements = DataLoader.get_eval()
for position, measurement in enumerate(measurements, start=1):
    print(f"\r{position:4} / {len(measurements)}", end="")
    scaled = scaler.transform(measurement.data)
    summary = (
        pd.DataFrame(scaled, columns=measurement.data.columns)
        .describe()
        .drop("count", axis=0)
    )

    # Column-major traversal: all statistics of one column, then the next.
    features = {
        f"{col_name.replace('EEG ', '')}_{stat_name}": summary.loc[stat_name, col_name]
        for col_name in summary.columns
        for stat_name in summary.index
    }
    features["age"] = measurement.age

    statistics.append(features)

 126 / 126

In [6]:
# Collect the per-recording rows into a frame, then display it with its
# columns in sorted order. The sorted view is display-only: reindex returns a
# new frame, so df_eval itself keeps the original column order.
df_eval = pd.DataFrame(data=statistics)
df_eval.reindex(columns=sorted(df_eval.columns))

Unnamed: 0,A1-REF_25%,A1-REF_50%,A1-REF_75%,A1-REF_max,A1-REF_mean,A1-REF_min,A1-REF_std,A2-REF_25%,A2-REF_50%,A2-REF_75%,...,T5-REF_min,T5-REF_std,T6-REF_25%,T6-REF_50%,T6-REF_75%,T6-REF_max,T6-REF_mean,T6-REF_min,T6-REF_std,age
0,-0.073611,0.000301,0.074213,3.520887,-0.000814,-2.806162,0.142012,-0.070845,0.000927,0.070450,...,-2.675931,0.138241,-0.068453,0.004302,0.075627,2.717643,0.002177,-2.793210,0.128980,69
1,-0.131217,0.004647,0.138345,7.976235,-0.001869,-16.446118,0.260536,-0.133644,0.005410,0.142222,...,-3.317468,0.854470,-0.166891,0.005723,0.178337,46.745947,0.023909,-39.312037,1.626966,80
2,-0.064913,-0.004044,0.057908,0.856805,-0.004567,-2.156178,0.142120,-0.049540,-0.002439,0.045779,...,-2.578567,0.153548,-0.048484,0.000020,0.048524,0.766084,-0.004040,-2.748989,0.157041,78
3,-0.056222,-0.000781,0.056825,19.798832,0.005495,-13.723347,0.210680,-0.067479,0.000927,0.069333,...,-21.835256,1.291512,-0.075587,0.001451,0.079909,18.822217,-0.003408,-32.354648,0.308122,42
4,-0.047525,-0.000781,0.047038,0.735067,0.000549,-0.829027,0.088125,-0.052906,-0.000198,0.051386,...,-1.446213,0.088346,-0.049914,0.001451,0.054227,1.008611,0.002108,-0.873045,0.094335,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,-0.074700,0.007909,0.085083,35.608310,0.028657,-3.453981,0.902706,-0.080936,0.003168,0.085031,...,-36.731902,0.699244,-0.085572,0.005723,0.094176,46.122534,0.044168,-5.138493,1.273060,52
122,-0.052960,0.001391,0.055735,0.689421,0.000829,-0.861638,0.093744,-0.050657,0.000927,0.052511,...,-0.832691,0.092446,-0.068453,0.005723,0.078479,0.818869,0.003385,-0.714698,0.119837,81
123,-0.068175,0.001391,0.068778,15.355445,0.005151,-6.491967,0.246037,-0.124670,0.014383,0.153437,...,-8.792513,0.284061,-0.087002,0.005723,0.098458,46.165334,0.020391,-31.201976,0.676905,22
124,-0.084481,0.002474,0.087256,1.129630,0.000252,-0.718159,0.143586,-0.091033,0.002044,0.089514,...,-0.983410,0.168902,-0.112684,0.004302,0.119849,1.430867,0.002033,-1.219704,0.193617,21


In [7]:
# Correlation of every statistic column with age, largest value first.
(
    df_eval
    .drop(columns="age")
    .apply(lambda feature: feature.corr(df_eval["age"]))
    .sort_values(ascending=False)
)


EKG1-REF_50%     0.248368
FP2-REF_25%      0.226459
FP1-REF_25%      0.214732
EKG1-REF_25%     0.214275
O1-REF_mean      0.205297
                   ...   
EKG1-REF_mean   -0.133012
EKG1-REF_max    -0.163055
FP1-REF_75%     -0.204212
FP2-REF_75%     -0.216215
EKG1-REF_std    -0.232759
Length: 189, dtype: float64

In [9]:
# Persist the evaluation statistics; the row index is not written to the file.
df_eval.to_csv(path_or_buf=EVAL_STATS_PATH, index=False)