In [24]:
import pandas as pd
import random

In [25]:
data = pd.read_csv('data/train.csv')
data.head()

Unnamed: 0,id,time,x,y
0,1,0,800,0
1,1,780,780,0
2,1,1572,792,0
3,1,2392,820,0
4,1,3196,804,0


In [26]:
from sklearn.neighbors import LocalOutlierFactor

def remove_noises(noise_data):
    lof = LocalOutlierFactor(n_neighbors=15, novelty=True)
    lof.fit(noise_data[['x']].to_numpy())
    outlier_predicted = lof.predict(noise_data[['x']].to_numpy())
    clear_data = noise_data[outlier_predicted == 1].copy()

    return clear_data

In [27]:
data.loc[data.y == 0] = remove_noises(data.loc[data.y == 0])
data.dropna(axis=0, inplace=True)

In [28]:
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()
data.loc[:, 'x'] = normalizer.fit_transform(data[['x']])

In [29]:
test = pd.DataFrame(columns=data.columns)
train = pd.DataFrame(columns=data.columns)
PART = 0.1
is_equal = False
ids = data.id.unique().tolist()
while not is_equal:
    test_ids = random.sample(list(ids), int(PART * len(ids)))
    train_ids = set(ids) - set(test_ids)

    test = data.loc[data.id.isin(test_ids)]
    train = data.loc[data.id.isin(train_ids)]

    if 0.99 < (test.x.mean() / train.x.mean()) < 1.01:
        is_equal = True

In [30]:
test.x.mean(), train.x.mean()

(0.1928944535419699, 0.1938151945128769)

In [31]:
test.to_csv('data/split_data/test.csv')
train.to_csv('data/split_data/train.csv')

In [32]:
import pandas as pd

train = pd.read_csv('data/split_data/train.csv', index_col=0)
test = pd.read_csv('data/split_data/test.csv', index_col=0)

In [33]:
PREDICT_PART = 50

from main_features.rr_features import StatFeatures

name_cols = ['tension_index_int',
             'mode_int',
             'std_int',
             'mean_int',
             'var_int',
             'pnn50_int',
             'RMSSD_int',
             'ivr_int',
             'vpr_int',
             'papr_int',
             'idm_int',
             'cat_int',
             'tension_index_user',
             'mode_user',
             'std_user',
             'mean_user',
             'var_user',
             'pnn50_user',
             'RMSSD_user',
             'ivr_user',
             'vpr_user',
             'papr_user',
             'idm_user',
             'cat_user',] + \
            [f'r-r-{i}' for i in range(PREDICT_PART)] + \
            [f'y{i}' for i in range(PREDICT_PART)]
df_train_stats = pd.DataFrame(columns=name_cols)
df_test_stats = pd.DataFrame(columns=name_cols)
df_train_stats.head()

Unnamed: 0,tension_index_int,mode_int,std_int,mean_int,var_int,pnn50_int,RMSSD_int,ivr_int,vpr_int,papr_int,...,y40,y41,y42,y43,y44,y45,y46,y47,y48,y49


In [35]:
from tqdm import tqdm

max_id = train.id.unique().max()
for id_user in train.id.unique():
    print(f'ID {id_user} из {max_id}')
    stats_for_id = StatFeatures(train.loc[train.id == id_user, 'x'], count_intervals=PREDICT_PART).get_statistic()
    for i in tqdm(range(train.loc[train.id == id_user].shape[0] - PREDICT_PART)):
        if train.iloc[i: i + PREDICT_PART, 2].max() == train.iloc[i: i + PREDICT_PART, 2].min():
            continue

        stats_for_interval = StatFeatures(train.iloc[i: i + PREDICT_PART, 2], count_intervals=PREDICT_PART).get_statistic()
        df_train_stats.loc[i, :] = stats_for_interval + stats_for_id + list(train.iloc[i: i + PREDICT_PART, 2].values) + list(
            train.iloc[i: i + PREDICT_PART, 3].values)

df_train_stats

100%|██████████| 1820/1820 [00:58<00:00, 31.14it/s]
100%|██████████| 176/176 [00:05<00:00, 33.13it/s]
100%|██████████| 55/55 [00:01<00:00, 32.41it/s]
100%|██████████| 85/85 [00:02<00:00, 35.28it/s]
100%|██████████| 139/139 [00:03<00:00, 36.76it/s]
100%|██████████| 118/118 [00:03<00:00, 33.51it/s]
100%|██████████| 65/65 [00:01<00:00, 35.45it/s]
100%|██████████| 96/96 [00:02<00:00, 36.58it/s]
100%|██████████| 103/103 [00:03<00:00, 34.28it/s]
100%|██████████| 214/214 [00:06<00:00, 34.78it/s]
100%|██████████| 268/268 [00:07<00:00, 35.91it/s]
100%|██████████| 170/170 [00:04<00:00, 34.69it/s]
100%|██████████| 175/175 [00:04<00:00, 36.55it/s]
100%|██████████| 219/219 [00:07<00:00, 31.06it/s]
100%|██████████| 375/375 [00:11<00:00, 32.83it/s]
100%|██████████| 905/905 [00:28<00:00, 32.18it/s]
100%|██████████| 731/731 [00:23<00:00, 30.97it/s]
100%|██████████| 890/890 [00:30<00:00, 28.78it/s]
100%|██████████| 332/332 [00:10<00:00, 32.56it/s]
100%|██████████| 286/286 [00:10<00:00, 26.56it/s]
100%|█

ID 1.0 из 275.0
ID 2.0 из 275.0
ID 3.0 из 275.0
ID 4.0 из 275.0
ID 5.0 из 275.0
ID 6.0 из 275.0
ID 7.0 из 275.0
ID 10.0 из 275.0
ID 12.0 из 275.0
ID 13.0 из 275.0
ID 14.0 из 275.0
ID 15.0 из 275.0
ID 16.0 из 275.0
ID 17.0 из 275.0
ID 19.0 из 275.0
ID 20.0 из 275.0
ID 21.0 из 275.0
ID 22.0 из 275.0
ID 24.0 из 275.0
ID 25.0 из 275.0
ID 27.0 из 275.0
ID 29.0 из 275.0
ID 30.0 из 275.0
ID 31.0 из 275.0
ID 32.0 из 275.0
ID 33.0 из 275.0
ID 34.0 из 275.0
ID 35.0 из 275.0
ID 36.0 из 275.0
ID 39.0 из 275.0
ID 40.0 из 275.0
ID 42.0 из 275.0
ID 43.0 из 275.0
ID 44.0 из 275.0
ID 45.0 из 275.0
ID 46.0 из 275.0
ID 47.0 из 275.0
ID 48.0 из 275.0
ID 49.0 из 275.0
ID 51.0 из 275.0
ID 52.0 из 275.0
ID 53.0 из 275.0
ID 55.0 из 275.0
ID 57.0 из 275.0
ID 58.0 из 275.0
ID 59.0 из 275.0
ID 60.0 из 275.0
ID 62.0 из 275.0
ID 63.0 из 275.0
ID 64.0 из 275.0
ID 65.0 из 275.0
ID 66.0 из 275.0
ID 67.0 из 275.0
ID 68.0 из 275.0
ID 69.0 из 275.0
ID 70.0 из 275.0
ID 71.0 из 275.0
ID 73.0 из 275.0
ID 74.0 из 275.0
ID 7

Unnamed: 0,tension_index_int,mode_int,std_int,mean_int,var_int,pnn50_int,RMSSD_int,ivr_int,vpr_int,papr_int,...,y40,y41,y42,y43,y44,y45,y46,y47,y48,y49
0,1.4648e+08,0.227959,0.0326258,0.243542,13.3964,0,0.038905,0.667826,20.9257,0.614147,...,0,0,0,0,0,0,0,0,0,0
1,1.4648e+08,0.227959,0.0328015,0.243854,13.4513,0,0.0391815,0.667826,20.9257,0.614147,...,0,0,0,0,0,0,0,0,0,0
2,1.4648e+08,0.227959,0.0330884,0.244401,13.5386,0,0.0392472,0.667826,20.9257,0.614147,...,0,0,0,0,0,0,0,0,0,0
3,1.4648e+08,0.227959,0.0333075,0.244818,13.605,0,0.0392661,0.667826,20.9257,0.614147,...,0,0,0,0,0,0,0,0,0,0
4,1.4648e+08,0.227959,0.0333494,0.244896,13.6178,0,0.0392185,0.667826,20.9257,0.614147,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3597,3.37556e+08,0.191682,0.0765829,0.210964,36.3015,0,0.10341,1.29407,14.6763,2.39981,...,0,0,0,0,1,1,1,1,1,1
3598,3.37556e+08,0.191682,0.0765965,0.210911,36.3169,0,0.1034,1.29407,14.6763,2.39981,...,0,0,0,1,1,1,1,1,1,1
3599,3.23186e+08,0.1915,0.0766174,0.210833,36.3403,0,0.103583,1.2378,14.6903,2.29765,...,0,0,1,1,1,1,1,1,1,1
3600,3.23186e+08,0.1915,0.0766174,0.210833,36.3403,0,0.0989891,1.2378,14.6903,2.29765,...,0,1,1,1,1,1,1,1,1,0


In [36]:
max_id = test.id.unique().max()
for id_user in test.id.unique():
    print(f'ID {id_user} из {max_id}')
    stats_for_id = StatFeatures(test.loc[test.id == id_user, 'x'], count_intervals=PREDICT_PART).get_statistic()
    for i in tqdm(range(test.loc[test.id == id_user].shape[0] - PREDICT_PART)):
        if test.iloc[i: i + PREDICT_PART, 2].max() == test.iloc[i: i + PREDICT_PART, 2].min():
            continue
        stats_for_interval = StatFeatures(test.iloc[i: i + PREDICT_PART, 2], count_intervals=PREDICT_PART).get_statistic()
        df_test_stats.loc[i, :] = stats_for_interval + stats_for_id + list(test.iloc[i: i + PREDICT_PART, 2].values) + list(
            test.iloc[i: i + PREDICT_PART, 3].values)

100%|██████████| 439/439 [00:13<00:00, 31.49it/s]
100%|██████████| 19/19 [00:00<00:00, 33.72it/s]
100%|██████████| 104/104 [00:03<00:00, 32.84it/s]
100%|██████████| 95/95 [00:02<00:00, 32.82it/s]
100%|██████████| 151/151 [00:05<00:00, 30.00it/s]
100%|██████████| 89/89 [00:02<00:00, 30.61it/s]
100%|██████████| 78/78 [00:02<00:00, 31.92it/s]
100%|██████████| 80/80 [00:02<00:00, 35.70it/s]
100%|██████████| 49/49 [00:01<00:00, 35.38it/s]
100%|██████████| 148/148 [00:04<00:00, 36.29it/s]
100%|██████████| 85/85 [00:02<00:00, 34.53it/s]
100%|██████████| 102/102 [00:02<00:00, 34.20it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 125/125 [00:04<00:00, 28.44it/s]
100%|██████████| 120/120 [00:03<00:00, 33.06it/s]
100%|██████████| 159/159 [00:04<00:00, 34.60it/s]
100%|██████████| 122/122 [00:03<00:00, 34.74it/s]
100%|██████████| 142/142 [00:04<00:00, 30.02it/s]
100%|██████████| 106/106 [00:03<00:00, 31.66it/s]
100%|██████████| 87/87 [00:02<00:00, 31.20it/s]
100%|██████████| 57/57 [00:0

ID 23.0 из 268.0
ID 38.0 из 268.0
ID 41.0 из 268.0
ID 54.0 из 268.0
ID 61.0 из 268.0
ID 72.0 из 268.0
ID 75.0 из 268.0
ID 85.0 из 268.0
ID 92.0 из 268.0
ID 115.0 из 268.0
ID 118.0 из 268.0
ID 119.0 из 268.0
ID 126.0 из 268.0
ID 133.0 из 268.0
ID 139.0 из 268.0
ID 140.0 из 268.0
ID 199.0 из 268.0
ID 204.0 из 268.0
ID 226.0 из 268.0
ID 263.0 из 268.0
ID 266.0 из 268.0
ID 268.0 из 268.0


In [38]:
df_test_stats.to_csv('data/split_data/test_stats.csv')
df_train_stats.to_csv('data/split_data/train_stats.csv')