In [4]:
import glob
import time 
import warnings
from os import listdir
from os.path import isfile, join
from pathlib import Path

import pandas as pd
from tqdm.notebook import trange, tqdm

warnings.filterwarnings('ignore')

In [5]:
# Short Russian weekday labels, indexed the way pandas numbers weekdays
# (0 = Monday ... 6 = Sunday).
_WEEKDAY_LABELS = ('Пн', 'Вт', 'Ср', 'Чт', 'Пт', 'Сб', 'Вс')
vocab = dict(enumerate(_WEEKDAY_LABELS))


In [16]:
# Hours subtracted from a row's timestamp in mode "1", keyed by that row's
# own day of week (0 = Monday ... 6 = Sunday).
_SHIFT_HOURS_BY_WEEKDAY = {0: 72, 1: 96, 2: 48, 3: 48, 4: 48, 5: 48, 6: 48}


def create_target_set(target_name, how_to_shift, hours = 48, min_year = 2021):
    """Load the Pin/Pout series of one target and put it on an hourly grid.

    Reads ``<target_name>P_in_out.csv`` (``;``-separated, decimal commas in
    ``Pin``/``Pout``), reindexes it onto an hourly ``DateTime`` grid running
    from the first to the last timestamp, forward-fills the gaps and, depending
    on ``how_to_shift``, moves the timestamps back in time.

    Parameters
    ----------
    target_name : str
        Prefix of the CSV file name; also appended (``_<target_name>``) to
        every output column except ``DateTime``.
    how_to_shift : str
        ``"1"`` - subtract a per-weekday number of hours
        (see ``_SHIFT_HOURS_BY_WEEKDAY``);
        ``"2"`` - subtract ``hours`` from every row and rename ``Pin``/``Pout``
        to ``Pin_lag_<hours>h``/``Pout_lag_<hours>h``;
        any other value - no shift and no row filtering.
    hours : int, default 48
        Size of the constant shift used by mode ``"2"``.
    min_year : int, default 2021
        In the shifted modes, rows whose shifted ``DateTime`` falls before
        this year are dropped.

    Returns
    -------
    pandas.DataFrame
        Hourly frame with ``DateTime``, the suffixed source columns and a
        ``dayofweek_<target_name>`` label column taken from the module-level
        ``vocab`` mapping.
    """
    raw = pd.read_csv(target_name + "P_in_out.csv", sep=";")
    raw["DateTime"] = pd.to_datetime(raw["DateTime"])
    for col in ("Pin", "Pout"):
        # Cast to str first so the decimal-comma fix also works when pandas
        # has already parsed the column as numeric (no .str accessor there).
        raw[col] = raw[col].astype(str).str.replace(",", ".", regex=False).astype("float32")

    # Build the hourly grid from start/end instead of a hand-computed (float)
    # number of periods.
    hourly_grid = pd.DataFrame(
        {"DateTime": pd.date_range(raw["DateTime"].min(), raw["DateTime"].max(), freq="h")}
    )
    # Fill the gaps so that the series has a strict one-hour step.
    target = pd.merge(hourly_grid, raw, how="left", on="DateTime").ffill()

    # The weekday is derived only after the grid is complete: computing it
    # before the forward-fill would give inserted rows the weekday of the
    # previous observation rather than of their own timestamp.
    target["dayofweek"] = target["DateTime"].dt.dayofweek

    shifted = how_to_shift in ("1", "2")
    if how_to_shift == "1":
        shift = pd.to_timedelta(target["dayofweek"].map(_SHIFT_HOURS_BY_WEEKDAY), unit="h")
        target["DateTime"] = target["DateTime"] - shift
    elif how_to_shift == "2":
        target["DateTime"] = target["DateTime"] - pd.DateOffset(hours=hours)
        target = target.rename(
            columns={"Pin": f"Pin_lag_{hours}h", "Pout": f"Pout_lag_{hours}h"}
        )

    if shifted:
        # The label must describe the shifted timestamp, not the original one.
        target["dayofweek"] = target["DateTime"].dt.dayofweek
    target["dayofweek"] = target["dayofweek"].map(vocab)

    target.columns = [c if c == "DateTime" else f"{c}_{target_name}" for c in target.columns]

    if shifted:
        # Shifting can push the first rows into the previous year; drop them.
        target = target.loc[~(target["DateTime"].dt.year < min_year)]
    return target
   
    

In [19]:
def _source_name(path):
    """Return the file name of ``path`` without its directory and extension."""
    return Path(path).stem


def _with_suffix(frame, suffix):
    """Append ``_<suffix>`` to every column of ``frame`` except ``DateTime``; return ``frame``."""
    frame.columns = [c if c == "DateTime" else f"{c}_{suffix}" for c in frame.columns]
    return frame


def prepare_dataset(target_name, how_to_shift_target,hours = 48):
    """Assemble the modelling table for one target.

    Starts from ``create_target_set(target_name, how_to_shift_target, hours)``
    and left-joins, on ``DateTime``:

    * ``<target_name>.csv`` (``;``-separated) with its column names unchanged;
    * every file found in ``ГРС/``;
    * every file found in ``Автоплан/`` whose timestamp column is called
      ``Date`` or ``Data``; files with neither column are skipped.

    Columns coming from the two directories get the source file name (without
    extension) as a suffix. No forward-fill is applied to the joined columns,
    so rows without a match stay NaN.

    Parameters
    ----------
    target_name : str
        Target identifier, passed to ``create_target_set`` and used as the
        stem of the schema CSV.
    how_to_shift_target : str
        Shift mode forwarded to ``create_target_set``.
    hours : int, default 48
        Shift size forwarded to ``create_target_set``.

    Returns
    -------
    pandas.DataFrame
    """
    # Sorted so that the resulting column order does not depend on the
    # directory listing order of the file system.
    grs_paths = sorted(glob.glob("ГРС/*"))
    auto_plan_paths = sorted(glob.glob("Автоплан/*"))

    dataset = create_target_set(target_name, how_to_shift_target, hours)

    schema_info = pd.read_csv(target_name + ".csv", sep=";")
    schema_info["DateTime"] = pd.to_datetime(schema_info["DateTime"])
    dataset = pd.merge(dataset, schema_info, how="left", on="DateTime")

    for path in tqdm(grs_paths):
        feature = pd.read_csv(path, sep=";")
        feature["DateTime"] = pd.to_datetime(feature["DateTime"])
        feature = _with_suffix(feature, _source_name(path))
        dataset = pd.merge(dataset, feature, how="left", on="DateTime")

    date_columns = ("Date", "Data")
    for path in tqdm(auto_plan_paths):
        feature = pd.read_csv(path, sep=";")
        # "Date" takes precedence over "Data" when a file has both.
        date_col = next((c for c in date_columns if c in feature.columns), None)
        if date_col is None:
            # No recognisable timestamp column: nothing to join on.
            continue
        feature["DateTime"] = pd.to_datetime(feature[date_col])
        feature = feature.drop(columns=[c for c in date_columns if c in feature.columns])
        feature = _with_suffix(feature, _source_name(path))
        dataset = pd.merge(dataset, feature, how="left", on="DateTime")

    return dataset
        
    

In [20]:
# Build the feature table for КС-15 using shift mode "2" with the default `hours`.
t_set = prepare_dataset(target_name="КС-15", how_to_shift_target="2")

  0%|          | 0/119 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [21]:
# Display the assembled frame as the cell output.
t_set

Unnamed: 0,DateTime,Object_КС-15,Pin_lag_48h_КС-15,Pout_lag_48h_КС-15,dayofweek_КС-15,Object,Tnv,GPAinwork,PNA,Schema1,...,82_КС-19 Факт по фидерам,83_КС-19 Факт по фидерам,84_КС-19 Факт по фидерам,85_КС-19 Факт по фидерам,86_КС-19 Факт по фидерам,87_КС-19 Факт по фидерам,88_КС-19 Факт по фидерам,89_КС-19 Факт по фидерам,ID_Кто планировал,Login_Кто планировал
0,2021-01-01 00:00:00,2010.0,36.500000,46.400002,Пт,2010.0,-9.0,2.0,-6,21.0,...,0,0,0,0,0,0,0,0,14.0,Немытых
1,2021-01-01 01:00:00,2010.0,36.500000,46.400002,Пт,,,,,,...,0,0,0,0,0,0,0,0,,
2,2021-01-01 02:00:00,2010.0,36.500000,46.700001,Пт,2010.0,-9.0,2.0,-6,21.0,...,0,0,0,0,0,0,0,0,,
3,2021-01-01 03:00:00,2010.0,36.500000,46.700001,Пт,,,,,,...,0,0,0,0,0,0,0,0,,
4,2021-01-01 04:00:00,2010.0,36.299999,46.799999,Пт,2010.0,-9.0,2.0,-6,21.0,...,0,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27150,2024-02-03 10:00:00,2010.0,46.900002,46.900002,Сб,,,,,,...,,,,,,,,,,
27151,2024-02-03 11:00:00,2010.0,46.900002,46.900002,Сб,,,,,,...,,,,,,,,,,
27152,2024-02-03 12:00:00,2010.0,46.700001,46.700001,Сб,,,,,,...,,,,,,,,,,
27153,2024-02-03 13:00:00,2010.0,46.700001,46.700001,Сб,,,,,,...,,,,,,,,,,


In [64]:
# Save the assembled dataset; index=False keeps the row index out of the sheet.
t_set.to_excel("КС-15_set_v1.xlsx", index=False)