In [617]:
import pandas as pd
import numpy as np

### dto

In [618]:
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional


@dataclass
class Segment:
    start_point: float
    end_point: float
    target_segments: List[str]


@dataclass
class SourceSettings:
    source: str
    type: str
    handler: str
    rolling_number: str
    encoding: str
    is_single: bool
    main_folder: str
    key_folder: str
    nested_folders: str
    filename_key: str
    interpolation_type: str
    interpolation: Optional[float]
    segments: Dict[str, Segment]
    forbidden_columns: List[str]
    filtration_methods: List[str]
    secondary_features: List[str]
    aggregation_methods: List[str]
    billet_column: str
    convert_columns: Dict[str, str]


@dataclass
class PipelineSetup:
    MIN_TIME: datetime    # ограничение файлов по минимальной дате
    MAX_TIME: datetime    # ограничение файлов по максимальной дате
    NUM_OF_CORES: int    # количество выделенных ядер ЦПУ
    MARK_FILTER: bool    # фильтровать или нет марки рельс
    MARK: List[str]    # марка рельса
    PATH_TO_MATERIALS: Optional[str]    # путь до вспомогательных материалов
    PATH_TO_METADATA: str    # пути до данных
    PATH_TO_RESULT: str    # путь до папки с результатом. Создавать самим
    METADATA_BILLET_ID: str    # имя столбца с ИД заготовки


@dataclass
class Materials:
    PATHS: dict


@dataclass
class CutPoint:
    L_DELTA_MIN: float
    L_DELTA_MAX: float
    L_WIN: int
    R_DELTA_MIN: float
    R_DELTA_MAX: float
    R_WIN: int


In [1154]:
import ast

def parse_settings(settings: pd.DataFrame) -> Dict[str, SourceSettings]:
    sources = {}
    for _, source_settings in settings.iteritems():
        source_settings = source_settings.fillna("")

        # Парсим сегменты
        segments = {}
        for segment_data in str(source_settings["segments"]).split("\n"):
            segment_id, start, end, target_ids = eval(segment_data)
            segments[f"{source_settings['source']}_{segment_id}"] = Segment(
                start_point=start, end_point=end, target_segments=target_ids
            )
        # Парсим специальные колонки
        convert_columns = {}
        if source_settings["convert_columns"] != '':
            for convert_col in str(source_settings["convert_columns"]
                                   ).split("\n"):
                colname, convert_type = eval(convert_col)
                convert_columns[colname] = convert_type

        # Парсим остальные настройки источника
        if source_settings['interpolation_type'] == 'by value':
            interp = float(source_settings['interpolation'])
        elif source_settings['interpolation_type'] == 'by source':
            interp = source_settings['interpolation']
        elif source_settings['interpolation_type'] == "":
            interp = None
        else:
            raise NameError(
                f"Unknown interpolation type for source "
                f"{source_settings['source']}"
            )

        sources[source_settings['source']] = SourceSettings(
            source=source_settings['source'],
            type=source_settings['type'],
            handler=source_settings['handler'],
            rolling_number=str(source_settings['rolling_number']),
            encoding=source_settings['encoding'],
            is_single=source_settings['is_single'],
            main_folder=source_settings['main_folder'],
            key_folder=source_settings['key_folder'],
            nested_folders=source_settings['nested_folders'],
            filename_key=source_settings['filename_key'],
            interpolation_type=source_settings['interpolation_type'],
            billet_column=source_settings['billet_column'],
            interpolation=interp,
            segments=segments,
            forbidden_columns=ast.literal_eval(
                source_settings['forbidden_columns']
            ),
            filtration_methods=ast.literal_eval(
                source_settings['filtration_methods']
            ),
            secondary_features=ast.literal_eval(
                source_settings['secondary_features']
            ),
            aggregation_methods=ast.literal_eval(
                source_settings['agg_methods']
            ),
            convert_columns=convert_columns
        )

    return sources


In [1155]:
settings = pd.read_excel(
    "exploration/agregator/run/settings.xlsx", index_col=[0], header=None
).iloc[:, :-1]
settings

Unnamed: 0_level_0,1,2,3,4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
source,LNK100,U0,WBF_PIRO,WBF_sgl
rolling_number,1,1,1,1
handler,BASE,BASE,WBFPIRO,WBFSINGLE
type,target,feature,feature,feature
encoding,UTF-8,UTF-8,UTF-8,ANSI
is_single,False,False,False,1
main_folder,\\ZSMK-9684-001\Data\2023,\\ZSMK-9684-001\Data\2023,\\ZSMK-9684-001\Data\2023,\\ZSMK-9684-001\Data\2023
key_folder,ipsrnk,U0,wbf,wbf
nested_folders,rail_points_files,rollings_points_files,billet_pirometer_files,
filename_key,,,,


In [1159]:
settings_parsed = parse_settings(settings)

In [1162]:
display(settings_parsed)

{'LNK100': SourceSettings(source='LNK100', type='target', handler='BASE', rolling_number='1', encoding='UTF-8', is_single=False, main_folder='\\\\ZSMK-9684-001\\Data\\2023', key_folder='ipsrnk', nested_folders='rail_points_files', filename_key='', interpolation_type='by value', interpolation=103.0, segments={'LNK100_1': Segment(start_point=0, end_point=3, target_segments=[])}, forbidden_columns=[], filtration_methods=['std'], secondary_features=['abs'], aggregation_methods=['tg', 'max', 'min', 'median'], billet_column='BilletPoints', convert_columns={}),
 'U0': SourceSettings(source='U0', type='feature', handler='BASE', rolling_number='1', encoding='UTF-8', is_single=False, main_folder='\\\\ZSMK-9684-001\\Data\\2023', key_folder='U0', nested_folders='rollings_points_files', filename_key='', interpolation_type='by value', interpolation=103.0, segments={'U0_1': Segment(start_point=0, end_point=3, target_segments=['LNK100_1']), 'U0_2': Segment(start_point=3, end_point=6, target_segments=[

### Transcription Class

In [619]:
from copy import copy
from typing import Dict, List

class Transcription:

    def __init__(
        self,
        workcenter: str = "",
        workunit: str = "",
        rolling_number: str = "",
        name: str = "",
        model: str = "",
        interpolation: str = "",
        sector_range: str = "",
        preprocessing: str = "",
        aggregation: str = "",
        secondary_functions: str = ""
    ):
        self.workcenter = workcenter
        self.workunit = workunit
        self.rolling_number = rolling_number
        self.name = name
        self.model = model
        self.interpolation = interpolation
        self.sector_range = sector_range
        self.preprocessing = preprocessing
        self.aggregation = aggregation
        self.secondary_functions = secondary_functions

    def __repr__(self):
        return str(self)

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        return str(self) == str(other)

    def __str__(self):
        return (
            f"{self._add_under(self.workcenter)}"
            f"{self._add_under(self.workunit)}"
            f"{self._add_under(self.rolling_number)}"
            f"{self._add_under(self.name)}"
            f"{self._add_under(self.model)}"
            f"{self._add_under(self.interpolation)}"
            f"{self._add_under(self.sector_range)}"
            f"{self._add_dot(self.preprocessing)}"
            f"{self._add_under(self.aggregation)}"
        )[:-1]

    def add_tags(self, tags: Dict[str, List[str]], replace: bool = False):
        new_transcription = copy(self)
        for tag_type, tags_values in tags.items():
            if replace:
                full_tag = tags_values[0]
                for tag in tags_values[1:]:
                    full_tag += new_transcription._add_under(tag)
                setattr(new_transcription, tag_type, full_tag)
            else:

                for tag in tags_values:
                    transcription_attr = getattr(new_transcription, tag_type)
                    setattr(
                        new_transcription,
                        tag_type,
                        f"{new_transcription._add_under(transcription_attr)}"
                        f"{tag}",
                    )
        return new_transcription

    @staticmethod
    def _add_under(string: str):
        return f"{string}_" if string != "" else ""

    @staticmethod
    def _add_dot(string: str):
        return f"{string}." if string != "" else ""

In [620]:
### Пример инициализации объекта класса `Transcription`:
transcription = Transcription(
    workcenter="mill",
    workunit="bd1",
    rolling_number="1",
    name="temp",
    model="l1",
    interpolation="i103.0",
    sector_range="h0.0-3.0",
    preprocessing="ni",
    aggregation="min",
    secondary_functions=""
)

print(transcription)

mill_bd1_1_temp_l1_i103.0_h0.0-3.0_ni.min


In [621]:
### Метод `__repr__` 

transcription.__repr__()

'mill_bd1_1_temp_l1_i103.0_h0.0-3.0_ni.min'

In [622]:
### Метод `__hash__`

transcription.__hash__()

7894055372663120742

In [623]:
### Метод `__str__`

transcription.__str__()

'mill_bd1_1_temp_l1_i103.0_h0.0-3.0_ni.min'

In [624]:
### Метод `add_tags`

new_transcription = transcription.add_tags(
    {
    "sector_range": ["t0.0-3.0"],
    "aggregation": ["max"]
}, replace=True)
print(new_transcription)

mill_bd1_1_temp_l1_i103.0_t0.0-3.0_ni.max


### Array Class

In [625]:
from copy import copy
from typing import Dict, List

class Array:

    def __init__(
        self,
        transcription: Transcription,
        values: np.array,
        is_numeric: bool,
        is_billet: bool,
    ):
        self.transcription = transcription
        self.is_numeric = is_numeric
        self.is_billet = is_billet
        self.values = values
        self.key = str(self.transcription)

    def __repr__(self):
        return (
            f"{str(self.transcription)}; "
            f"is_numeric={self.is_numeric}"
            f", is_billet={self.is_billet}"
        )

    def replace_values(self, new_values: np.array):
        self.values = new_values

    def append_keys_to_transcription(
        self, tags: Dict[str, List[str]], replace: bool = False
    ):
        new_transcription = self.transcription.add_tags(tags, replace)
        new_array = copy(self)
        new_array.transcription = new_transcription
        new_array.key = str(new_transcription)
        return new_array

In [626]:
transcription = Transcription(
    workcenter="mill",
    workunit="bd1",
    rolling_number="1",
    name="temp",
    model="l1",
    interpolation="i103.0",
    sector_range="h0.0-3.0",
    preprocessing="ni",
    aggregation="min",
    secondary_functions=""
)

values = np.array([559.94, 572.96, 582.03, 594.40, 606.77, 620.23, 620.23, 640.36, 1023.74, 1062.85])

array = Array(
    transcription=transcription,
    values=values,
    is_numeric=True,  # Предположим, что все значения являются числами
    is_billet=False   # Предположим, что это не заготовка
)

print(array)

mill_bd1_1_temp_l1_i103.0_h0.0-3.0_ni.min; is_numeric=True, is_billet=False


In [627]:
# Применяем метод append_keys_to_transcription
tags = {"aggregation": ["max"]}
new_array = array.append_keys_to_transcription(tags, replace=False)

print(new_array.key)

mill_bd1_1_temp_l1_i103.0_h0.0-3.0_ni.min_max


### FeaturesGenerator Class

In [628]:
from copy import copy

class FeaturesGenerator:

    def __init__(self):
        self.methods = {
            "abs": self.absolute,
            "norm": self.norm,
        }

    def get_method_values(self, method, *args, **kwargs):
        return self.methods[method](*args, **kwargs)

    @staticmethod
    def absolute(data_array: Array, *args, **kwargs) -> Array:
        """Модуль"""
        transcription = copy(data_array.transcription)
        transcription.add_tags({"preprocessing": ["abs"]}, )
        array = Array(
            transcription=transcription,
            values=abs(data_array.values),
            is_numeric=True,
            is_billet=False
        )
        return array

    @staticmethod
    def norm(data_array: Array, MIN, MAX, *args, **kwargs) -> Array:
        """Модуль"""
        transcription = copy(data_array.transcription)
        name = (
            data_array.transcription.workunit + "_"
            + data_array.transcription.name
        )
        transcription.add_tags({"preprocessing": ["norm"]})
        array = Array(
            transcription=transcription,
            values=np.array(
                [
                    (val - MIN[name]) / (MAX[name] - MIN[name])
                    for val in data_array.values
                ]
            ),
            is_numeric=True,
            is_billet=False
        )
        return array

In [629]:
# Создаем объект FeaturesGenerator
features_generator = FeaturesGenerator()

# Создаем массив данных
values = np.array([-559.94, 572.96, -582.03, 594.40, -606.77, 620.23, -640.36, 1023.74, -1062.85])
array = Array(
    transcription=transcription,
    values=values,
    is_numeric=True,
    is_billet=False
)

# Применяем метод absolute
result_array = features_generator.absolute(array)

result_array.values

array([ 559.94,  572.96,  582.03,  594.4 ,  606.77,  620.23,  640.36,
       1023.74, 1062.85])

In [630]:
# Создаем объект FeaturesGenerator
features_generator = FeaturesGenerator()

# Создаем массив данных
values = np.array([-559.94, 572.96, -582.03, 594.40, -606.77, 620.23, -640.36, 1023.74, -1062.85])
array = Array(
    transcription=transcription,
    values=values,
    is_numeric=True,
    is_billet=False
)

# Задаем значения MIN и MAX
MIN = {"bd1_temp": -1062.85}
MAX = {"bd1_temp": 1023.74}

# Применяем метод norm
result_array = features_generator.norm(array, MIN, MAX)

result_array.values

array([0.24102004, 0.78396331, 0.23043339, 0.79423845, 0.21857672,
       0.8066175 , 0.20247869, 1.        , 0.        ])

### SourceSettings Class

In [631]:
from exploration.agregator import constants
from exploration.agregator.dto import SourceSettings

class SourceDataset:    

    def __init__(self, settings: SourceSettings):
        self.settings = settings
        self.data = {}
        self.billet_key = str(
            Transcription(
                workunit=self.settings.source,
                rolling_number=self.settings.rolling_number,
                name=self.settings.billet_column,
                interpolation="ni"
            )
        )

    def __iter__(self):
        self._data_keys = list(self.data.keys())
        self._index = -1
        return self

    def __next__(self):
        if len(self._data_keys) != len(list(self.data.keys())):
            raise RuntimeError("object changed size during iterations")
        if self._index < len(self.data) - 1:
            self._index += 1
            return self.data[self._data_keys[self._index]]
        else:
            raise StopIteration

    def __getitem__(self, key: str):
        return self.data[key]

    def __repr__(self):
        return f"{self.settings.source}; " \
                f"billet='{self.settings.billet_column}'"

    def billet_array(self):
        """Возвращает Array, для которого is_billet = True"""
        return self.data[self.billet_key]

    def append_dataframe_to_source_data(self, data: pd.DataFrame):
        """Формирует из DataFrame набор данных в формате Dict[Array]"""
        for name, values in data.iteritems():
            values = values.to_numpy()
            is_numeric = self._is_numeric(values)
            transcription = Transcription(
                workunit=self.settings.source,
                rolling_number=self.settings.rolling_number,
                name=str(name),
                interpolation="ni"
            )
            is_billet = (
                True if str(transcription) == self.billet_key else False
            )
            array = Array(
                transcription=transcription,
                values=values,
                is_numeric=is_numeric,
                is_billet=is_billet
            )
            self.data[str(transcription)] = array

    def append_array_to_source_data(self, array: Array, old_key: str = None):
        """Добавляет новый Array"""
        self.data[array.key] = array
        if old_key == self.billet_key:
            self.billet_key = str(array.transcription)

    def replace_array_values(self, key: str, new_values: np.array):
        """Изменяет значения определнного Array"""
        self.data[key].replace_values(new_values)

    def remove_arrays(self, arrays: List[str]):
        for key in arrays:
            self.data.pop(key)

    def append_tags_to_array(
        self, key: str, tags: Dict[str, List[str]], replace: bool = False
    ):
        new_array = self.data[key].append_keys_to_transcription(tags, replace)
        self.remove_arrays([key])
        self.append_array_to_source_data(new_array, key)

    def append_tags_to_all_arrays(
        self, tags: Dict[str, List[str]], replace: bool = False
    ):
        init_keys = self.data.keys()
        for key in init_keys:
            new_array = self.data[key].append_keys_to_transcription(
                tags, replace
            )
            self.remove_arrays([key])
            self.append_array_to_source_data(new_array, key)

    def return_arrays_by_tags(self, tags: Dict[str, str]) -> List[Array]:
        arrays = []
        for array in self:
            for tag_key, tag_value in tags.items():
                if getattr(array.transcription, tag_key) != tag_value:
                    break
                arrays.append(array)
        return arrays

    @staticmethod
    def _is_numeric(values: np.array) -> bool:
        column_type = values.dtype
        is_numeric = False
        for parent_type in constants.AVAILABLE_TYPES:
            if np.issubdtype(column_type, parent_type):
                is_numeric = True
                break
        return is_numeric

In [780]:
# Создание объекта SourceSettings
settings = SourceSettings(
    source="LNK100",
    type="feature",
    handler="BASE",
    rolling_number="",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[]),
            "LNK100_2": Segment(start_point=3, end_point=6, target_segments=["LNK100_1"]),
            "LNK100_3": Segment(start_point=30, end_point=70, target_segments=["LNK100_1"]),
            "LNK100_4": Segment(start_point=-3, end_point="end", target_segments=["LNK100_1"]),
            "LNK100_5": Segment(start_point=-6, end_point=-3, target_segments=["LNK100_1"])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['tg', 'max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)
# Создание объекта SourceDataset
dataset = SourceDataset(settings)

In [633]:
iter(dataset)

LNK100; billet='BilletPoints'

In [634]:
repr(dataset)

"LNK100; billet='BilletPoints'"

In [635]:
df = pd.DataFrame({
    'BilletPoints': [0.025, 0.05, 0.075, 0.1, 0.125, 0.15, 0.175, 0.2, 0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.425, 0.45, 0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75, 0.775, 0.8, 0.825, 0.85, 0.875, 0.9, 0.925, 0.95, 0.975, 1],
    'Vert1000': [133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 133, 132, 131, 130, 130, 129, 129, 129, 129, 127, 126, 125, 126, 124],
    'Vert1500': [224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 224, 225, 226, 226, 225, 224, 223, 220, 218, 215, 213, 209, 205, 203, 200, 199, 197, 195, 194, 191],
    'Vert2000': [312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 312, 311, 310, 309, 306, 308, 305, 301, 297, 292, 289, 284, 278, 277, 275, 273, 271, 270, 268, 264],
    'Vert3000': [400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 400, 392, 383, 375, 367, 360, 353, 342, 332, 321, 314, 305, 298, 292, 285, 282, 277, 273, 268, 264],
    'Hor1000': [318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 318, 317, 322, 329, 327, 338, 349, 358, 368, 377, 386, 393, 398, 407, 411, 410, 412, 414, 419, 422],
    'Hor1500': [747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 747, 752, 759, 755, 761, 763, 766, 767, 768, 768, 767, 766, 762, 758, 751, 742, 735, 726, 720, 716],
    'Hor2000': [1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1133, 1125, 1108, 1096, 1078, 1062, 1046, 1031, 1018, 1000, 981, 965, 944, 932, 912, 889, 871, 854],
    'Torsion': [-698, -698, -698, -698, -698, -694, -670, -658, -641, -635, -631, -609, -589, -558, -524, -483, -443, -398, -361, -336, -287, -248, -222, -200, -184, -177, -181, -185, -184, -183, -183, -177, -166, -157, -141, -120, -88, -51, -12, 30]
})


In [636]:
dataset.append_dataframe_to_source_data(df)

In [637]:
display(dataset.data)

{'LNK100_BilletPoints_ni': LNK100_BilletPoints_ni; is_numeric=True, is_billet=True,
 'LNK100_Vert1000_ni': LNK100_Vert1000_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert1500_ni': LNK100_Vert1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert2000_ni': LNK100_Vert2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert3000_ni': LNK100_Vert3000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1000_ni': LNK100_Hor1000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1500_ni': LNK100_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor2000_ni': LNK100_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Torsion_ni': LNK100_Torsion_ni; is_numeric=True, is_billet=False}

In [638]:
dataset.billet_key

'LNK100_BilletPoints_ni'

In [639]:
dataset.data["LNK100_Vert1000_ni"].values

array([133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134,
       133, 132, 131, 130, 130, 129, 129, 129, 129, 127, 126, 125, 126,
       124], dtype=int64)

In [640]:
dataset.data["LNK100_Vert1000_ni"].transcription

LNK100_Vert1000_ni

In [641]:
# Создаем словарь тегов для добавления
tags = {
    "interpolation": ["i103.0"]
}

# Вызываем метод append_tags_to_array, передавая ключ массива и словарь тегов
dataset.append_tags_to_array("LNK100_Vert1000_ni", tags, replace=True)

In [642]:
dataset.data

{'LNK100_BilletPoints_ni': LNK100_BilletPoints_ni; is_numeric=True, is_billet=True,
 'LNK100_Vert1500_ni': LNK100_Vert1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert2000_ni': LNK100_Vert2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert3000_ni': LNK100_Vert3000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1000_ni': LNK100_Hor1000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1500_ni': LNK100_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor2000_ni': LNK100_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Torsion_ni': LNK100_Torsion_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert1000_i103.0': LNK100_Vert1000_i103.0; is_numeric=True, is_billet=False}

In [643]:
dataset.data

{'LNK100_BilletPoints_ni': LNK100_BilletPoints_ni; is_numeric=True, is_billet=True,
 'LNK100_Vert1500_ni': LNK100_Vert1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert2000_ni': LNK100_Vert2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert3000_ni': LNK100_Vert3000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1000_ni': LNK100_Hor1000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1500_ni': LNK100_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor2000_ni': LNK100_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Torsion_ni': LNK100_Torsion_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert1000_i103.0': LNK100_Vert1000_i103.0; is_numeric=True, is_billet=False}

In [644]:
# Cловарь с тегами, которые хотим проверить
tags_to_check = {
    "interpolation": "i103.0",
}

# Получаем список массивов, удовлетворяющих заданным тегам
arrays_with_tags = dataset.return_arrays_by_tags(tags_to_check)

# Выводим результат
for array in arrays_with_tags:
    print(array)

LNK100_Vert1000_i103.0; is_numeric=True, is_billet=False


In [645]:
arrays_with_tags[0].transcription

LNK100_Vert1000_i103.0

In [646]:
dataset.data[arrays_with_tags[0].transcription].values

array([133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, 133,
       133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134,
       133, 132, 131, 130, 130, 129, 129, 129, 129, 127, 126, 125, 126,
       124], dtype=int64)

### SecondaryFunctions Class

In [715]:
class SecondaryFunctions:

    def __init__(self):
        pass

    @staticmethod
    def sort_dataset_ascending_by_billet(
        source_data: SourceDataset
    ) -> SourceDataset:
        """Сортировка billet points по возрастанию"""
        new_sequence = sorted(
            enumerate(source_data.billet_array().values), key=lambda x: x[1]
        )
        new_sequence = [val[0] for val in new_sequence]
        for data_array in source_data:
            new_values = [data_array.values[index] for index in new_sequence]
            source_data.append_tags_to_array(
                data_array.key, {"secondary_functions": ["sortas"]}
            )
            source_data.replace_array_values(
                data_array.key, np.array(new_values)
            )
        return source_data

    @staticmethod
    def approximate_billet_by_bounds(
        source_data: SourceDataset
    ) -> SourceDataset:
        """Аппроксимация столбца Billet к
        необходимой, строго фиксированной размерности"""
        ub = source_data.settings.interpolation
        lb = 0
        max_billet = max(source_data.billet_array().values)
        new_billet_values = (
            source_data.billet_array().values * (ub - lb) / max_billet
        ) + lb
        source_data.replace_array_values(
            source_data.billet_key, new_billet_values
        )
        source_data.append_tags_to_all_arrays(
            {"interpolation": [f"[i_{lb:.1f}_{ub:.1f}]"]}, replace=True
        )
        return source_data

    @staticmethod
    def append_workcenter_to_transcription(
        source_data: SourceDataset, workcenters: Dict[str, str]
    ) -> SourceDataset:
        for data_array in source_data:
            tags = workcenters[data_array.transcription.workunit]
            source_data.append_tags_to_array(
                data_array.key, {"workcenter": [tags]}, replace=True
            )
        return source_data

    @staticmethod
    def cut_wbf_pirometer_signals(
        source_data: SourceDataset, piro_settings: Dict[str, Dict[str, str]]
    ):
        all_target_segments = list(
            set(
                [
                    target for seg in source_data.settings.segments.values()
                    for target in seg.target_segments
                ]
            )
        )
        all_cut_indexes = []
        for name, points in piro_settings["SETTINGS"].items():
            array = source_data.return_arrays_by_tags({"name": name})[0]

            # Ищем точки разделения
            cut_indexes = []
            point_idx = 0
            col_len = len(array.values)
            for idx in range(col_len):
                point_data = CutPoint(*points[point_idx])
                if (idx - point_data.L_WIN < 0
                        or idx + point_data.R_WIN >= col_len):
                    continue
                l_delta = (
                    array.values[idx] - array.values[idx - point_data.L_WIN]
                )
                r_delta = (
                    array.values[idx + point_data.R_WIN] - array.values[idx]
                )
                if (point_data.L_DELTA_MIN <= l_delta <= point_data.L_DELTA_MAX
                        and (point_data.R_DELTA_MIN <= r_delta <=
                                point_data.R_DELTA_MAX)):
                    cut_indexes.append(idx)
                    if point_idx + 1 < len(points):
                        point_idx += 1
                    else:
                        break
            print(cut_indexes)
            all_cut_indexes.extend(cut_indexes)
            for number, indexes in enumerate(zip(cut_indexes[:-1],
                                                    cut_indexes[1:])):
                source_data.settings.segments[f"{name}_{number}"] = Segment(
                    start_point=source_data.billet_array().values[indexes[0]],
                    end_point=source_data.billet_array().values[indexes[1]],
                    target_segments=all_target_segments
                )
        all_cut_indexes = sorted(all_cut_indexes)
        for number, indexes in enumerate(zip(all_cut_indexes[:-1],
                                                all_cut_indexes[1:])):
            source_data.settings.segments[f"ALL_PTS_{number}"] = Segment(
                start_point=source_data.billet_array().values[indexes[0]],
                end_point=source_data.billet_array().values[indexes[1]],
                target_segments=all_target_segments
            )

        print(all_cut_indexes)

        return source_data

    @staticmethod
    def convert_date_columns_to_numeric(
        source_data: SourceDataset, tags_list: List[Dict[str, str]]
    ) -> SourceDataset:
        for tag in tags_list:
            tag_arrays = source_data.return_arrays_by_tags(tag)
            for array in tag_arrays:
                date_values = [
                    (pd.Timestamp(value) - constants.BASE_TIME).total_seconds()
                    for value in array.values
                ]
                min_data = min(date_values)
                converted_values = [value - min_data for value in date_values]
                source_data.replace_array_values(
                    array.key, np.array(converted_values)
                )
                source_data.append_tags_to_array(
                    array.key, {"secondary_functions": ["converted"]}
                )
        return source_data

In [648]:
secondary_functions = SecondaryFunctions()

secondary_functions.sort_dataset_ascending_by_billet(dataset)

LNK100; billet='BilletPoints'

In [649]:
dataset.data["LNK100_BilletPoints_ni"].values

array([0.025, 0.05 , 0.075, 0.1  , 0.125, 0.15 , 0.175, 0.2  , 0.225,
       0.25 , 0.275, 0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425, 0.45 ,
       0.475, 0.5  , 0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 , 0.675,
       0.7  , 0.725, 0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875, 0.9  ,
       0.925, 0.95 , 0.975, 1.   ])

In [650]:
dataset.billet_array().values

array([0.025, 0.05 , 0.075, 0.1  , 0.125, 0.15 , 0.175, 0.2  , 0.225,
       0.25 , 0.275, 0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425, 0.45 ,
       0.475, 0.5  , 0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 , 0.675,
       0.7  , 0.725, 0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875, 0.9  ,
       0.925, 0.95 , 0.975, 1.   ])

In [651]:
dataset.data

{'LNK100_BilletPoints_ni': LNK100_BilletPoints_ni; is_numeric=True, is_billet=True,
 'LNK100_Vert1500_ni': LNK100_Vert1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert2000_ni': LNK100_Vert2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert3000_ni': LNK100_Vert3000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1000_ni': LNK100_Hor1000_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor1500_ni': LNK100_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor2000_ni': LNK100_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Torsion_ni': LNK100_Torsion_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert1000_i103.0': LNK100_Vert1000_i103.0; is_numeric=True, is_billet=False}

In [652]:
secondary_functions.approximate_billet_by_bounds(dataset)

LNK100; billet='BilletPoints'

In [653]:
dataset.data

{'LNK100_Hor1500_ni': LNK100_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_Hor2000_ni': LNK100_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_Torsion_ni': LNK100_Torsion_ni; is_numeric=True, is_billet=False,
 'LNK100_Vert1000_i103.0': LNK100_Vert1000_i103.0; is_numeric=True, is_billet=False,
 'LNK100_BilletPoints_[i_0.0_103.0]': LNK100_BilletPoints_[i_0.0_103.0]; is_numeric=True, is_billet=True,
 'LNK100_Vert1500_[i_0.0_103.0]': LNK100_Vert1500_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 'LNK100_Vert2000_[i_0.0_103.0]': LNK100_Vert2000_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 'LNK100_Vert3000_[i_0.0_103.0]': LNK100_Vert3000_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 'LNK100_Hor1000_[i_0.0_103.0]': LNK100_Hor1000_[i_0.0_103.0]; is_numeric=True, is_billet=False}

In [654]:
tags_to_check = {
"interpolation": "[i_0.0_103.0]",
}

# Получаем список массивов, удовлетворяющих заданным тегам
arrays_with_tags = dataset.return_arrays_by_tags(tags_to_check)
arrays_with_tags

[LNK100_BilletPoints_[i_0.0_103.0]; is_numeric=True, is_billet=True,
 LNK100_Vert1500_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 LNK100_Vert2000_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 LNK100_Vert3000_[i_0.0_103.0]; is_numeric=True, is_billet=False,
 LNK100_Hor1000_[i_0.0_103.0]; is_numeric=True, is_billet=False]

In [655]:
secondary_functions.append_workcenter_to_transcription(dataset, {"LNK100": "mill"})

LNK100; billet='BilletPoints'

In [656]:
dataset.data["mill_LNK100_Hor1500_ni"].transcription.workcenter

'mill'

In [657]:
dataset.settings.segments

{'LNK100_1': Segment(start_point=0, end_point=3, target_segments=[]),
 'LNK100_2': Segment(start_point=3, end_point=6, target_segments=['LNK100_1']),
 'LNK100_3': Segment(start_point=30, end_point=70, target_segments=['LNK100_1']),
 'LNK100_4': Segment(start_point=-3, end_point='end', target_segments=['LNK100_1']),
 'LNK100_5': Segment(start_point=-6, end_point=-3, target_segments=['LNK100_1'])}

#### WBF_PIRO_CUTTER

In [770]:
# Создание объекта SourceSettings
settings = SourceSettings(
    source="WBF_PIRO",
    type="feature",
    handler="WBFPIRO",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="billet_pirometer_files",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_PIRO_1": Segment(start_point=0, end_point="end", target_segments=["LNK100_1"])
        },
    forbidden_columns=["BEAM_VERT_ENC", "BEAM_HOR_ENC_POS", "DM_ENC_HOR_POS", "DML_VERT_POS", "DMR_VERT_POS"],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=['max', 'min','median','mean'],
    billet_column="moment",
    convert_columns={}
)
# Создание объекта SourceDataset
piroset = SourceDataset(settings)

In [771]:
file_path = 'testing/20240307170148_XX-Л210444041_WBF_1_T2.csv'
df = pd.read_csv(file_path, sep=';')
for col in df.drop("moment", axis=1).columns:
    df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce")
df["moment"] = pd.to_datetime(df["moment"])
df

Unnamed: 0,moment,TEMP_PIR3,TEMP_PIR1,TEMP_PIR2,BEAM_VERT_ENC,BEAM_HOR_ENC_POS,DM_ENC_HOR_POS,DML_VERT_POS,DMR_VERT_POS
0,2024-03-07 23:59:24.507,1122.627319,1107.378418,1132.494263,45.315788,5.919983,115.760651,231.720001,256.759979
1,2024-03-07 23:59:24.557,1122.627319,1107.378418,1132.494263,45.842106,5.919983,115.760651,228.500000,253.659988
2,2024-03-07 23:59:24.607,1122.627319,1107.378418,1132.494263,46.894737,5.919983,115.760651,224.399994,249.739990
3,2024-03-07 23:59:24.657,1122.627319,1107.378418,1132.494263,47.421055,5.919983,115.760651,220.339996,245.860001
4,2024-03-07 23:59:24.707,1122.771973,1107.378418,1132.494263,47.947369,5.919983,115.760651,216.339996,242.059998
...,...,...,...,...,...,...,...,...,...
2368,2024-03-08 00:01:38.807,1107.320557,1083.940918,1092.361084,100.578949,6.239990,112.552254,407.559998,385.239990
2369,2024-03-08 00:01:38.857,1107.320557,1083.940918,1092.361084,100.578949,6.239990,111.065437,403.579987,381.199982
2370,2024-03-08 00:01:38.907,1107.320557,1084.288208,1090.798706,100.578949,6.239990,110.070496,398.500000,376.019989
2371,2024-03-08 00:01:38.957,1107.320557,1084.288208,1090.798706,100.578949,6.239990,109.679230,394.359985,371.899994


In [772]:
piroset.append_dataframe_to_source_data(df)

In [773]:
import json

# Указываем путь к файлу JSON
file_path = "exploration/agregator/run/materials/wbf_piro_cutter_settings.json"

# Открываем файл и загружаем его содержимое в переменную
with open(file_path, "r") as file:
    piro_settings = json.load(file)

In [774]:
new_piroset = SecondaryFunctions().cut_wbf_pirometer_signals(piroset, piro_settings)

[153, 469, 752, 1152, 1706]
[495, 773, 1172, 1596]
[1892, 2067]
[77, 1772, 1816, 2098, 2183, 2326]
[100, 1772, 1820, 2095, 2186, 2322]
[77, 100, 153, 469, 495, 752, 773, 1152, 1172, 1596, 1706, 1772, 1772, 1816, 1820, 1892, 2067, 2095, 2098, 2183, 2186, 2322, 2326]


In [775]:
new_piroset.settings.segments

{'WBF_PIRO_1': Segment(start_point=0, end_point='end', target_segments=['LNK100_1']),
 'BEAM_VERT_ENC_0': Segment(start_point=numpy.datetime64('2024-03-07T23:59:33.107000000'), end_point=numpy.datetime64('2024-03-07T23:59:53.057000000'), target_segments=['LNK100_1']),
 'BEAM_VERT_ENC_1': Segment(start_point=numpy.datetime64('2024-03-07T23:59:53.057000000'), end_point=numpy.datetime64('2024-03-08T00:00:08.907000000'), target_segments=['LNK100_1']),
 'BEAM_VERT_ENC_2': Segment(start_point=numpy.datetime64('2024-03-08T00:00:08.907000000'), end_point=numpy.datetime64('2024-03-08T00:00:30.957000000'), target_segments=['LNK100_1']),
 'BEAM_VERT_ENC_3': Segment(start_point=numpy.datetime64('2024-03-08T00:00:30.957000000'), end_point=numpy.datetime64('2024-03-08T00:01:04.107000000'), target_segments=['LNK100_1']),
 'BEAM_HOR_ENC_POS_0': Segment(start_point=numpy.datetime64('2024-03-07T23:59:55.457000000'), end_point=numpy.datetime64('2024-03-08T00:00:10.207000000'), target_segments=['LNK100_1'

In [767]:
some = SecondaryFunctions().convert_date_columns_to_numeric(piroset, [{"name": "moment"}])

In [776]:
piroset.data["WBF_PIRO_1_moment_ni"].values

array(['2024-03-07T23:59:24.507000000', '2024-03-07T23:59:24.557000000',
       '2024-03-07T23:59:24.607000000', ...,
       '2024-03-08T00:01:38.907000000', '2024-03-08T00:01:38.957000000',
       '2024-03-08T00:01:39.007000000'], dtype='datetime64[ns]')

In [777]:
some.data["WBF_PIRO_1_moment_ni"].values

array([0.00000000e+00, 5.00000715e-02, 1.00000024e-01, ...,
       1.34400000e+02, 1.34450000e+02, 1.34500000e+02])

### SegmentsAggregator Class

In [845]:
class AggregatedValue:

    def __init__(
        self,
        segment_id: str,
        transcription: Transcription,
        value: Optional[float],
        is_bad: bool = False,
        bad_reason: str = None
    ):
        self.segment_id = segment_id
        self.transcription = transcription
        self.value = value
        self.is_bad = is_bad
        self.bad_reason = bad_reason

    def __repr__(self):
        return f"{str(self.transcription)}, value={self.value}"


class AggregatedSegment(dict):

    def __init__(
        self,
        name: str,
        start_point: float,
        end_point: float,
        values: Dict[str, AggregatedValue] = ()
    ):
        self._name = name
        self._start_point = start_point
        self._end_point = end_point
        super(AggregatedSegment, self).__init__(values)

    def append_value(self, values: List[AggregatedValue]):
        for value in values:
            self[str(value.transcription)] = value

    def name(self):
        return self._name

    def start_point(self):
        return self._start_point

    def end_point(self):
        return self._end_point


class AggregatedSourceDict(dict):

    def __init__(
        self,
        settings: SourceSettings,
        segments: Dict[str, AggregatedSegment] = ()
    ):
        super(AggregatedSourceDict, self).__init__(segments)
        self._source = settings.source
        self._is_target = True if settings.type == 'target' else False
        self._settings = settings

    def __repr__(self):
        count_bad = sum(
            [
                1 for seg_vals in self.values() for value in seg_vals.values()
                if value.is_bad
            ]
        )
        count_values = sum(len(seg_vals) for seg_vals in self.values())
        return f"AggregatedSource(total_values={count_values}, " \
               f"bad_values={count_bad})"

    def append_segment(self, segment: AggregatedSegment):
        self[segment.name()] = segment

    def source(self):
        return self._source

    def settings(self):
        return self._settings

    def is_target(self):
        return self._is_target

class SegmentsAggregator:

    def __init__(self):
        self.methods = {
            "median": self.median_aggregate,
            "min": self.min_aggregate,
            "max": self.max_aggregate,
            "mean": self.mean_aggregate,
            "tg": self.tg_aggregate,
        }

    @staticmethod
    def return_segment_values(
        segment: Segment, billet_array: Array, data_array: Array
    ) -> np.array:
        # Начальная точка
        if segment.start_point < 0:
            start_point = max(billet_array.values) + segment.start_point
        else:
            start_point = segment.start_point

        # Конечная точка
        if segment.end_point == "end":
            end_point = max(billet_array.values)
        elif segment.end_point < 0:
            end_point = max(billet_array.values) + segment.end_point
        else:
            end_point = segment.end_point

        # Сегментация
        points = billet_array.values[(billet_array.values < 3)
                                     & (billet_array.values > 0)]
        segment_data = [
            value for bil, value in zip(billet_array.values, data_array.values)
            if end_point >= bil >= start_point
        ]
        return np.array(points), np.array(segment_data)

    def get_method_value(
        self,
        segment_id: str,
        transcription: Transcription,
        method: str,
        *args,
        **kwargs,
    ) -> AggregatedValue:
        return AggregatedValue(
            segment_id, transcription, self.methods[method](*args, **kwargs)
        )

    @staticmethod
    def median_aggregate(data_array: np.array):
        return np.median(data_array)

    @staticmethod
    def max_aggregate(data_array: np.array):
        return np.max(data_array)

    @staticmethod
    def min_aggregate(data_array: np.array):
        return np.min(data_array)

    @staticmethod
    def mean_aggregate(data_array: np.array):
        return np.mean(data_array)

    @staticmethod
    def tg_aggregate(data_array: np.array, points: np.array):
        extended_points = np.c_[points, np.ones(len(points))]
        tg, _ = np.linalg.lstsq(extended_points, data_array, rcond=None)[0]

        return tg

    def _rotate(self, data_array: np.array, points: np.array):
        angle = np.arctan(self.tg_aggregate(data_array, points))

        rotation_matrix = np.array(
            [[np.cos(angle), -np.sin(angle)], [np.sin(angle),
                                               np.cos(angle)]]
        )

        # Смещаем график к началу координат
        x_centered = points - np.mean(points)
        y_centered = data_array - np.mean(data_array)

        # Поворачиваем график
        xy_rotated = np.dot(
            rotation_matrix, np.vstack([x_centered, y_centered])
        )

        # Возвращаем график в исходное положение
        rotated_points = xy_rotated[0] + np.mean(points)
        rotated_array = xy_rotated[1] + np.mean(data_array)

        return rotated_points, rotated_array

In [818]:
# Создание объекта SourceSettings
settings = SourceSettings(
    source="LNK100",
    type="feature",
    handler="BASE",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[]),
            "LNK100_2": Segment(start_point=3, end_point=6, target_segments=["LNK100_1"]),
            "LNK100_3": Segment(start_point=30, end_point=70, target_segments=["LNK100_1"]),
            "LNK100_4": Segment(start_point=-3, end_point="end", target_segments=["LNK100_1"]),
            "LNK100_5": Segment(start_point=-6, end_point=-3, target_segments=["LNK100_1"])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['tg', 'max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)
# Создание объекта SourceDataset
dataset = SourceDataset(settings)

file_path = "testing/20240308000407_21046X404_IPSRNK_1_L.csv"
df = pd.read_csv(file_path, sep=';')
for col in df.columns:
    if col == "BilletPoints":
        df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce").fillna(0).astype(float)
    else:
        df[col] = pd.to_numeric(df[col].replace(",", "."), errors="coerce").fillna(0).astype(float)

dataset.append_dataframe_to_source_data(df)

In [831]:
pd.DataFrame(dataset.data["LNK100_1_Vert1000_ni"].values, columns={"Vert1000"})

Unnamed: 0,Vert1000
0,181.0
1,181.0
2,181.0
3,181.0
4,181.0
...,...
4182,0.0
4183,0.0
4184,0.0
4185,0.0


In [897]:
points, seg_data = SegmentsAggregator().return_segment_values(dataset.settings.segments["LNK100_1"], dataset.billet_array(), dataset.data["LNK100_1_Vert1000_ni"])

In [902]:
points = [0.025, 0.05 , 0.075, 0.1  , 0.125, 0.15 , 0.175, 0.2  , 0.225,
       0.25 , 0.275, 0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425, 0.45 ,
       0.475, 0.5  , 0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 , 0.675,
       0.7  , 0.725, 0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875, 0.9  ,
       0.925, 0.95 , 0.975, 1.   , 1.025, 1.05 , 1.075, 1.1  , 1.125,
       1.15 , 1.175, 1.2  , 1.225, 1.25 , 1.275, 1.3  , 1.325, 1.35 ,
       1.375, 1.4  , 1.425, 1.45 , 1.475, 1.5  , 1.525, 1.55 , 1.575,
       1.6  , 1.625, 1.65 , 1.675, 1.7  , 1.725, 1.75 , 1.775, 1.8  ,
       1.825, 1.85 , 1.875, 1.9  , 1.925, 1.95 , 1.975, 2.   , 2.025,
       2.05 , 2.075, 2.1  , 2.125, 2.15 , 2.175, 2.2  , 2.225, 2.25 ,
       2.275, 2.3  , 2.325, 2.35 , 2.375, 2.4  , 2.425, 2.45 , 2.475,
       2.5  , 2.525, 2.55 , 2.575, 2.6  , 2.625, 2.65 , 2.675, 2.7  ,
       2.725, 2.75 , 2.775, 2.8  , 2.825, 2.85 , 2.875, 2.9  , 2.925,
       2.95 , 2.975]
len(points)

119

In [903]:
seg_data = [181., 181., 181., 181., 181., 181., 181., 181., 181., 181.,
       181., 181., 181., 181., 181., 181., 181., 181., 181., 184., 187.,
       189., 191., 192., 193., 195., 195., 194., 194., 194., 194., 194.,
       194., 191., 189., 185., 183., 179., 177., 175., 172., 171., 170.,
       168., 168., 167., 167., 168., 167., 167., 167., 166., 163., 160.,
       158., 155., 153., 148., 143., 139., 134., 131., 129., 126., 123.,
       121., 119., 118., 118., 118., 118., 120., 121., 125., 128., 131.,
       134., 135., 134., 133., 134., 133., 135., 134., 135., 136., 135.,
       135., 136., 136., 135., 135., 136., 137., 135., 134., 133., 135.,
       136., 136., 136., 137., 137., 137., 137., 135., 134., 133., 133.,
       131., 131., 128., 126., 122., 118., 116., 114., 114., 115.]
len(seg_data)

119

In [904]:
SegmentsAggregator()._rotate(seg_data, points)

(array([ 28.67729317,  28.67823394,  28.6791747 ,  28.68011547,
         28.68105624,  28.68199701,  28.68293778,  28.68387855,
         28.68481932,  28.68576009,  28.68670085,  28.68764162,
         28.68858239,  28.68952316,  28.69046393,  28.6914047 ,
         28.69234547,  28.69328624,  28.694227  ,  31.69304291,
         34.69185882,  36.69138301,  38.6909072 ,  39.69113969,
         40.69137217,  42.69089636,  42.69183713,  41.69348619,
         41.69442695,  41.69536772,  41.69630849,  41.69724926,
         41.69819003,  38.70125566,  36.703613  ,  32.70738692,
         30.70974427,  26.71351818,  24.71587553,  22.71823287,
         19.7212985 ,  18.72294756,  17.72459661,  15.72695396,
         15.72789473,  14.72954378,  14.73048455,  15.73071703,
         14.73236609,  14.73330686,  14.73424763,  13.73589668,
         10.73896231,   7.74202794,   5.74438529,   2.74745092,
          0.74980826,  -4.24570953,  -9.24122733, -13.23745341,
        -18.2329712 , -21.22990557, -23.

In [793]:
dataset.settings.segments

{'LNK100_1': Segment(start_point=0, end_point=3, target_segments=[]),
 'LNK100_2': Segment(start_point=3, end_point=6, target_segments=['LNK100_1']),
 'LNK100_3': Segment(start_point=30, end_point=70, target_segments=['LNK100_1']),
 'LNK100_4': Segment(start_point=-3, end_point='end', target_segments=['LNK100_1']),
 'LNK100_5': Segment(start_point=-6, end_point=-3, target_segments=['LNK100_1'])}

In [854]:
dataset.data

{'LNK100_1_BilletPoints_ni': LNK100_1_BilletPoints_ni; is_numeric=True, is_billet=True,
 'LNK100_1_Vert1000_ni': LNK100_1_Vert1000_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Vert1500_ni': LNK100_1_Vert1500_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Vert2000_ni': LNK100_1_Vert2000_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Vert3000_ni': LNK100_1_Vert3000_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Hor1000_ni': LNK100_1_Hor1000_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Hor1500_ni': LNK100_1_Hor1500_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Hor2000_ni': LNK100_1_Hor2000_ni; is_numeric=True, is_billet=False,
 'LNK100_1_Torsion_ni': LNK100_1_Torsion_ni; is_numeric=True, is_billet=False}

In [846]:
seg = SegmentsAggregator()

In [860]:
some = np.array(dataset.data["LNK100_1_Hor1000_ni"].values)

In [864]:
np.median(some)

90.0

In [869]:
agg_median = AggregatedValue("some", dataset.data["LNK100_1_Hor1000_ni"].transcription, value=0)

agg_median = seg.get_method_value("some", dataset.data["LNK100_1_Hor1000_ni"].transcription, seg.median_aggregate(dataset.data["LNK100_1_Hor1000_ni"].values))

KeyError: 90.0

In [874]:
dataset.settings.segments["LNK100_1"].start_point

0

In [878]:
agg_seg = AggregatedSegment(
    "LNK100_1_Hor1000_ni_median",
    dataset.settings.segments["LNK100_1"].start_point,
    dataset.settings.segments["LNK100_1"].end_point,
)

agg_seg.append_value([seg.get_method_value("some",
                     dataset.data["LNK100_1_Hor1000_ni"].transcription,
                     seg.median_aggregate(dataset.data["LNK100_1_Hor1000_ni"].values))])

KeyError: 90.0

### Fliters Class

In [908]:
class Filters:

    def __init__(self):
        self.methods = {
            "std": self.remove_by_std,
            "forbidden_columns": self.remove_by_forbidden_columns
        }

    def filter_by(self, method: str, *args, **kwargs):
        return self.methods[method](*args, **kwargs)

    @staticmethod
    def remove_by_std(data_array: Array, *args, **kwargs):
        if data_array.is_numeric:
            return True if data_array.values.std() == 0 else False
        else:
            return False

    @staticmethod
    def remove_by_forbidden_columns(
        data_array: Array,
        source_data: SourceDataset,
        *args,
        **kwargs,
    ):
        if (data_array.transcription.name
                in source_data.settings.forbidden_columns):
            return True
        else:
            return False

### HandlerFactory Class

In [910]:
from abc import ABC, abstractmethod

class SourceHandler(ABC):
    source: str

    @abstractmethod
    def __init__(
        self, features_generator: FeaturesGenerator, filters: Filters,
        secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator
    ):
        self.segments_aggregator = segments_aggregator
        self.secondary_functions = secondary_functions
        self.feature_generator = features_generator
        self.filters = filters

    @abstractmethod
    def process_pipeline(
        self, billet_id: str, data: pd.DataFrame, settings: SourceSettings
    ):
        ...

In [911]:
class HandlersFactory:

    def __init__(
        self, features_generator: FeaturesGenerator, filters: Filters,
        materials: Materials, secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator
    ):
        handler_setup = {
            "features_generator": features_generator,
            "filters": filters,
            "secondary_functions": secondary_functions,
            "segments_aggregator": segments_aggregator,
            "materials": materials
        }
        self.handlers_dict = {
            BASEHandler.source: BASEHandler(**handler_setup),
            BASEWITHPOINTSHandler.source: BASEHandler(**handler_setup),
            WBFPIROHandler.source: WBFPIROHandler(**handler_setup),
            WBFSINGLEHandler.source: WBFSINGLEHandler(**handler_setup)
        }

    def get_handler(self, source: str) -> SourceHandler:
        return self.handlers_dict[source]

### BASEHandler Class

In [912]:
class BASEHandler(SourceHandler):
    source = "BASE"

    def __init__(
        self,
        features_generator: FeaturesGenerator,
        filters: Filters,
        materials: Materials,
        secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator,
    ):
        self.materials = materials
        self.segments_aggregator = segments_aggregator
        self.secondary_functions = secondary_functions
        self.feature_generator = features_generator
        self.filters = filters

    def process_pipeline(
        self, billet_id: str, data: pd.DataFrame, settings: SourceSettings
    ) -> AggregatedSourceDict:
        source_data = SourceDataset(settings)

        # Подготовительный пайплайн
        source_data.append_dataframe_to_source_data(data)
        source_data = (
            self.secondary_functions.
            sort_dataset_ascending_by_billet(source_data)
        )
        source_data = self.secondary_functions.approximate_billet_by_bounds(
            source_data
        )
        with open(self.materials.PATHS['workcenters']) as handle:
            workcenters = json.load(handle)
        source_data = (
            self.secondary_functions.append_workcenter_to_transcription(
                source_data, workcenters
            )
        )

        # Базовый пайплайн
        source_data = self.filter_data(source_data)
        source_data = self.generate_features(source_data)
        aggregated_source = self.calculate_aggregations(source_data)
        return aggregated_source

    def filter_data(self, source_data: SourceDataset) -> SourceDataset:
        """Фильтрация данных"""
        filtered_columns = []
        for data_array in source_data:
            if data_array.is_billet:
                continue
            for method in [*source_data.settings.filtration_methods,
                           "forbidden_columns"]:
                is_bad = self.filters.filter_by(
                    method=method,
                    data_array=data_array,
                    source_data=source_data
                )
                if is_bad:
                    filtered_columns.append(data_array.transcription)
        source_data.remove_arrays(filtered_columns)
        return source_data

    def generate_features(self, source_data: SourceDataset) -> SourceDataset:
        """Генерация вторичных фичей"""
        new_arrays = []
        for method in source_data.settings.secondary_features:
            for data_array in source_data:
                if data_array.is_billet or not data_array.is_numeric:
                    continue
                new_arrays.append(
                    self.feature_generator.get_method_values(
                        method, data_array
                    )
                )
        for new_array in new_arrays:
            source_data.append_array_to_source_data(new_array)
        return source_data

    def calculate_aggregations(
        self, source_data: SourceDataset
    ) -> AggregatedSourceDict:
        """Аггрегация данных по сегментам"""
        aggregated_source = AggregatedSourceDict(source_data.settings)
        for segment_id, segment in source_data.settings.segments.items():
            aggregated_segment = AggregatedSegment(
                segment_id, segment.start_point, segment.end_point
            )
            for data_array in source_data:
                if data_array.is_billet:
                    continue
                for method in source_data.settings.aggregation_methods:
                    transcription = data_array.transcription.add_tags(
                        {
                            "sector_range": [
                                f"{segment.start_point}_{segment.end_point}"
                            ],
                            "aggregation": [f"{method}"]
                        }
                    )
                    segment_values = (
                        self.segments_aggregator.return_segment_values(
                            segment, source_data.billet_array(), data_array
                        )
                    )
                    segment_values = segment_values[~pd.isnull(segment_values)]
                    if not data_array.is_numeric:
                        aggregated_value = AggregatedValue(
                            segment_id,
                            transcription,
                            None,
                            True,
                            "Not numeric",
                        )
                    elif len(segment_values) == 0:
                        aggregated_value = AggregatedValue(
                            segment_id, transcription, None, True, "Empty"
                        )
                    else:
                        aggregated_value = (
                            self.segments_aggregator.get_method_value(
                                segment_id, transcription, method,
                                segment_values
                            )
                        )
                    aggregated_segment.append_value([aggregated_value])
            aggregated_source.append_segment(aggregated_segment)
        return aggregated_source

### BASEWITHPOINTSHandler

In [977]:
class BASEWITHPOINTSHandler(SourceHandler):
    source = "BASEWITHPOINTS"

    def __init__(
        self,
        features_generator: FeaturesGenerator,
        filters: Filters,
        materials: Materials,
        secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator,
    ):
        self.materials = materials
        self.segments_aggregator = segments_aggregator
        self.secondary_functions = secondary_functions
        self.feature_generator = features_generator
        self.filters = filters

    def process_pipeline(
        self, billet_id: str, data: pd.DataFrame, settings: SourceSettings
    ) -> AggregatedSourceDict:
        source_data = SourceDataset(settings)

        # Подготовительный пайплайн
        source_data.append_dataframe_to_source_data(data)
        source_data = (
            self.secondary_functions.
            sort_dataset_ascending_by_billet(source_data)
        )
        source_data = self.secondary_functions.approximate_billet_by_bounds(
            source_data
        )
        with open(self.materials.PATHS['workcenters']) as handle:
            workcenters = json.load(handle)

        with open(self.materials.PATHS['min_data']) as handle:
            MIN = json.load(handle)

        with open(self.materials.PATHS['max_data']) as handle:
            MAX = json.load(handle)

        source_data = (
            self.secondary_functions.append_workcenter_to_transcription(
                source_data, workcenters
            )
        )

        # Базовый пайплайн
        source_data = self.filter_data(source_data)
        source_data = self.generate_features(source_data, MIN, MAX)
        aggregated_source = self.calculate_aggregations(source_data)
        return aggregated_source

    def filter_data(self, source_data: SourceDataset) -> SourceDataset:
        """Фильтрация данных"""
        filtered_columns = []
        for data_array in source_data:
            if data_array.is_billet:
                continue
            for method in [*source_data.settings.filtration_methods,
                           "forbidden_columns"]:
                is_bad = self.filters.filter_by(
                    method=method,
                    data_array=data_array,
                    source_data=source_data
                )
                if is_bad:
                    filtered_columns.append(data_array.transcription)
        source_data.remove_arrays(filtered_columns)
        return source_data

    def generate_features(
        self, source_data: SourceDataset, MIN, MAX
    ) -> SourceDataset:
        """Генерация вторичных фичей"""
        new_arrays = []
        for method in source_data.settings.secondary_features:
            for data_array in source_data:
                if data_array.is_billet or not data_array.is_numeric:
                    continue
                new_arrays.append(
                    self.feature_generator.get_method_values(
                        method, data_array, MIN, MAX
                    )
                )
        for new_array in new_arrays:
            source_data.append_array_to_source_data(new_array)
        return source_data

    def calculate_aggregations(
        self, source_data: SourceDataset
    ) -> AggregatedSourceDict:
        """Аггрегация данных по сегментам"""
        aggregated_source = AggregatedSourceDict(source_data.settings)
        for segment_id, segment in source_data.settings.segments.items():
            aggregated_segment = AggregatedSegment(
                segment_id, segment.start_point, segment.end_point
            )
            for data_array in source_data:
                if data_array.is_billet:
                    continue
                for method in source_data.settings.aggregation_methods:
                    transcription = data_array.transcription.add_tags(
                        {
                            "sector_range": [
                                f"{segment.start_point}_{segment.end_point}"
                            ],
                            "aggregation": [f"{method}"]
                        }
                    )

                    points, segment_values = (
                        self.segments_aggregator.return_segment_values(
                            segment, source_data.billet_array(), data_array
                        )
                    )
                    nan_indexes = np.where(np.isnan(segment_values))
                    segment_values = segment_values[~np.isnan(segment_values)]
                    points = np.delete(points, nan_indexes)
                    segment_values = segment_values[~pd.isnull(segment_values)]

                    if not data_array.is_numeric:
                        aggregated_value = AggregatedValue(
                            segment_id,
                            transcription,
                            None,
                            True,
                            "Not numeric",
                        )
                    elif len(segment_values) == 0:
                        aggregated_value = AggregatedValue(
                            segment_id, transcription, None, True, "Empty"
                        )
                    else:
                        aggregated_value = (
                            self.segments_aggregator.get_method_value(
                                segment_id, transcription, method,
                                segment_values, points
                            )
                        )
                    aggregated_segment.append_value([aggregated_value])
            aggregated_source.append_segment(aggregated_segment)
        return aggregated_source

### WBFPIROHandler

In [919]:
class WBFPIROHandler(SourceHandler):
    source = "WBFPIRO"

    def __init__(
        self,
        features_generator: FeaturesGenerator,
        filters: Filters,
        materials: Materials,
        secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator,
    ):
        self.materials = materials
        self.segments_aggregator = segments_aggregator
        self.secondary_functions = secondary_functions
        self.feature_generator = features_generator
        self.filters = filters

    def process_pipeline(
        self, billet_id: str, data: pd.DataFrame, settings: SourceSettings
    ) -> AggregatedSourceDict:
        source_data = SourceDataset(settings)

        # Подготовительный пайплайн
        source_data.append_dataframe_to_source_data(data)
        source_data = self.secondary_functions.convert_date_columns_to_numeric(
            source_data, [{
                "name": "moment"
            }]
        )
        with open(self.materials.PATHS['wbf_piro_cutter_settings']) as handle:
            wbf_piro_cutter_settings = json.load(handle)
        source_data = self.secondary_functions.cut_wbf_pirometer_signals(
            source_data, wbf_piro_cutter_settings
        )
        source_data = (
            self.secondary_functions.
            sort_dataset_ascending_by_billet(source_data)
        )
        with open(self.materials.PATHS['workcenters']) as handle:
            workcenters = json.load(handle)
        source_data = (
            self.secondary_functions.append_workcenter_to_transcription(
                source_data, workcenters
            )
        )

        # Базовый пайплайн
        source_data = self.filter_data(source_data)
        source_data = self.generate_features(source_data)
        aggregated_source = self.calculate_aggregations(source_data)
        return aggregated_source

    def filter_data(self, source_data: SourceDataset) -> SourceDataset:
        """Фильтрация данных"""
        filtered_columns = []
        for data_array in source_data:
            if data_array.is_billet:
                continue
            for method in [*source_data.settings.filtration_methods,
                           "forbidden_columns"]:
                is_bad = self.filters.filter_by(
                    method=method,
                    data_array=data_array,
                    source_data=source_data
                )
                if is_bad:
                    filtered_columns.append(data_array.transcription)
        source_data.remove_arrays(filtered_columns)
        return source_data

    def generate_features(self, source_data: SourceDataset) -> SourceDataset:
        """Генерация вторичных фичей"""
        new_arrays = []
        for method in source_data.settings.secondary_features:
            for data_array in source_data:
                if data_array.is_billet or not data_array.is_numeric:
                    continue
                new_arrays.append(
                    self.feature_generator.get_method_values(
                        method, data_array
                    )
                )
        for new_array in new_arrays:
            source_data.append_array_to_source_data(new_array)
        return source_data

    def calculate_aggregations(
        self, source_data: SourceDataset
    ) -> AggregatedSourceDict:
        """Аггрегация данных по сегментам"""
        aggregated_source = AggregatedSourceDict(source_data.settings)
        for segment_id, segment in source_data.settings.segments.items():
            aggregated_segment = AggregatedSegment(
                segment_id, segment.start_point, segment.end_point
            )
            for data_array in source_data:
                if data_array.is_billet:
                    continue
                for method in source_data.settings.aggregation_methods:
                    transcription = data_array.transcription.add_tags(
                        {
                            "sector_range": [f"{segment_id}"],
                            "aggregation": [f"{method}"]
                        }
                    )
                    segment_values = (
                        self.segments_aggregator.return_segment_values(
                            segment,
                            source_data.billet_array(),
                            data_array,
                        )
                    )
                    segment_values = segment_values[~pd.isnull(segment_values)]
                    if not data_array.is_numeric:
                        aggregated_value = AggregatedValue(
                            segment_id,
                            transcription,
                            None,
                            True,
                            "Not numeric",
                        )
                    elif len(segment_values) == 0:
                        aggregated_value = AggregatedValue(
                            segment_id, transcription, None, True, "Empty"
                        )
                    else:
                        aggregated_value = (
                            self.segments_aggregator.get_method_value(
                                segment_id, transcription, method,
                                segment_values
                            )
                        )
                    aggregated_segment.append_value([aggregated_value])
            aggregated_source.append_segment(aggregated_segment)
        return aggregated_source

### WBFSINGLEHandler

In [1078]:
class WBFSINGLEHandler(SourceHandler):
    source = "WBFSINGLE"

    def __init__(
        self,
        features_generator: FeaturesGenerator,
        filters: Filters,
        materials: Materials,
        secondary_functions: SecondaryFunctions,
        segments_aggregator: SegmentsAggregator,
    ):
        self.materials = materials
        self.segments_aggregator = segments_aggregator
        self.secondary_functions = secondary_functions
        self.feature_generator = features_generator
        self.filters = filters

    def process_pipeline(
        self, billet_id: str, data: pd.DataFrame, settings: SourceSettings
    ) -> AggregatedSourceDict:
        data = data[data[settings.billet_column
                         ].apply(lambda x: x[:-1] in billet_id)]
        data = data.sort_values(by=["CHARGING_TIME"])
        source_data = SourceDataset(settings)

        # Подготовительный пайплайн
        source_data.append_dataframe_to_source_data(data)

        # Базовый пайплайн
        source_data = self.filter_data(source_data)
        aggregated_source = self.calculate_aggregations(source_data)
        return aggregated_source

    def filter_data(self, source_data: SourceDataset) -> SourceDataset:
        """Фильтрация данных"""
        filtered_columns = []
        for data_array in source_data:
            if data_array.is_billet:
                continue
            for method in [*source_data.settings.filtration_methods,
                           "forbidden_columns"]:
                is_bad = self.filters.filter_by(
                    method=method,
                    data_array=data_array,
                    source_data=source_data
                )
                if is_bad:
                    filtered_columns.append(data_array.transcription)
        source_data.remove_arrays(filtered_columns)
        return source_data

    def calculate_aggregations(
        self, source_data: SourceDataset
    ) -> AggregatedSourceDict:
        """Аггрегация данных по сегментам"""
        aggregated_source = AggregatedSourceDict(source_data.settings)
        for segment_id, segment in source_data.settings.segments.items():
            aggregated_segment = AggregatedSegment(
                segment_id, segment.start_point, segment.end_point
            )
            for data_array in source_data:
                if data_array.is_billet:
                    continue
                transcription = data_array.transcription.add_tags(
                    {"sector_range": ["single"]}
                )
                if not data_array.is_numeric:
                    aggregated_value = AggregatedValue(
                        segment_id, transcription, None, True, "Not numeric"
                    )
                else:
                    aggregated_value = AggregatedValue(
                        segment_id, transcription, float(data_array.values[0])
                    )
                aggregated_segment.append_value([aggregated_value])
            aggregated_source.append_segment(aggregated_segment)
        return aggregated_source


In [978]:
from glob import glob
import os

PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

# Компоненты запуска процесса обработки файлов
materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)


In [979]:
# Параметры для обработки
billet_id = "BilletPoints"

file_path = "testing/20240308000407_21046X404_IPSRNK_1_L.csv"
df = pd.read_csv(file_path, sep=';')
for col in df.columns:
    if col == "BilletPoints":
        df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce").fillna(0).astype(float)
    else:
        df[col] = pd.to_numeric(df[col].replace(",", "."), errors="coerce").fillna(0).astype(float)

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="LNK100",
    type="feature",
    handler="BASE",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)

# Получение объекта BASEHandler
handler = handlers_factory.get_handler("BASE")

# Запуск обработки данных с помощью BASEHandler
result = handler.process_pipeline(billet_id, data, settings)

In [980]:
result

AggregatedSource(total_values=24, bad_values=0)

In [981]:
result['LNK100_1']

{'mill_LNK100_1_Hor1500_ni_0_3_max': mill_LNK100_1_Hor1500_ni_0_3_max, value=443.0,
 'mill_LNK100_1_Hor1500_ni_0_3_min': mill_LNK100_1_Hor1500_ni_0_3_min, value=223.0,
 'mill_LNK100_1_Hor1500_ni_0_3_median': mill_LNK100_1_Hor1500_ni_0_3_median, value=283.0,
 'mill_LNK100_1_Hor2000_ni_0_3_max': mill_LNK100_1_Hor2000_ni_0_3_max, value=719.0,
 'mill_LNK100_1_Hor2000_ni_0_3_min': mill_LNK100_1_Hor2000_ni_0_3_min, value=340.0,
 'mill_LNK100_1_Hor2000_ni_0_3_median': mill_LNK100_1_Hor2000_ni_0_3_median, value=492.0,
 'mill_LNK100_1_Torsion_ni_0_3_max': mill_LNK100_1_Torsion_ni_0_3_max, value=468.0,
 'mill_LNK100_1_Torsion_ni_0_3_min': mill_LNK100_1_Torsion_ni_0_3_min, value=2.0,
 'mill_LNK100_1_Torsion_ni_0_3_median': mill_LNK100_1_Torsion_ni_0_3_median, value=275.0,
 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max, value=195.0,
 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min, value=114.0,
 'mill_LNK10

In [982]:
result['LNK100_1']["mill_LNK100_1_Hor1500_ni_0_3_max"].transcription

mill_LNK100_1_Hor1500_ni_0_3_max

In [990]:
# Параметры для обработки
billet_id = "BilletPoints"

file_path = "testing/20240308000407_21046X404_IPSRNK_1_L.csv"
df = pd.read_csv(file_path, sep=';')
for col in df.columns:
    if col == "BilletPoints":
        df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce").fillna(0).astype(float)
    else:
        df[col] = pd.to_numeric(df[col].replace(",", "."), errors="coerce").fillna(0).astype(float)

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="LNK100",
    type="feature",
    handler="BASEWITHPOINTS",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)

# Получение объекта BASEHandler
handler = handlers_factory.get_handler("BASEWITHPOINTS")

# Запуск обработки данных с помощью BASEHandler
result = handler.process_pipeline(billet_id, data, settings)

In [984]:
result['LNK100_1']

{'mill_LNK100_1_Hor1500_ni_0_3_max': mill_LNK100_1_Hor1500_ni_0_3_max, value=443.0,
 'mill_LNK100_1_Hor1500_ni_0_3_min': mill_LNK100_1_Hor1500_ni_0_3_min, value=223.0,
 'mill_LNK100_1_Hor1500_ni_0_3_median': mill_LNK100_1_Hor1500_ni_0_3_median, value=283.0,
 'mill_LNK100_1_Hor2000_ni_0_3_max': mill_LNK100_1_Hor2000_ni_0_3_max, value=719.0,
 'mill_LNK100_1_Hor2000_ni_0_3_min': mill_LNK100_1_Hor2000_ni_0_3_min, value=340.0,
 'mill_LNK100_1_Hor2000_ni_0_3_median': mill_LNK100_1_Hor2000_ni_0_3_median, value=492.0,
 'mill_LNK100_1_Torsion_ni_0_3_max': mill_LNK100_1_Torsion_ni_0_3_max, value=468.0,
 'mill_LNK100_1_Torsion_ni_0_3_min': mill_LNK100_1_Torsion_ni_0_3_min, value=2.0,
 'mill_LNK100_1_Torsion_ni_0_3_median': mill_LNK100_1_Torsion_ni_0_3_median, value=275.0,
 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max, value=195.0,
 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min, value=114.0,
 'mill_LNK10

In [1056]:
from glob import glob
import os

PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

# Компоненты запуска процесса обработки файлов
materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)

# Параметры для обработки
billet_id = "moment"

file_path = 'testing/20240307170148_XX-Л210444041_WBF_1_T2.csv'
df = pd.read_csv(file_path, sep=';')
for col in df.drop("moment", axis=1).columns:
    df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce")
df["moment"] = pd.to_datetime(df["moment"])

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="WBF_PIRO",
    type="feature",
    handler="WBFPIRO",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="billet_pirometer_files",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_PIRO_1": Segment(start_point=0, end_point="end", target_segments=["LNK100_1"])
        },
    forbidden_columns=["BEAM_VERT_ENC", "BEAM_HOR_ENC_POS", "DM_ENC_HOR_POS", "DML_VERT_POS", "DMR_VERT_POS"],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=['max', 'min','median','mean'],
    billet_column="moment",
    convert_columns={}
)

# Указываем путь к файлу JSON
file_path = "exploration/agregator/run/materials/wbf_piro_cutter_settings.json"

# Открываем файл и загружаем его содержимое в переменную
with open(file_path, "r") as file:
    piro_settings = json.load(file)

# Получение объекта BASEHandler
handler = handlers_factory.get_handler("WBFPIRO")

# Запуск обработки данных с помощью BASEHandler
result = handler.process_pipeline(billet_id, data, settings)

[153, 469, 752, 1152, 1706]
[495, 773, 1172, 1596]
[1892, 2067]
[77, 1772, 1816, 2098, 2183, 2326]
[100, 1772, 1820, 2095, 2186, 2322]
[77, 100, 153, 469, 495, 752, 773, 1152, 1172, 1596, 1706, 1772, 1772, 1816, 1820, 1892, 2067, 2095, 2098, 2183, 2186, 2322, 2326]


In [996]:
result["WBF_PIRO_1"]

{'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_max': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_max, value=1187.239501953125,
 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_min': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_min, value=1107.1539306640625,
 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_median': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_median, value=1183.767333984375,
 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_mean': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_mean, value=1165.8901766886754,
 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_max': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_max, value=1187.5867919921875,
 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_min': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_min, value=1083.94091796875,
 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_median': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_median, value=1185.9375,
 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_mean': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_mean, value=1161.4180109833446,
 'mill_WBF_PIRO_1_TEMP_PIR2_ni_WBF_PIRO_1_max

In [1079]:
from glob import glob
import os

PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

# Компоненты запуска процесса обработки файлов
materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)

# Параметры для обработки
billet_id = "Л210421011"

file_path = 'testing/20240308000000_20240309000000_WBF.csv'
# Чтение файла с указанием кодировки ANSI
df = pd.read_csv(file_path, sep=';', encoding='ANSI')
forbidden_cols = ["billet_number", "CHARGING_TIME", "IS_DISCHARGED"]
for col in df.columns:
    if col not in forbidden_cols:
        # Если тип столбца - строка, заменяем запятые на точки и преобразуем в числовой тип
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '.')
            df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="WBF_sgl",
    type="feature",
    handler="WBFSINGLE",
    rolling_number="1",
    encoding="ANSI",
    is_single=True,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "1": Segment(start_point=0, end_point=0, target_segments=["LNK100_1"])
        },
    forbidden_columns=[],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=[],
    billet_column="billet_number",
    convert_columns={}
)

# Получение объекта BASEHandler
handler = handlers_factory.get_handler("WBFSINGLE")

# Запуск обработки данных с помощью BASEHandler
result = handler.process_pipeline(billet_id, data, settings)

In [1080]:
result["1"]

{'WBF_sgl_1_PIECE_PK_ni_single': WBF_sgl_1_PIECE_PK_ni_single, value=2.4030712380000006e+19,
 'WBF_sgl_1_FURNACE_FK_ni_single': WBF_sgl_1_FURNACE_FK_ni_single, value=1.0,
 'WBF_sgl_1_CHARGING_TIME_ni_single': WBF_sgl_1_CHARGING_TIME_ni_single, value=None,
 'WBF_sgl_1_TOT_X_NODE_ni_single': WBF_sgl_1_TOT_X_NODE_ni_single, value=100.0,
 'WBF_sgl_1_TOT_Y_NODE_ni_single': WBF_sgl_1_TOT_Y_NODE_ni_single, value=7.0,
 'WBF_sgl_1_TOT_Z_NODE_ni_single': WBF_sgl_1_TOT_Z_NODE_ni_single, value=7.0,
 'WBF_sgl_1_X_POS_ni_single': WBF_sgl_1_X_POS_ni_single, value=599.0,
 'WBF_sgl_1_X_COORD_ni_single': WBF_sgl_1_X_COORD_ni_single, value=4540.5,
 'WBF_sgl_1_IS_DISCHARGED_ni_single': WBF_sgl_1_IS_DISCHARGED_ni_single, value=None,
 'WBF_sgl_1_DISCHARGING_TIME_ni_single': WBF_sgl_1_DISCHARGING_TIME_ni_single, value=0.0,
 'WBF_sgl_1_MARKA_ni_single': WBF_sgl_1_MARKA_ni_single, value=0.0,
 'WBF_sgl_1_BISRA_ni_single': WBF_sgl_1_BISRA_ni_single, value=5.0,
 'WBF_sgl_1_STRATEGY_ni_single': WBF_sgl_1_STRATEGY_

In [1074]:
file_path = 'testing/20240308000000_20240309000000_WBF.csv'
# Чтение файла с указанием кодировки ANSI
df = pd.read_csv(file_path, sep=';', encoding='ANSI')
forbidden_cols = ["billet_number", "CHARGING_TIME", "IS_DISCHARGED"]
for col in df.columns:
    if col not in forbidden_cols:
        # Если тип столбца - строка, заменяем запятые на точки и преобразуем в числовой тип
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '.')
            df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df

Unnamed: 0,PIECE_PK,FURNACE_FK,billet_number,CHARGING_TIME,TOT_X_NODE,TOT_Y_NODE,TOT_Z_NODE,X_POS,X_COORD,IS_DISCHARGED,...,Z1_LV2_IN_percent,Z2_LV2_IN_percent,Z3_LV2_IN_percent,Z4_LV2_IN_percent,Z5_LV2_IN_percent,Z6_LV2_IN_percent,Z7_LV2_IN_percent,Z8_LV2_IN_percent,Z9_LV2_IN_percent,Z10_LV2_IN_percent
0,2.403071e+19,1,Л210444041,2024-03-07 19:36:25,100,7,7,2714.0,6678.0,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.000000,100.000000,100.000000,100.000000
1,2.403071e+19,1,Л210421011,2024-03-07 19:38:53,100,7,7,599.0,4540.5,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.000000,100.000000,100.000000,100.000000
2,2.403071e+19,1,Л210422011,2024-03-07 19:41:17,100,7,7,2717.0,6666.5,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.000000,100.000000,100.000000,100.000000
3,2.403071e+19,1,Л210421051,2024-03-07 19:43:42,100,7,7,598.0,4540.0,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.000000,100.000000,100.000000,100.000000
4,2.403071e+19,1,Л210422051,2024-03-07 19:45:58,100,7,7,2718.0,6663.0,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.000000,100.000000,100.000000,100.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,2.403081e+19,1,Л210812020,2024-03-08 16:49:04,100,7,7,595.0,4538.5,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,46.468058,46.464116,46.456725,46.452045
490,2.403081e+19,1,Л210813011,2024-03-08 16:51:04,100,7,7,2716.0,6670.0,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,45.746079,45.742149,45.734779,45.730112
491,2.403081e+19,1,Л210814011,2024-03-08 16:53:02,100,7,7,548.0,4517.5,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,45.064495,45.060574,45.053221,45.048564
492,2.403081e+19,1,Л210813031,2024-03-08 16:55:02,100,7,7,2710.0,6701.0,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,44.328498,44.324590,44.317261,44.312620


In [1075]:
data = df
data = data[data[settings.billet_column].apply(lambda x: x[:-1] in billet_id)]
data

Unnamed: 0,PIECE_PK,FURNACE_FK,billet_number,CHARGING_TIME,TOT_X_NODE,TOT_Y_NODE,TOT_Z_NODE,X_POS,X_COORD,IS_DISCHARGED,...,Z1_LV2_IN_percent,Z2_LV2_IN_percent,Z3_LV2_IN_percent,Z4_LV2_IN_percent,Z5_LV2_IN_percent,Z6_LV2_IN_percent,Z7_LV2_IN_percent,Z8_LV2_IN_percent,Z9_LV2_IN_percent,Z10_LV2_IN_percent
1,2.403071e+19,1,Л210421011,2024-03-07 19:38:53,100,7,7,599.0,4540.5,Y,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [1037]:
df = df.fillna(0)
tmp = df.isna().sum().to_frame()
tmp[tmp[0] != 0]

Unnamed: 0,0


In [1113]:
# Компоненты запуска процесса обработки файлов
PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)

# ID заготовки
billet_id = "Л210421011"

file_path = "testing/20240308000407_21046X404_IPSRNK_1_L.csv"
df = pd.read_csv(file_path, sep=';')
for col in df.columns:
    if col == "BilletPoints":
        df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce").fillna(0).astype(float)
    else:
        df[col] = pd.to_numeric(df[col].replace(",", "."), errors="coerce").fillna(0).astype(float)

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="LNK100",
    type="target",
    handler="BASE",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)

# Получение объекта BASEHandler
handler = handlers_factory.get_handler("BASE")

# Запуск обработки данных с помощью BASEHandler
seg_LNK100 = handler.process_pipeline(billet_id, data, settings)

In [1114]:
# Компоненты запуска процесса обработки файлов
PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)

# Параметры для обработки
billet_id = "Л210421011"

file_path = 'testing/20240307170148_XX-Л210444041_WBF_1_T2.csv'
df = pd.read_csv(file_path, sep=';')
for col in df.drop("moment", axis=1).columns:
    df[col] = pd.to_numeric(df[col].str.replace(",", "."), errors="coerce")
df["moment"] = pd.to_datetime(df["moment"])

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="WBF_PIRO",
    type="feature",
    handler="WBFPIRO",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="billet_pirometer_files",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_PIRO_1": Segment(start_point=0, end_point="end", target_segments=["LNK100_1"])
        },
    forbidden_columns=["BEAM_VERT_ENC", "BEAM_HOR_ENC_POS", "DM_ENC_HOR_POS", "DML_VERT_POS", "DMR_VERT_POS"],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=['max', 'min','median','mean'],
    billet_column="moment",
    convert_columns={}
)

# Путь к файлу JSON до параметров окон для формирования дополнительных сегментов
file_path = "exploration/agregator/run/materials/wbf_piro_cutter_settings.json"
with open(file_path, "r") as file:
    piro_settings = json.load(file)

# Получение объекта WBFPIROHandler
handler = handlers_factory.get_handler("WBFPIRO")

# Запуск обработки данных с помощью WBFPIROHandler
seg_WBF_PIRO = handler.process_pipeline(billet_id, data, settings)

[153, 469, 752, 1152, 1706]
[495, 773, 1172, 1596]
[1892, 2067]
[77, 1772, 1816, 2098, 2183, 2326]
[100, 1772, 1820, 2095, 2186, 2322]
[77, 100, 153, 469, 495, 752, 773, 1152, 1172, 1596, 1706, 1772, 1772, 1816, 1820, 1892, 2067, 2095, 2098, 2183, 2186, 2322, 2326]


In [1115]:
# Компоненты запуска процесса обработки файлов
PATH_TO_MATERIALS=r"exploration\agregator\run\materials\*"

materials = Materials(
    {
        os.path.splitext(os.path.basename(path))[0]: path
        for path in glob(PATH_TO_MATERIALS)
    } if PATH_TO_MATERIALS else None
)

features_generator = FeaturesGenerator()
secondary_functions = SecondaryFunctions()
segments_aggregator = SegmentsAggregator()
filters = Filters()

# Фабрика сборки кастомных пайплайнов
handlers_factory = HandlersFactory(
    features_generator=features_generator,
    filters=filters,
    secondary_functions=secondary_functions,
    segments_aggregator=segments_aggregator,
    materials=materials
)

# Параметры для обработки
billet_id = "Л210421011"

file_path = 'testing/20240308000000_20240309000000_WBF.csv'
# Чтение файла с указанием кодировки ANSI
df = pd.read_csv(file_path, sep=';', encoding='ANSI')
forbidden_cols = ["billet_number", "CHARGING_TIME", "IS_DISCHARGED"]
for col in df.columns:
    if col not in forbidden_cols:
        # Если тип столбца - строка, заменяем запятые на точки и преобразуем в числовой тип
        if df[col].dtype == 'object':
            df[col] = df[col].str.replace(',', '.')
            df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)

data = df  # Ваш DataFrame

# Создание объекта SourceSettings
settings = SourceSettings(
    source="WBF_sgl",
    type="feature",
    handler="WBFSINGLE",
    rolling_number="1",
    encoding="ANSI",
    is_single=True,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_sgl_1": Segment(start_point=0, end_point=0, target_segments=["LNK100_1"])
        },
    forbidden_columns=[],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=[],
    billet_column="billet_number",
    convert_columns={}
)

# Получение объекта WBFSINGLEHandler
handler = handlers_factory.get_handler("WBFSINGLE")

# Запуск обработки данных с помощью WBFSINGLEHandler
seg_WBF_sgl = handler.process_pipeline(billet_id, data, settings)

In [1127]:
all_aggregated_sources["WBF_PIRO"]

AggregatedSource(total_values=492, bad_values=0)

In [1132]:
for segment_name, segment_values in all_aggregated_sources["WBF_PIRO"].items():
    print(segment_values)

{'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_max': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_max, value=1187.239501953125, 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_min': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_min, value=1107.1539306640625, 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_median': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_median, value=1183.767333984375, 'mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_mean': mill_WBF_PIRO_1_TEMP_PIR3_ni_WBF_PIRO_1_mean, value=1165.8901766886754, 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_max': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_max, value=1187.5867919921875, 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_min': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_min, value=1083.94091796875, 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_median': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_median, value=1185.9375, 'mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_mean': mill_WBF_PIRO_1_TEMP_PIR1_ni_WBF_PIRO_1_mean, value=1161.4180109833446, 'mill_WBF_PIRO_1_TEMP_PIR2_ni_WBF_PIRO_1_max': mill_

In [1128]:
all_aggregated_sources["WBF_PIRO"].settings().segments["WBF_PIRO_1"]

Segment(start_point=0, end_point='end', target_segments=['LNK100_1'])

In [1130]:
all_aggregated_sources["WBF_PIRO"].settings().segments["WBF_PIRO_1"].target_segments

['LNK100_1']

In [1116]:
all_aggregated_sources = {
    "LNK100": seg_LNK100,
    "WBF_PIRO": seg_WBF_PIRO,
    "WBF_sgl": seg_WBF_sgl
}

### Matcher

In [1141]:
class Matcher:

    def __init__(self, sources_settings: Dict[str, SourceSettings]):
        self.sources_settings = sources_settings
        self.target_name = [
            name for name, src in self.sources_settings.items()
            if src.type == "target"
        ][0]

    def match_features_to_target(
        self, all_aggregated_sources: Dict[str, AggregatedSourceDict]
    ) -> (Dict[str, AggregatedSegment], Dict[str, Dict[str, float]]):
        targets_segments = {
            seg_key: all_aggregated_sources[self.target_name][seg_key]
            for seg_key in self.sources_settings[self.target_name
                                                 ].segments.keys()
        }
        print(targets_segments)
        for source_name, source_values in all_aggregated_sources.items():
            if source_values.is_target():
                continue
            for segment_name, segment_values in source_values.items():
                segment_settings = source_values.settings(
                ).segments[segment_name]
                for target_name in segment_settings.target_segments:
                    targets_segments[target_name].append_value(
                        segment_values.values()
                    )
        targets_dict = self._create_dict_from_segments(targets_segments)
        return targets_segments, targets_dict

    @staticmethod
    def _create_dict_from_segments(
        targets_segments: Dict[str, AggregatedSegment]
    ) -> Dict[str, Dict[str, float]]:
        targets_dict = {}
        for target_segment, aggregated_values in targets_segments.items():
            targets_dict[target_segment] = {
                str(value.transcription):
                float(value.value) if value.value else None
                for value in aggregated_values.values()
            }
        return targets_dict

In [1142]:
LNK100_settings = SourceSettings(
    source="LNK100",
    type="target",
    handler="BASE",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="ipsrnk",
    nested_folders="rail_points_files",
    filename_key="",
    interpolation_type="by_value",
    interpolation=103,
    segments={
            "LNK100_1": Segment(start_point=0, end_point=3, target_segments=[])
        },
    forbidden_columns=[],
    filtration_methods=["std"],
    secondary_features=["abs"],
    aggregation_methods=['max', 'min', 'median'],
    billet_column="BilletPoints",
    convert_columns={}
)

WBF_PIRO_settings = SourceSettings(
    source="WBF_PIRO",
    type="feature",
    handler="WBFPIRO",
    rolling_number="1",
    encoding="UTF-8",
    is_single=False,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="billet_pirometer_files",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_PIRO_1": Segment(start_point=0, end_point="end", target_segments=["LNK100_1"])
        },
    forbidden_columns=["BEAM_VERT_ENC", "BEAM_HOR_ENC_POS", "DM_ENC_HOR_POS", "DML_VERT_POS", "DMR_VERT_POS"],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=['max', 'min','median','mean'],
    billet_column="moment",
    convert_columns={}
)

WBF_sgl_settings = SourceSettings(
    source="WBF_sgl",
    type="feature",
    handler="WBFSINGLE",
    rolling_number="1",
    encoding="ANSI",
    is_single=True,
    main_folder="\\ZSMK-9684-001\Data\2023",
    key_folder="wbf",
    nested_folders="",
    filename_key="",
    interpolation_type="",
    interpolation="",
    segments={
            "WBF_sgl_1": Segment(start_point=0, end_point=0, target_segments=["LNK100_1"])
        },
    forbidden_columns=[],
    filtration_methods=[],
    secondary_features=[],
    aggregation_methods=[],
    billet_column="billet_number",
    convert_columns={}
)

sources_settings = {
    "LNK100": LNK100_settings,
    "WBF_PIRO": WBF_PIRO_settings,
    "WBF_sgl": WBF_sgl_settings
}

In [1143]:
matcher = Matcher(sources_settings)

In [1147]:
targets_segments, targets_dict = matcher.match_features_to_target(all_aggregated_sources)

{'LNK100_1': {'mill_LNK100_1_Hor1500_ni_0_3_max': mill_LNK100_1_Hor1500_ni_0_3_max, value=443.0, 'mill_LNK100_1_Hor1500_ni_0_3_min': mill_LNK100_1_Hor1500_ni_0_3_min, value=223.0, 'mill_LNK100_1_Hor1500_ni_0_3_median': mill_LNK100_1_Hor1500_ni_0_3_median, value=283.0, 'mill_LNK100_1_Hor2000_ni_0_3_max': mill_LNK100_1_Hor2000_ni_0_3_max, value=719.0, 'mill_LNK100_1_Hor2000_ni_0_3_min': mill_LNK100_1_Hor2000_ni_0_3_min, value=340.0, 'mill_LNK100_1_Hor2000_ni_0_3_median': mill_LNK100_1_Hor2000_ni_0_3_median, value=492.0, 'mill_LNK100_1_Torsion_ni_0_3_max': mill_LNK100_1_Torsion_ni_0_3_max, value=468.0, 'mill_LNK100_1_Torsion_ni_0_3_min': mill_LNK100_1_Torsion_ni_0_3_min, value=2.0, 'mill_LNK100_1_Torsion_ni_0_3_median': mill_LNK100_1_Torsion_ni_0_3_median, value=275.0, 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max, value=195.0, 'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min, value=114.0, 'mill_LNK

In [1134]:
targets_segments

{'LNK100_1': {'mill_LNK100_1_Hor1500_ni_0_3_max': mill_LNK100_1_Hor1500_ni_0_3_max, value=443.0,
  'mill_LNK100_1_Hor1500_ni_0_3_min': mill_LNK100_1_Hor1500_ni_0_3_min, value=223.0,
  'mill_LNK100_1_Hor1500_ni_0_3_median': mill_LNK100_1_Hor1500_ni_0_3_median, value=283.0,
  'mill_LNK100_1_Hor2000_ni_0_3_max': mill_LNK100_1_Hor2000_ni_0_3_max, value=719.0,
  'mill_LNK100_1_Hor2000_ni_0_3_min': mill_LNK100_1_Hor2000_ni_0_3_min, value=340.0,
  'mill_LNK100_1_Hor2000_ni_0_3_median': mill_LNK100_1_Hor2000_ni_0_3_median, value=492.0,
  'mill_LNK100_1_Torsion_ni_0_3_max': mill_LNK100_1_Torsion_ni_0_3_max, value=468.0,
  'mill_LNK100_1_Torsion_ni_0_3_min': mill_LNK100_1_Torsion_ni_0_3_min, value=2.0,
  'mill_LNK100_1_Torsion_ni_0_3_median': mill_LNK100_1_Torsion_ni_0_3_median, value=275.0,
  'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max, value=195.0,
  'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min': mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min, va

In [1120]:
targets_dict.keys

{'LNK100_1': {'mill_LNK100_1_Hor1500_ni_0_3_max': 443.0,
  'mill_LNK100_1_Hor1500_ni_0_3_min': 223.0,
  'mill_LNK100_1_Hor1500_ni_0_3_median': 283.0,
  'mill_LNK100_1_Hor2000_ni_0_3_max': 719.0,
  'mill_LNK100_1_Hor2000_ni_0_3_min': 340.0,
  'mill_LNK100_1_Hor2000_ni_0_3_median': 492.0,
  'mill_LNK100_1_Torsion_ni_0_3_max': 468.0,
  'mill_LNK100_1_Torsion_ni_0_3_min': 2.0,
  'mill_LNK100_1_Torsion_ni_0_3_median': 275.0,
  'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_max': 195.0,
  'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_min': 114.0,
  'mill_LNK100_1_Vert1000_[i_0.0_103.0]_0_3_median': 139.0,
  'mill_LNK100_1_Vert1500_[i_0.0_103.0]_0_3_max': 203.0,
  'mill_LNK100_1_Vert1500_[i_0.0_103.0]_0_3_min': 158.0,
  'mill_LNK100_1_Vert1500_[i_0.0_103.0]_0_3_median': 191.0,
  'mill_LNK100_1_Vert2000_[i_0.0_103.0]_0_3_max': 253.0,
  'mill_LNK100_1_Vert2000_[i_0.0_103.0]_0_3_min': 208.0,
  'mill_LNK100_1_Vert2000_[i_0.0_103.0]_0_3_median': 217.0,
  'mill_LNK100_1_Vert3000_[i_0.0_103.0]_0_3_max': 397.

In [1138]:
dic =  {
"Л54321098_2023": {
    "LNK100": r"\\ZSMK-9684-001\Data\2023\07\07\ipsrnk\rail_points_files\20230707001909_25178X302_IPSRNK_1_L.csv",
    "U0": r"\\ZSMK-9684-001\Data\2023\08\09\U0\rollings_points_files\20230809113255_Л258973020_U0_1_L.csv"
},
"Л98765432_2023": {
    "WBF_PIRO": r"\\ZSMK-9684-001\Data\2023\12\14\wbf\billet_pirometer_files\20231214172621_23-Л211344030_WBF_1_T2.csv",
    "U0": r"\\ZSMK-9684-001\Data\2023\08\09\U0\rollings_points_files\20230809113255_Л258973020_U0_1_L.csv"
}
}

In [1140]:
for source, filepath in dic["Л54321098_2023"].items():
    print(source + ":" + filepath)

LNK100:\\ZSMK-9684-001\Data\2023\07\07\ipsrnk\rail_points_files\20230707001909_25178X302_IPSRNK_1_L.csv
U0:\\ZSMK-9684-001\Data\2023\08\09\U0\rollings_points_files\20230809113255_Л258973020_U0_1_L.csv
