diff --git a/translator/app/translator/core/functions.py b/translator/app/translator/core/functions.py
index 0ebae670..2f9eda9c 100644
--- a/translator/app/translator/core/functions.py
+++ b/translator/app/translator/core/functions.py
@@ -58,7 +58,7 @@ def concat_kwargs(kwargs: Dict[str, str]) -> str:
 
     @staticmethod
     def map_field(field: Field, source_mapping: SourceMapping) -> str:
-        generic_field_name = field.generic_names_map[source_mapping.source_id]
+        generic_field_name = field.get_generic_field_name(source_mapping.source_id)
         mapped_field = source_mapping.fields_mapping.get_platform_field_name(generic_field_name=generic_field_name)
         if isinstance(mapped_field, list):
             mapped_field = mapped_field[0]
diff --git a/translator/app/translator/core/mapping.py b/translator/app/translator/core/mapping.py
index e3e9f72a..89a45127 100644
--- a/translator/app/translator/core/mapping.py
+++ b/translator/app/translator/core/mapping.py
@@ -122,3 +122,7 @@ def get_suitable_source_mappings(self, *args, **kwargs) -> List[SourceMapping]:
 
     def get_source_mapping(self, source_id: str) -> Optional[SourceMapping]:
         return self._source_mappings.get(source_id)
+
+    @property
+    def default_mapping(self) -> SourceMapping:
+        return self._source_mappings[DEFAULT_MAPPING_NAME]
diff --git a/translator/app/translator/core/mixins/logic.py b/translator/app/translator/core/mixins/logic.py
index cf9f959a..b5497e6d 100644
--- a/translator/app/translator/core/mixins/logic.py
+++ b/translator/app/translator/core/mixins/logic.py
@@ -1,26 +1,29 @@
 from typing import List, Union
 
-from app.translator.core.models.field import Field, Keyword
-from app.translator.core.models.identifier import Identifier
 from app.translator.core.custom_types.tokens import LogicalOperatorType, GroupType
+from app.translator.core.models.field import FieldValue, Keyword
+from app.translator.core.models.identifier import Identifier
 
 
 class ANDLogicOperatorMixin:
 
     @staticmethod
-    def get_missed_and_token_indices(tokens: List[Union[Field, Keyword, Identifier]]) -> List[int]:
+    def get_missed_and_token_indices(tokens: List[Union[FieldValue, Keyword, Identifier]]) -> List[int]:
         missed_and_indices = []
         for index in range(len(tokens) - 1):
             token = tokens[index]
             next_token = tokens[index + 1]
-            if (isinstance(token, (Field, Keyword))
-                    and not (isinstance(next_token, Identifier) and (
-                        next_token.token_type in LogicalOperatorType
-                        or next_token.token_type == GroupType.R_PAREN))):
+            if ((isinstance(token, (FieldValue, Keyword))
+                 or isinstance(token, Identifier) and token.token_type == GroupType.R_PAREN)
+                    and not (isinstance(next_token, Identifier)
+                             and (next_token.token_type
+                                  in (LogicalOperatorType.AND, LogicalOperatorType.OR, GroupType.R_PAREN)))):
                 missed_and_indices.append(index + 1)
         return list(reversed(missed_and_indices))
 
-    def add_and_token_if_missed(self, tokens: List[Union[Field, Keyword, Identifier]]) -> List[Union[Field, Keyword, Identifier]]:
+    def add_and_token_if_missed(self,
+                                tokens: List[Union[FieldValue, Keyword, Identifier]]
+                                ) -> List[Union[FieldValue, Keyword, Identifier]]:
         indices = self.get_missed_and_token_indices(tokens=tokens)
         for index in indices:
             tokens.insert(index, Identifier(token_type=LogicalOperatorType.AND))
diff --git a/translator/app/translator/core/models/field.py b/translator/app/translator/core/models/field.py
index 57cafcb0..2491e5bf 100644
--- a/translator/app/translator/core/models/field.py
+++ b/translator/app/translator/core/models/field.py
@@ -1,16 +1,36 @@
 from typing import Union, Optional
 
+from app.translator.core.mapping import SourceMapping, DEFAULT_MAPPING_NAME
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.custom_types.tokens import OperatorType
 
 
 class Field:
-    def __init__(self, source_name: str, operator: Identifier = None, value: Union[int, str, list, tuple] = None):
+    def __init__(self, source_name: str):
+        self.source_name = source_name
+        self.__generic_names_map = {}
+
+    def get_generic_field_name(self, source_id: str) -> Optional[str]:
+        return self.__generic_names_map.get(source_id)
+
+    def set_generic_names_map(self, source_mappings: list[SourceMapping], default_mapping: SourceMapping) -> None:
+        generic_names_map = {
+            source_mapping.source_id: source_mapping.fields_mapping.get_generic_field_name(self.source_name)
+            for source_mapping in source_mappings
+        }
+        if DEFAULT_MAPPING_NAME not in generic_names_map:
+            fields_mapping = default_mapping.fields_mapping
+            generic_names_map[DEFAULT_MAPPING_NAME] = fields_mapping.get_generic_field_name(self.source_name)
+
+        self.__generic_names_map = generic_names_map
+
+
+class FieldValue:
+    def __init__(self, source_name: str, operator: Identifier, value: Union[int, str, list, tuple]):
+        self.field = Field(source_name=source_name)
         self.operator = operator
         self.values = []
         self.__add_value(value)
-        self.source_name = source_name  # input translation field name
-        self.generic_names_map = {}
 
     @property
     def value(self):
@@ -30,31 +50,7 @@ def __add__(self, other):
         self.values.append(other)
 
     def __repr__(self):
-        if self.operator:
-            return f"{self.source_name} {self.operator.token_type} {self.values}"
-
-        return f"{self.source_name}"
-
-    def __eq__(self, other):
-        if isinstance(other, Field):
-            return self._hash == other._hash
-        """For OR operator check"""
-        if self.source_name == other.source_name and self.operator == other.operator:
-            return True
-        return False
-
-    def __neq__(self, other):
-        """For AND operator check"""
-        if self.source_name != other.source_name:
-            return True
-        return False
-
-    @property
-    def _hash(self):
-        return hash(str(self))
-
-    def __hash__(self):
-        return hash(str(self))
+        return f"{self.field.source_name} {self.operator.token_type} {self.values}"
 
 
 class Keyword:
diff --git a/translator/app/translator/core/models/functions/base.py b/translator/app/translator/core/models/functions/base.py
index 8fa70f10..881b5942 100644
--- a/translator/app/translator/core/models/functions/base.py
+++ b/translator/app/translator/core/models/functions/base.py
@@ -3,14 +3,14 @@
 from dataclasses import dataclass, field
 from typing import List, Union
 
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import Field, FieldValue, Keyword
 from app.translator.core.models.identifier import Identifier
 
 
 @dataclass
 class Function:
     name: str = None
-    args: List[Union[Field, Keyword, Function, Identifier]] = field(default_factory=list)
+    args: List[Union[Field, FieldValue, Keyword, Function, Identifier]] = field(default_factory=list)
     as_clause: str = None
     by_clauses: List[Field] = field(default_factory=list)
 
diff --git a/translator/app/translator/core/parser.py b/translator/app/translator/core/parser.py
index c80002ae..f28aecde 100644
--- a/translator/app/translator/core/parser.py
+++ b/translator/app/translator/core/parser.py
@@ -21,7 +21,7 @@
 from app.translator.core.functions import PlatformFunctions
 from app.translator.core.mapping import BasePlatformMappings, SourceMapping
-from app.translator.core.models.field import Field
+from app.translator.core.models.field import FieldValue
 from app.translator.core.models.functions.base import ParsedFunctions
 from app.translator.core.models.platform_details import PlatformDetails
 from app.translator.core.models.parser_output import SiemContainer, MetaInfoContainer
 
@@ -50,15 +50,15 @@ def get_tokens_and_source_mappings(self,
         if not query:
             raise TokenizerGeneralException("Can't translate empty query. Please provide more details")
         tokens = self.tokenizer.tokenize(query=query)
-        field_tokens = self.tokenizer.filter_tokens(tokens, Field)
+        field_tokens = [token.field for token in self.tokenizer.filter_tokens(tokens, FieldValue)]
         field_names = [field.source_name for field in field_tokens]
-        suitable_source_mappings = self.mappings.get_suitable_source_mappings(field_names=field_names, **log_sources)
-        self.tokenizer.set_field_generic_names_map(field_tokens, suitable_source_mappings, self.mappings)
+        source_mappings = self.mappings.get_suitable_source_mappings(field_names=field_names, **log_sources)
+        self.tokenizer.set_field_tokens_generic_names_map(field_tokens, source_mappings, self.mappings.default_mapping)
 
-        return tokens, suitable_source_mappings
+        return tokens, source_mappings
 
     def set_functions_fields_generic_names(self, functions: ParsedFunctions,
                                            source_mappings: List[SourceMapping]) -> None:
-        field_tokens = self.tokenizer.filter_function_tokens(tokens=functions.functions)
-        self.tokenizer.set_field_generic_names_map(field_tokens, source_mappings, self.mappings)
+        field_tokens = self.tokenizer.get_field_tokens_from_func_args(args=functions.functions)
+        self.tokenizer.set_field_tokens_generic_names_map(field_tokens, source_mappings, self.mappings.default_mapping)
diff --git a/translator/app/translator/core/render.py b/translator/app/translator/core/render.py
index 7416082e..77915173 100644
--- a/translator/app/translator/core/render.py
+++ b/translator/app/translator/core/render.py
@@ -27,11 +27,12 @@
 from app.translator.core.exceptions.parser import UnsupportedOperatorException
 from app.translator.core.functions import PlatformFunctions
 from app.translator.core.mapping import BasePlatformMappings, SourceMapping, LogSourceSignature, DEFAULT_MAPPING_NAME
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import Field, FieldValue, Keyword
 from app.translator.core.models.functions.base import Function, ParsedFunctions
+from app.translator.core.models.identifier import Identifier
 from app.translator.core.models.platform_details import PlatformDetails
 from app.translator.core.models.parser_output import MetaInfoContainer
-from app.translator.core.custom_types.tokens import LogicalOperatorType, OperatorType, GroupType
+from app.translator.core.custom_types.tokens import LogicalOperatorType, OperatorType
 
 
 class BaseQueryFieldValue(ABC):
@@ -133,7 +134,7 @@ def generate_functions(self, functions: List[Function], source_mapping: SourceMa
         return self.platform_functions.render(functions, source_mapping) if self.platform_functions else ""
 
     def map_field(self, field: Field, source_mapping: SourceMapping) -> List[str]:
-        generic_field_name = field.generic_names_map[source_mapping.source_id]
+        generic_field_name = field.get_generic_field_name(source_mapping.source_id)
         # field can be mapped to corresponding platform field name or list of platform field names
         mapped_field = source_mapping.fields_mapping.get_platform_field_name(generic_field_name=generic_field_name)
         if not mapped_field and self.is_strict_mapping:
@@ -145,10 +146,10 @@ def map_field(self, field: Field, source_mapping: SourceMapping) -> List[str]:
         return mapped_field if mapped_field else [generic_field_name] if generic_field_name else [field.source_name]
 
     def apply_token(self,
-                    token: Union[Field, Keyword, LogicalOperatorType, GroupType],
+                    token: Union[FieldValue, Keyword, Identifier],
                     source_mapping: SourceMapping) -> str:
-        if isinstance(token, (Field, Keyword)):
-            mapped_fields = self.map_field(token, source_mapping) if isinstance(token, Field) else [None]
+        if isinstance(token, FieldValue):
+            mapped_fields = self.map_field(token.field, source_mapping)
             if len(mapped_fields) > 1:
                 return self.group_token % self.operator_map[LogicalOperatorType.OR].join([
                     self.field_value_map.apply_field_value(field=field, operator=token.operator, value=token.value)
@@ -158,12 +159,17 @@ def apply_token(self,
             return self.field_value_map.apply_field_value(field=mapped_fields[0],
                                                           operator=token.operator,
                                                           value=token.value)
+        elif isinstance(token, Keyword):
+            return self.field_value_map.apply_field_value(field=None,
+                                                          operator=token.operator,
+                                                          value=token.value)
         elif token.token_type in LogicalOperatorType:
             return self.operator_map.get(token.token_type)
+
         return token.token_type
 
     def generate_query(self,
-                       query: List[Union[Field, Keyword, LogicalOperatorType, GroupType]],
+                       query: List[Union[FieldValue, Keyword, Identifier]],
                        source_mapping: SourceMapping) -> str:
         result_values = []
         for token in query:
@@ -173,8 +179,7 @@ def generate_query(self,
     def wrap_query_with_meta_info(self, meta_info: MetaInfoContainer, query: str):
         if meta_info and (meta_info.id or meta_info.title):
             query_meta_info = "\n".join(
-                self.wrap_with_comment(f"{key}{value}")
-                for key, value in {"name: ": meta_info.title, "uuid: ": meta_info.id}.items() if value
+                self.wrap_with_comment(f"{key}{value}") for key, value in {"name: ": meta_info.title, "uuid: ": meta_info.id}.items() if value
             )
             query = f"{query}\n\n{query_meta_info}"
         return query
diff --git a/translator/app/translator/core/tokenizer.py b/translator/app/translator/core/tokenizer.py
index 8f7ab6a2..85d5af6d 100644
--- a/translator/app/translator/core/tokenizer.py
+++ b/translator/app/translator/core/tokenizer.py
@@ -27,20 +27,20 @@
     TokenizerGeneralException, QueryParenthesesException
 )
 
-from app.translator.core.mapping import SourceMapping, DEFAULT_MAPPING_NAME, BasePlatformMappings
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.mapping import SourceMapping
+from app.translator.core.models.field import Field, FieldValue, Keyword
 from app.translator.core.models.functions.base import Function
 from app.translator.core.models.functions.sort import SortArg
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.custom_types.tokens import OperatorType, GroupType
 from app.translator.tools.utils import get_match_group
 
-TOKEN_TYPE = Union[Field, Keyword, Identifier]
+TOKEN_TYPE = Union[FieldValue, Keyword, Identifier]
 
 
 class BaseTokenizer(ABC):
     @abstractmethod
-    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+    def tokenize(self, query: str) -> List[Union[FieldValue, Keyword, Identifier]]:
         raise NotImplementedError()
 
 
@@ -180,18 +180,18 @@ def process_value_wildcard_symbols(self,
         return self._clean_value(value, wildcard_symbol), op
 
     @staticmethod
-    def create_field(field_name: str, operator: Identifier, value: Union[str, List]) -> Field:
-        return Field(operator=operator, value=value, source_name=field_name)
+    def create_field_value(field_name: str, operator: Identifier, value: Union[str, List]) -> FieldValue:
+        return FieldValue(source_name=field_name, operator=operator, value=value)
 
-    def search_field_value(self, query):
+    def search_field_value(self, query) -> Tuple[FieldValue, str]:
         field_name = self.search_field(query)
         operator = self.search_operator(query, field_name)
         query, operator, value = self.search_value(query=query, operator=operator,
                                                    field_name=field_name)
         value, operator_token = self.process_value_wildcard_symbols(value=value, operator=operator,
                                                                     wildcard_symbol=self.wildcard_symbol)
-        field = self.create_field(field_name=field_name, operator=operator_token, value=value)
-        return field, query
+        field_value = self.create_field_value(field_name=field_name, operator=operator_token, value=value)
+        return field_value, query
 
     def _match_field_value(self, query: str, white_space_pattern: str = r"\s+") -> bool:
         single_value_operator_group = fr"(?:{'|'.join(self.single_value_operators_map)})"
@@ -208,7 +208,7 @@ def _match_field_value(self, query: str, white_space_pattern: str = r"\s+") -> b
 
         return False
 
-    def _get_identifier(self, query: str) -> Tuple[Union[Field, Keyword, Identifier], str]:
+    def _get_identifier(self, query: str) -> Tuple[Union[FieldValue, Keyword, Identifier], str]:
         query = query.strip("\n").strip(" ").strip("\n")
         if query.startswith(GroupType.L_PAREN):
             return Identifier(token_type=GroupType.L_PAREN), query[1:]
@@ -240,7 +240,7 @@ def _validate_parentheses(tokens):
             raise QueryParenthesesException()
         return True
 
-    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+    def tokenize(self, query: str) -> List[Union[FieldValue, Keyword, Identifier]]:
         tokenized = []
         while query:
             identifier, query = self._get_identifier(query=query)
@@ -250,34 +250,28 @@ def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
 
     @staticmethod
     def filter_tokens(tokens: List[TOKEN_TYPE],
-                      token_type: Union[Type[Field], Type[Keyword], Type[Identifier]]) -> List[TOKEN_TYPE]:
+                      token_type: Union[Type[FieldValue], Type[Keyword], Type[Identifier]]) -> List[TOKEN_TYPE]:
         return [token for token in tokens if isinstance(token, token_type)]
 
-    def filter_function_tokens(self,
-                               tokens: List[Union[Field, Keyword, Identifier, Function, SortArg]]) -> List[TOKEN_TYPE]:
+    def get_field_tokens_from_func_args(self,
+                                        args: List[Union[Field, FieldValue, Keyword, Identifier, Function, SortArg]]
+                                        ) -> List[Field]:
         result = []
-        for token in tokens:
-            if isinstance(token, Field):
-                result.append(token)
-            elif isinstance(token, Function):
-                result.extend(self.filter_function_tokens(tokens=token.args))
-                result.extend(self.filter_function_tokens(tokens=token.by_clauses))
-            elif isinstance(token, SortArg):
-                result.append(token.field)
+        for arg in args:
+            if isinstance(arg, Field):
+                result.append(arg)
+            elif isinstance(arg, FieldValue):
+                result.append(arg.field)
+            elif isinstance(arg, Function):
+                result.extend(self.get_field_tokens_from_func_args(args=arg.args))
+                result.extend(self.get_field_tokens_from_func_args(args=arg.by_clauses))
+            elif isinstance(arg, SortArg):
+                result.append(arg.field)
         return result
 
     @staticmethod
-    def set_field_generic_names_map(tokens: List[Field],
-                                    source_mappings: List[SourceMapping],
-                                    platform_mappings: BasePlatformMappings) -> None:
+    def set_field_tokens_generic_names_map(tokens: List[Field],
+                                           source_mappings: List[SourceMapping],
+                                           default_mapping: SourceMapping) -> None:
         for token in tokens:
-            generic_names_map = {
-                source_mapping.source_id: source_mapping.fields_mapping.get_generic_field_name(token.source_name)
-                for source_mapping in source_mappings
-            }
-            if DEFAULT_MAPPING_NAME not in generic_names_map:
-                default_source_mapping = platform_mappings.get_source_mapping(DEFAULT_MAPPING_NAME)
-                fields_mapping = default_source_mapping.fields_mapping
-                generic_names_map[DEFAULT_MAPPING_NAME] = fields_mapping.get_generic_field_name(token.source_name)
-
-            token.generic_names_map = generic_names_map
+            token.set_generic_names_map(source_mappings, default_mapping)
diff --git a/translator/app/translator/platforms/athena/tokenizer.py b/translator/app/translator/platforms/athena/tokenizer.py
index 0e67349b..37dd8f3b 100644
--- a/translator/app/translator/platforms/athena/tokenizer.py
+++ b/translator/app/translator/platforms/athena/tokenizer.py
@@ -20,6 +20,7 @@
 from typing import Tuple, Any
 
 from app.translator.core.custom_types.values import ValueType
+from app.translator.core.models.field import FieldValue
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
@@ -66,7 +67,7 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
 
         return super().get_operator_and_value(match, operator)
 
-    def search_field_value(self, query):
+    def search_field_value(self, query) -> Tuple[FieldValue, str]:
         field_name = self.search_field(query)
         operator = self.search_operator(query, field_name)
         should_process_value_wildcard_symbols = self.should_process_value_wildcard_symbols(operator)
@@ -81,8 +82,8 @@ def search_field_value(self, query):
         )
 
         field_name = field_name.strip('"')
-        field = self.create_field(field_name=field_name, operator=operator_token, value=value)
-        return field, query
+        field_value = self.create_field_value(field_name=field_name, operator=operator_token, value=value)
+        return field_value, query
 
     def tokenize(self, query: str) -> list:
         query = re.sub(r"\s*ESCAPE\s*'.'", '', query)  # remove `ESCAPE 'escape_char'` in LIKE expr
diff --git a/translator/app/translator/platforms/base/lucene/tokenizer.py b/translator/app/translator/platforms/base/lucene/tokenizer.py
index 85eb48a2..c44173b0 100644
--- a/translator/app/translator/platforms/base/lucene/tokenizer.py
+++ b/translator/app/translator/platforms/base/lucene/tokenizer.py
@@ -22,7 +22,7 @@
 from app.translator.core.custom_types.values import ValueType
 from app.translator.core.exceptions.parser import TokenizerGeneralException
 from app.translator.core.mixins.logic import ANDLogicOperatorMixin
-from app.translator.core.models.field import Keyword, Field
+from app.translator.core.models.field import Keyword, FieldValue
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
@@ -41,7 +41,6 @@ class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     }
 
     field_pattern = r"(?P<field_name>[a-zA-Z\.\-_]+)"
-    match_operator_pattern = r"(?:___field___\s*(?P<match_operator>:\[\*\sTO|:\[|:<|:>|:))\s*"
     _num_value_pattern = r"\d+(?:\.\d+)*"
     num_value_pattern = fr"(?P<{ValueType.number_value}>{_num_value_pattern})\s*"
     double_quotes_value_pattern = fr'"(?P<{ValueType.double_quotes_value}>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\.$&^@!\(\)\{{\}}\s]|\\\"|\\)*)"\s*'
@@ -61,10 +60,10 @@ class LuceneTokenizer(QueryTokenizer, ANDLogicOperatorMixin):
     wildcard_symbol = "*"
 
     @staticmethod
-    def create_field(field_name: str, operator: Identifier, value: Union[str, List]) -> Field:
+    def create_field_value(field_name: str, operator: Identifier, value: Union[str, List]) -> FieldValue:
         field_name = field_name.replace(".text", "")
         field_name = field_name.replace(".keyword", "")
-        return Field(operator=operator, value=value, source_name=field_name)
+        return FieldValue(source_name=field_name, operator=operator, value=value)
 
     @staticmethod
     def clean_quotes(value: Union[str, int]):
@@ -131,6 +130,6 @@ def _match_field_value(self, query: str, white_space_pattern: str = r"\s*") -> b
 
         return super()._match_field_value(query, white_space_pattern=white_space_pattern)
 
-    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+    def tokenize(self, query: str) -> List[Union[FieldValue, Keyword, Identifier]]:
         tokens = super().tokenize(query=query)
         return self.add_and_token_if_missed(tokens=tokens)
diff --git a/translator/app/translator/platforms/base/spl/tokenizer.py b/translator/app/translator/platforms/base/spl/tokenizer.py
index 0ef3977b..6862fbee 100644
--- a/translator/app/translator/platforms/base/spl/tokenizer.py
+++ b/translator/app/translator/platforms/base/spl/tokenizer.py
@@ -21,7 +21,7 @@
 from app.translator.core.custom_types.values import ValueType
 from app.translator.core.mixins.logic import ANDLogicOperatorMixin
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import FieldValue, Keyword
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
 
@@ -68,6 +68,6 @@ def get_operator_and_value(self, match: re.Match, operator: str = OperatorType.E
 
         return super().get_operator_and_value(match)
 
-    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
+    def tokenize(self, query: str) -> List[Union[FieldValue, Keyword, Identifier]]:
         tokens = super().tokenize(query=query)
         return self.add_and_token_if_missed(tokens=tokens)
diff --git a/translator/app/translator/platforms/chronicle/tokenizer.py b/translator/app/translator/platforms/chronicle/tokenizer.py
index dd64500a..19c4a873 100644
--- a/translator/app/translator/platforms/chronicle/tokenizer.py
+++ b/translator/app/translator/platforms/chronicle/tokenizer.py
@@ -21,6 +21,7 @@
 
 from app.translator.core.custom_types.values import ValueType
 from app.translator.core.exceptions.parser import TokenizerGeneralException
+from app.translator.core.models.field import FieldValue
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
 from app.translator.platforms.chronicle.escape_manager import chronicle_escape_manager
@@ -77,7 +78,7 @@ class ChronicleRuleTokenizer(ChronicleQueryTokenizer):
     back_quotes_value_pattern = fr'`(?P<{ValueType.back_quotes_value}>(?:[:a-zA-Z\*0-9=+%#\-_/,\'\"\\\.$&^@!\(\)\{{\}}\s])*)`'
     regex_value_regex = fr"{double_quotes_value_pattern}|{back_quotes_value_pattern}\s*\)\s*(?:nocase)?\s*"
 
-    def search_field_value(self, query):
+    def search_field_value(self, query) -> Tuple[FieldValue, str]:
         if query.startswith("re.regex("):
             field_search = re.search(self.regex_field_regex, query)
             if field_search is None:
@@ -99,8 +100,8 @@ def search_field_value(self, query):
 
             pos = value_search.end()
             query = query[pos:]
 
-            field = self.create_field(field_name=field, operator=operator, value=value)
-            return field, query
+            field_value = self.create_field_value(field_name=field, operator=operator, value=value)
+            return field_value, query
         else:
             return super().search_field_value(query=query)
diff --git a/translator/app/translator/platforms/logscale/tokenizer.py b/translator/app/translator/platforms/logscale/tokenizer.py
index ee606141..293662d5 100644
--- a/translator/app/translator/platforms/logscale/tokenizer.py
+++ b/translator/app/translator/platforms/logscale/tokenizer.py
@@ -21,9 +21,9 @@
 from app.translator.core.custom_types.values import ValueType
 from app.translator.core.mixins.logic import ANDLogicOperatorMixin
-from app.translator.core.models.field import Keyword, Field
+from app.translator.core.models.field import Keyword, FieldValue
 from app.translator.core.models.identifier import Identifier
-from app.translator.core.custom_types.tokens import GroupType, LogicalOperatorType, OperatorType
+from app.translator.core.custom_types.tokens import LogicalOperatorType, OperatorType
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.platforms.logscale.escape_manager import logscale_escape_manager
 from app.translator.tools.utils import get_match_group
 
@@ -67,17 +67,6 @@ def _get_identifier(self, query: str) -> (list, str):
 
         return super()._get_identifier(query)
 
-    def tokenize(self, query: str) -> List[Union[Field, Keyword, Identifier]]:
-        tokenized = []
-        while query:
-            identifier, query = self._get_identifier(query=query)
-            if tokenized:
-                if isinstance(identifier, Identifier) and identifier.token_type in (GroupType.L_PAREN, LogicalOperatorType.NOT):
-                    if isinstance(tokenized[-1], (Field, Keyword)) or tokenized[-1].token_type == GroupType.R_PAREN:
-                        tokenized.append(Identifier(token_type=LogicalOperatorType.AND))
-                elif isinstance(identifier, (Field, Keyword)):
-                    if isinstance(tokenized[-1], (Field, Keyword)) or tokenized[-1].token_type == GroupType.R_PAREN:
-                        tokenized.append(Identifier(token_type=LogicalOperatorType.AND))
-            tokenized.append(identifier)
-        self._validate_parentheses(tokenized)
-        return self.add_and_token_if_missed(tokens=tokenized)
+    def tokenize(self, query: str) -> List[Union[FieldValue, Keyword, Identifier]]:
+        tokens = super().tokenize(query=query)
+        return self.add_and_token_if_missed(tokens=tokens)
diff --git a/translator/app/translator/platforms/qradar/tokenizer.py b/translator/app/translator/platforms/qradar/tokenizer.py
index b50dd031..9bf2cb41 100644
--- a/translator/app/translator/platforms/qradar/tokenizer.py
+++ b/translator/app/translator/platforms/qradar/tokenizer.py
@@ -21,7 +21,7 @@
 from app.translator.core.custom_types.values import ValueType
 from app.translator.platforms.qradar.const import UTF8_PAYLOAD_PATTERN, SINGLE_QUOTES_VALUE_PATTERN, NUM_VALUE_PATTERN
-from app.translator.core.models.field import Keyword
+from app.translator.core.models.field import Keyword, FieldValue
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.tokenizer import QueryTokenizer
 from app.translator.core.custom_types.tokens import OperatorType
 
@@ -77,7 +77,7 @@ def escape_field_name(self, field_name):
         field_name = field_name.replace(' ', r'\ ')
         return field_name
 
-    def search_field_value(self, query):
+    def search_field_value(self, query) -> Tuple[FieldValue, str]:
         field_name = self.search_field(query)
         operator = self.search_operator(query, field_name)
         should_process_value_wildcard_symbols = self.should_process_value_wildcard_symbols(operator)
@@ -92,8 +92,8 @@ def search_field_value(self, query):
         )
 
         field_name = field_name.strip('"')
-        field = self.create_field(field_name=field_name, operator=operator_token, value=value)
-        return field, query
+        field_value = self.create_field_value(field_name=field_name, operator=operator_token, value=value)
+        return field_value, query
 
     def search_keyword(self, query: str) -> Tuple[Keyword, str]:
         keyword_search = re.search(self.keyword_pattern, query)
diff --git a/translator/app/translator/platforms/sigma/models/compiler.py b/translator/app/translator/platforms/sigma/models/compiler.py
index 3d39235e..8e630017 100644
--- a/translator/app/translator/platforms/sigma/models/compiler.py
+++ b/translator/app/translator/platforms/sigma/models/compiler.py
@@ -16,7 +16,7 @@
 -----------------------------------------------------------------
 """
 
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import FieldValue, Keyword
 from app.translator.platforms.sigma.models.group import Group
 from app.translator.core.models.identifier import Identifier
 from app.translator.platforms.sigma.models.operator import Operator, NOT
@@ -31,14 +31,14 @@ def generate(self, tokens: list, group: Group = None):
             return group
         group = group if group else Group()
         token = tokens[0]
-        if isinstance(token, (Field, Keyword)):
+        if isinstance(token, (FieldValue, Keyword)):
             group += token
             return self.generate(tokens=tokens[1::], group=group)
         elif token.token_type == LogicalOperatorType.OR or token.token_type == LogicalOperatorType.AND:
             group.items = Operator(operator_type=token.token_type)
             return self.generate(tokens=tokens[1::], group=group)
         elif token.token_type == LogicalOperatorType.NOT:
-            if isinstance(tokens[1], (Field, Keyword)):
+            if isinstance(tokens[1], (FieldValue, Keyword)):
                 tokens.insert(2, Identifier(token_type=GroupType.R_PAREN))
                 tokens.insert(1, Identifier(token_type=GroupType.L_PAREN))
             sub_group = Group()
diff --git a/translator/app/translator/platforms/sigma/models/modifiers.py b/translator/app/translator/platforms/sigma/models/modifiers.py
index 30868b43..96d333f1 100644
--- a/translator/app/translator/platforms/sigma/models/modifiers.py
+++ b/translator/app/translator/platforms/sigma/models/modifiers.py
@@ -1,6 +1,6 @@
 from typing import Union, List
 
-from app.translator.core.models.field import Field
+from app.translator.core.models.field import FieldValue
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.custom_types.tokens import LogicalOperatorType, OperatorType, GroupType
 
@@ -29,7 +29,7 @@ def modifier_all(self, field_name: str, modifier: str,
                      values: Union[str, List[str]]) -> Union[tuple, list]:
         if (isinstance(values, list) and len(values) == 1) or isinstance(values, str):
             operator = self.map_modifier(modifier=modifier)
-            return (Field(source_name=field_name, operator=operator, value=values), )
+            return (FieldValue(source_name=field_name, operator=operator, value=values), )
         else:
             tokens = []
             for value in values:
@@ -53,8 +53,7 @@ def modifier_windash(self, field_name: str, modifier: Union[str, list],
                 tokens.append(self.or_token)
             return [Identifier(token_type=GroupType.L_PAREN), *tokens[:-1], Identifier(token_type=GroupType.R_PAREN)]
         operator = self.map_modifier(modifier=modifier)
-        field = Field(source_name=field_name, operator=operator, value=self.__prepare_windash_value(value=values))
-        return (field,)
+        return (FieldValue(source_name=field_name, operator=operator, value=self.__prepare_windash_value(value=values)),)
 
     def apply_multi_modifier(self, field_name: str, modifier: list,
                              values: Union[str, List[str]]) -> Union[tuple, list]:
@@ -69,7 +68,7 @@ def apply_modifier(self, field_name: str, modifier: list, values: Union[str, Lis
                 modifier = OperatorType.EQ
             return self.modifier_windash(field_name=field_name, modifier=modifier, values=values)
         operator = self.map_modifier(modifier=modifier)
-        return (Field(source_name=field_name, operator=operator, value=values), )
+        return (FieldValue(source_name=field_name, operator=operator, value=values), )
 
     def create_token(self, field_name: str, modifier: list,
                      value: Union[str, List[str], int]) -> Union[tuple, list]:
diff --git a/translator/app/translator/platforms/sigma/parsers/sigma.py b/translator/app/translator/platforms/sigma/parsers/sigma.py
index 769e8531..90d3953b 100644
--- a/translator/app/translator/platforms/sigma/parsers/sigma.py
+++ b/translator/app/translator/platforms/sigma/parsers/sigma.py
@@ -18,7 +18,6 @@
 
 """
 
-import re
 from typing import List, Union
 
 from app.translator.core.tokenizer import QueryTokenizer
@@ -27,7 +26,7 @@ from app.translator.platforms.sigma.tokenizer import SigmaTokenizer, SigmaConditionTokenizer
 from app.translator.core.exceptions.core import SigmaRuleValidationException
 from app.translator.core.mixins.rule import YamlRuleMixin
-from app.translator.core.models.field import Field
+from app.translator.core.models.field import FieldValue
 from app.translator.core.models.platform_details import PlatformDetails
 from app.translator.core.models.parser_output import SiemContainer, MetaInfoContainer
 
@@ -75,14 +74,14 @@ def parse(self, text: str) -> SiemContainer:
             if key in ("product", "service", "category")
         }
         tokens = self.tokenizer.tokenize(detection=sigma_rule.get("detection"))
-        field_tokens = QueryTokenizer.filter_tokens(tokens, Field)
+        field_tokens = [token.field for token in QueryTokenizer.filter_tokens(tokens, FieldValue)]
         field_names = [field.source_name for field in field_tokens]
-        suitable_source_mappings = self.mappings.get_suitable_source_mappings(field_names=field_names, **log_sources)
-        QueryTokenizer.set_field_generic_names_map(field_tokens, suitable_source_mappings, self.mappings)
+        source_mappings = self.mappings.get_suitable_source_mappings(field_names=field_names, **log_sources)
+        QueryTokenizer.set_field_tokens_generic_names_map(field_tokens, source_mappings, self.mappings.default_mapping)
         return SiemContainer(
             query=tokens,
             meta_info=self._get_meta_info(
                 rule=sigma_rule,
-                source_mapping_ids=[source_mapping.source_id for source_mapping in suitable_source_mappings]
+                source_mapping_ids=[source_mapping.source_id for source_mapping in source_mappings]
             ),
         )
diff --git a/translator/app/translator/platforms/sigma/renders/sigma.py b/translator/app/translator/platforms/sigma/renders/sigma.py
index 2244e32f..1a8b0e3d 100644
--- a/translator/app/translator/platforms/sigma/renders/sigma.py
+++ b/translator/app/translator/platforms/sigma/renders/sigma.py
@@ -26,7 +26,7 @@
 from app.translator.platforms.sigma.mapping import SigmaMappings, sigma_mappings, SigmaLogSourceSignature
 from app.translator.platforms.sigma.models.compiler import DataStructureCompiler
 from app.translator.core.mapping import SourceMapping, DEFAULT_MAPPING_NAME
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import FieldValue, Keyword
 from app.translator.platforms.sigma.models.group import Group
 from app.translator.platforms.sigma.models.operator import OR, AND, NOT
 from app.translator.core.models.platform_details import PlatformDetails
@@ -68,7 +68,7 @@ def generate_data_structure(self, data: Any, source_mapping: SourceMapping):
             return self.generate_and(data, source_mapping)
         elif isinstance(data, NOT):
             return self.generate_not(data, source_mapping)
-        elif isinstance(data, Field):
+        elif isinstance(data, FieldValue):
             return self.generate_field(data, source_mapping)
         elif isinstance(data, Keyword):
             return self.generate_keyword(data)
@@ -101,7 +101,7 @@ def generate_or(self, data: Any, source_mapping: SourceMapping):
             elif (
                 result
                 and len(set(result.get(self.selection, [])).intersection(set(updated_node))) != 0
-                and isinstance(data.items[i - 1], Field)
+                and isinstance(data.items[i - 1], FieldValue)
                 and len(updated_node) == 1
                 and self.selection not in updated_node
             ):
@@ -177,9 +177,9 @@ def map_field(source_mapping: SourceMapping, generic_field_name: str) -> str:
         field_name = source_mapping.fields_mapping.get_platform_field_name(generic_field_name)
         return field_name or generic_field_name
 
-    def generate_field(self, data: Field, source_mapping: SourceMapping):
+    def generate_field(self, data: FieldValue, source_mapping: SourceMapping):
         source_id = source_mapping.source_id
-        generic_field_name = data.generic_names_map.get(source_id) or data.source_name
+        generic_field_name = data.field.get_generic_field_name(source_id) or data.field.source_name
         field_name = self.map_field(source_mapping, generic_field_name)
         if data.operator.token_type not in (OperatorType.EQ, OperatorType.LT, OperatorType.LTE,
                                             OperatorType.GT, OperatorType.GTE, OperatorType.NEQ):
diff --git a/translator/app/translator/platforms/sigma/tokenizer.py b/translator/app/translator/platforms/sigma/tokenizer.py
index 10a4b8dd..a546dad4 100644
--- a/translator/app/translator/platforms/sigma/tokenizer.py
+++ b/translator/app/translator/platforms/sigma/tokenizer.py
@@ -21,10 +21,9 @@
 
 from app.translator.platforms.sigma.models.modifiers import ModifierManager
 from app.translator.core.exceptions.parser import TokenizerGeneralException
-from app.translator.core.models.field import Field, Keyword
+from app.translator.core.models.field import FieldValue, Keyword
 from app.translator.core.models.identifier import Identifier
 from app.translator.core.custom_types.tokens import GroupType, LogicalOperatorType
-from app.translator.core.tokenizer import QueryTokenizer
 
 
 class Selection:
@@ -45,7 +44,7 @@ def __init__(self):
 
             list: self.__parse_or_selection
         }
 
-    def __parse_field(self, field_name: str, values: Union[int, str, List[str]]) -> Union[List, Field]:
+    def __parse_field(self, field_name: str, values: Union[int, str, List[str]]) -> Union[List, FieldValue]:
         field_name, *modifier = field_name.split("|") if "|" in field_name else (field_name, "=")
         return self.modifier_manager.generate(field_name=field_name, modifier=modifier, value=values)
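
Not part of the diff itself — a minimal, self-contained Python sketch of how the reworked token model is intended to work: `FieldValue` now wraps a `Field`, and the per-source generic names live on the `Field` behind `set_generic_names_map()` / `get_generic_field_name()` instead of a public dict. `SourceMappingStub`, `FieldsMappingStub`, and the example field names are hypothetical stand-ins for the real mapping objects, which only need `source_id`, `fields_mapping`, and `get_generic_field_name()` here.

```python
from typing import Optional

DEFAULT_MAPPING_NAME = "default"  # mirrors the constant used by the real mapping module


class FieldsMappingStub:
    """Hypothetical stand-in for the real fields_mapping: source field name -> generic field name."""
    def __init__(self, generic_by_source: dict):
        self._generic_by_source = generic_by_source

    def get_generic_field_name(self, source_name: str) -> Optional[str]:
        return self._generic_by_source.get(source_name)


class SourceMappingStub:
    """Hypothetical stand-in exposing only source_id and fields_mapping."""
    def __init__(self, source_id: str, fields_mapping: FieldsMappingStub):
        self.source_id = source_id
        self.fields_mapping = fields_mapping


class Field:
    """Same shape as the reworked Field in core/models/field.py."""
    def __init__(self, source_name: str):
        self.source_name = source_name
        self.__generic_names_map = {}

    def get_generic_field_name(self, source_id: str) -> Optional[str]:
        return self.__generic_names_map.get(source_id)

    def set_generic_names_map(self, source_mappings, default_mapping) -> None:
        # resolve the generic name once per suitable source mapping
        generic_names_map = {
            mapping.source_id: mapping.fields_mapping.get_generic_field_name(self.source_name)
            for mapping in source_mappings
        }
        # always keep a fallback entry for the default mapping
        if DEFAULT_MAPPING_NAME not in generic_names_map:
            generic_names_map[DEFAULT_MAPPING_NAME] = \
                default_mapping.fields_mapping.get_generic_field_name(self.source_name)
        self.__generic_names_map = generic_names_map


# example data is invented purely for illustration
default_mapping = SourceMappingStub(DEFAULT_MAPPING_NAME, FieldsMappingStub({"src_ip": "SourceIp"}))
windows_mapping = SourceMappingStub("windows", FieldsMappingStub({"src_ip": "IpAddress"}))

field = Field(source_name="src_ip")
field.set_generic_names_map([windows_mapping], default_mapping)
print(field.get_generic_field_name("windows"))  # IpAddress
print(field.get_generic_field_name("default"))  # SourceIp
```

Renders and function generators then only ever ask a `Field` for its generic name via `get_generic_field_name(source_mapping.source_id)`, which is why the tokenizer-level `set_field_tokens_generic_names_map()` in the diff reduces to a simple per-token call to `set_generic_names_map()`.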