Skip to content

Commit

Permalink
Detect YAML binary secrets
Browse files Browse the repository at this point in the history
This implements support for high-entropy secrets in binary
values in yaml files.

We encode the binary value into a hex- or base64-encoded
string (based on the plugin), and run the normal entropy
check. If the string is deemed to be high-entropy, we re
encode the string into a yaml binary (using `yaml.dump`)
and strip the `!!binary`. This yaml binary is considered
the secret, and is put into the baseline as normal.

I had to update a test function so that it uses a custom
hex high-entropy detector, since `HighEntropyStringsPlugin`
is now an abstract class.
  • Loading branch information
Victor Zhou committed Aug 29, 2019
1 parent 0af82f6 commit e5e0b3c
Show file tree
Hide file tree
Showing 7 changed files with 108 additions and 20 deletions.
15 changes: 9 additions & 6 deletions detect_secrets/core/potential_secret.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,18 @@ def __init__(
self.type = typ
self.filename = filename
self.lineno = lineno
self.secret_hash = self.hash_secret(secret)
self.set_secret(secret)
self.is_secret = is_secret
self.is_verified = False

# If two PotentialSecrets have the same values for these fields,
# they are considered equal. Note that line numbers aren't included
# in this, because line numbers are subject to change.
self.fields_to_compare = ['filename', 'secret_hash', 'type']

def set_secret(self, secret):
self.secret_hash = self.hash_secret(secret)

# NOTE: Originally, we never wanted to keep the secret value in memory,
# after finding it in the codebase. However, to support verifiable
# secrets (and avoid the pain of re-scanning again), we need to
Expand All @@ -61,11 +69,6 @@ def __init__(
# in the repository.
self.secret_value = secret

# If two PotentialSecrets have the same values for these fields,
# they are considered equal. Note that line numbers aren't included
# in this, because line numbers are subject to change.
self.fields_to_compare = ['filename', 'secret_hash', 'type']

@staticmethod
def hash_secret(secret):
"""This offers a way to coherently test this class,
Expand Down
2 changes: 1 addition & 1 deletion detect_secrets/plugins/common/yaml_file_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _tag_dict_values(self, map_node):
self._create_key_value_pair_for_mapping_node_value(
key='__value__',
value=value.value,
tag='tag:yaml.org,2002:str',
tag=value.tag,
),
self._create_key_value_pair_for_mapping_node_value(
key='__line__',
Expand Down
68 changes: 62 additions & 6 deletions detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
from backports import configparser
except ImportError: # pragma: no cover
import configparser
import base64
import math
import os
import re
import string
from abc import ABCMeta
from abc import abstractmethod
from contextlib import contextmanager

import yaml
Expand Down Expand Up @@ -206,14 +208,23 @@ def _analyze_yaml_file(self, file, filename):

try:
if '__line__' in item and item['__line__'] not in ignored_lines:
potential_secrets.update(
self.analyze_string(
item['__value__'],
item['__line__'],
filename,
),
# An isinstance check doesn't work in py2
# so we need the __is_binary__ field.
string_to_scan = self.decode_binary(item['__value__']) \
if item['__is_binary__'] \
else item['__value__']

secrets = self.analyze_string(
string_to_scan,
item['__line__'],
filename,
)

if item['__is_binary__']:
secrets = self._encode_yaml_binary_secrets(secrets)

potential_secrets.update(secrets)

if '__line__' in item:
continue

Expand All @@ -226,6 +237,39 @@ def _analyze_yaml_file(self, file, filename):

return potential_secrets

def _encode_yaml_binary_secrets(self, secrets):
result = {}
"""The secrets dict format is
`{PotentialSecret: PotentialSecret}`, where both key and
value are the same object. Therefore, we can just mutate
the potential secret once.
"""
for potential_secret in secrets.keys():
secret_in_yaml_format = yaml.dump(
self.encode_to_binary(potential_secret.secret_value),
).replace(
'!!binary ',
'',
)

potential_secret.set_secret(secret_in_yaml_format)

result[potential_secret] = potential_secret

return result

@abstractmethod
def decode_binary(self, bytes_object): # pragma: no cover
"""Converts the bytes to a string which can be checked for
high entropy."""
pass

@abstractmethod
def encode_to_binary(self, string): # pragma: no cover
"""Converts a string (usually a high-entropy secret) to
binary. Usually the inverse of decode_binary."""
pass


class HexHighEntropyString(HighEntropyStringsPlugin):
"""HighEntropyStringsPlugin for hex encoded strings"""
Expand Down Expand Up @@ -278,6 +322,12 @@ def calculate_shannon_entropy(self, data):

return entropy

def decode_binary(self, bytes_object):
return bytes_object.decode('utf-8')

def encode_to_binary(self, string):
return string.encode('utf-8')


class Base64HighEntropyString(HighEntropyStringsPlugin):
"""HighEntropyStringsPlugin for base64 encoded strings"""
Expand All @@ -299,3 +349,9 @@ def __dict__(self):
})

return output

def decode_binary(self, bytes_object):
return base64.b64encode(bytes_object).decode('utf-8')

def encode_to_binary(self, string):
return base64.b64decode(string)
2 changes: 2 additions & 0 deletions test_data/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,5 @@ list_of_keys:
- 234567890a

test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: allowlist secret

high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg==
2 changes: 2 additions & 0 deletions test_data/config2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# This is yaml.dump('2b00042f7481c7b056c4b410d28f33cf'.encode('utf-8'))
high_entropy_hex_binary_secret: !!binary MmIwMDA0MmY3NDgxYzdiMDU2YzRiNDEwZDI4ZjMzY2Y=
2 changes: 1 addition & 1 deletion tests/plugins/common/yaml_file_parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def test_get_ignored_lines(self):
['yaml_value', 'expected_value', 'expected_is_binary'],
[
('string_value', 'string_value', False),
('!!binary YWJjZGVm', 'YWJjZGVm', True),
('!!binary YWJjZGVm', b'abcdef', True),
],
)
def test_possible_secret_format(
Expand Down
37 changes: 31 additions & 6 deletions tests/plugins/high_entropy_strings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
from __future__ import unicode_literals

import codecs
import string

import pytest

from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString
from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString
from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin
from testing.mocks import mock_file_object


Expand Down Expand Up @@ -213,12 +211,13 @@ def test_yaml_file(self):
with open('test_data/config.yaml') as f:
secrets = plugin.analyze(f, 'test_data/config.yaml')

assert len(secrets.values()) == 2
assert len(secrets.values()) == 3
for secret in secrets.values():
location = str(secret).splitlines()[1]
assert location in (
'Location: test_data/config.yaml:3',
'Location: test_data/config.yaml:6',
'Location: test_data/config.yaml:15',
)

def test_env_file(self):
Expand All @@ -234,6 +233,21 @@ def test_env_file(self):
)


class HexHighEntropyStringsWithStandardEntropy(HexHighEntropyString):
"""Copies the HexHighEntropyString class, but keeps the standard
Shannon entropy calculation.
"""

def __init__(self, *args, **kwargs):
super(HexHighEntropyStringsWithStandardEntropy, self).__init__(
*args,
**kwargs
)

def calculate_shannon_entropy(self, data):
return super(HexHighEntropyString, self).calculate_shannon_entropy(data)


class TestHexHighEntropyStrings(HighEntropyStringsTest):

def setup(self):
Expand All @@ -247,10 +261,21 @@ def setup(self):
secret_string='2b00042f7481c7b056c4b410d28f33cf',
)

def test_yaml_file(self):
plugin = HexHighEntropyString(3)
with open('test_data/config2.yaml') as f:
secrets = plugin.analyze(f, 'test_data/config2.yaml')

assert len(secrets.values()) == 1
for secret in secrets.values():
location = str(secret).splitlines()[1]
assert location in (
'Location: test_data/config2.yaml:2',
)

def test_discounts_when_all_numbers(self):
original_scanner = HighEntropyStringsPlugin(
charset=string.hexdigits,
limit=3,
original_scanner = HexHighEntropyStringsWithStandardEntropy(
hex_limit=3,
exclude_lines_regex=None,
)

Expand Down

0 comments on commit e5e0b3c

Please sign in to comment.