Skip to content

Commit

Permalink
Merge pull request #5932 from drew2a/feature/improve_scrubber
Browse files: browse the repository at this point in the history
Improve Sentry Scrubber
  • Loading branch information
drew2a committed Jan 12, 2021
2 parents 3fedeab + ec8d9e5 commit 176ba77
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 97 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,24 @@ def __init__(self):
r'data\/media',
r'WINNT\\Profiles',
'Documents and Settings',
'Users',
]

self.dict_keys_for_scrub = ['USERNAME', 'USERDOMAIN']
self.event_fields_to_cut = []
self.exclusions = ['local', '127.0.0.1']

self.placeholder_user = '<user>'
self.placeholder_ip = '<IP>'
self.placeholder_hash = '<hash>'
# this is the dict (key: sensitive_info, value: placeholder)
self.sensitive_occurrences = {}

self.exclusions = ['local', '127.0.0.1']
# placeholders
self.create_placeholder = lambda text: f'<{text}>'

self.user_name = None
self.placeholder_user = self.create_placeholder('user')
self.placeholder_ip = self.create_placeholder('IP')
self.placeholder_hash = self.create_placeholder('hash')

# compiled regular expressions
self.re_folders = []
self.re_ip = None
self.re_hash = None
Expand All @@ -52,7 +58,7 @@ def __init__(self):
def _compile_re(self):
"""Compile all regular expressions."""
for folder in self.home_folders:
folder_pattern = r'(?<=' + folder + r'[/\\])\w+(?=[/\\])'
folder_pattern = r'(?<=' + folder + r'[/\\])[\w\s~]+(?=[/\\])'
self.re_folders.append(re.compile(folder_pattern, re.I))

self.re_ip = re.compile(r'(?<!\.)\b(\d{1,3}\.){3}\d{1,3}\b(?!\.)', re.I)
Expand Down Expand Up @@ -118,29 +124,37 @@ def scrub_text(self, text):
if text is None:
return text

def cut_username(m):
def scrub_username(m):
group = m.group(0)
if group in self.exclusions:
return group
self.user_name = group
self.add_sensitive_pair(group, self.placeholder_user)
replacement = self.placeholder_user
return replacement

for regex in self.re_folders:
text = regex.sub(cut_username, text)
text = regex.sub(scrub_username, text)

# cut an IP
def cut_ip(m):
def scrub_ip(m):
return self.placeholder_ip if m.group(0) not in self.exclusions else m.group(0)

text = self.re_ip.sub(cut_ip, text)
text = self.re_ip.sub(scrub_ip, text)

# cut hash
text = self.re_hash.sub(self.placeholder_hash, text)

# replace all user name occurrences in the whole string
if self.user_name:
text = re.sub(r'\b' + re.escape(self.user_name) + r'\b', self.placeholder_user, text)
# replace all sensitive occurrences in the whole string
if self.sensitive_occurrences:
escaped_sensitive_occurrences = [re.escape(user_name) for user_name in self.sensitive_occurrences]
pattern = r'([^<]|^)\b(' + '|'.join(escaped_sensitive_occurrences) + r')\b'

def scrub_value(m):
if m.group(2) not in self.sensitive_occurrences:
return m.group(0)
return m.group(1) + self.sensitive_occurrences[m.group(2)]

text = re.sub(pattern, scrub_value, text)

return text

Expand Down Expand Up @@ -173,6 +187,18 @@ def scrub_entity_recursively(self, entity, depth=10):
return [self.scrub_entity_recursively(item, depth) for item in entity]

if isinstance(entity, dict):
return {key: self.scrub_entity_recursively(entity[key], depth) for key in entity}
result = {}
for key, value in entity.items():
if key in self.dict_keys_for_scrub:
placeholder = self.create_placeholder(key)
self.add_sensitive_pair(value, placeholder)
result[key] = self.scrub_entity_recursively(value, depth)
return result

return entity

def add_sensitive_pair(self, text, placeholder):
    """Remember ``text`` as sensitive, mapped to ``placeholder``.

    The pair is stored in ``self.sensitive_occurrences``. The first
    placeholder registered for a given text wins; repeated calls with the
    same text leave the stored placeholder untouched.

    Args:
        text: the sensitive value to remember. Empty and non-string values
            are ignored.
        placeholder: the replacement stored for ``text``.
    """
    # Keys of ``sensitive_occurrences`` are regex-escaped and joined into one
    # alternation when text is scrubbed. An empty key would make that
    # alternation match at every word boundary, and a non-string key cannot
    # be escaped at all (or may not even be hashable), so skip both.
    if not isinstance(text, str) or not text:
        return

    # setdefault keeps an already registered placeholder as is.
    self.sensitive_occurrences.setdefault(text, placeholder)
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import pytest

from tribler_common.sentry_reporter.sentry_reporter import BREADCRUMBS, CONTEXTS, EXTRA, LOGENTRY, OS_ENVIRON, \
REPORTER, \
STACKTRACE, \
SYSINFO, \
SYS_ARGV
from tribler_common.sentry_reporter.sentry_reporter import (
BREADCRUMBS,
CONTEXTS,
EXTRA,
LOGENTRY,
OS_ENVIRON,
REPORTER,
STACKTRACE,
SYSINFO,
SYS_ARGV,
)
from tribler_common.sentry_reporter.sentry_scrubber import SentryScrubber


Expand All @@ -28,6 +34,8 @@ def test_patterns(scrubber):
assert any(regex.search('/data/media/username3/some/') for regex in scrubber.re_folders)
assert any(regex.search('WINNT\\Profiles\\username\\some') for regex in scrubber.re_folders)
assert any(regex.search('Documents and Settings\\username\\some') for regex in scrubber.re_folders)
assert any(regex.search('C:\\Users\\Some User\\') for regex in scrubber.re_folders)
assert any(regex.search('C:\\Users\\USERNAM~1\\') for regex in scrubber.re_folders)

# ip negative
assert not scrubber.re_ip.search('0.0.0')
Expand Down Expand Up @@ -60,20 +68,18 @@ def test_scrub_path(scrubber):
assert scrubber.scrub_text('/usr/local/path/') == '/usr/local/path/'
assert scrubber.scrub_text('some text') == 'some text'

assert scrubber.user_name is None
assert not scrubber.sensitive_occurrences

# scrub positive

# this particular example is arguably a bug (it yields <<user>>),
# but it does not really matter which placeholder we use;
# hence, let's leave it at that for now.
assert scrubber.scrub_text('/users/user/apps') == \
f'/users/<{scrubber.placeholder_user}>/apps'
assert scrubber.user_name == 'user'
assert scrubber.scrub_text('/users/user/apps') == f'/users/{scrubber.placeholder_user}/apps'
assert 'user' in scrubber.sensitive_occurrences

assert scrubber.scrub_text('/users/username/some/long_path') == \
f'/users/{scrubber.placeholder_user}/some/long_path'
assert scrubber.user_name == 'username'
assert scrubber.scrub_text('/users/username/some/long_path') == f'/users/{scrubber.placeholder_user}/some/long_path'
assert 'username' in scrubber.sensitive_occurrences


def test_scrub_text_ip(scrubber):
Expand All @@ -85,39 +91,41 @@ def test_scrub_text_ip(scrubber):
assert scrubber.scrub_text('0.0.0.1') == scrubber.placeholder_ip
assert scrubber.scrub_text('0.100.0.1') == scrubber.placeholder_ip

assert scrubber.user_name is None
assert not scrubber.sensitive_occurrences


def test_scrub_text_hash(scrubber):
# negative
assert scrubber.scrub_text('0a303030303030303030303030303030303030300') == \
'0a303030303030303030303030303030303030300'
assert scrubber.scrub_text('0a3030303030303030303303030303030303030') == \
'0a3030303030303030303303030303030303030'
assert (
scrubber.scrub_text('0a303030303030303030303030303030303030300') == '0a303030303030303030303030303030303030300'
)
assert scrubber.scrub_text('0a3030303030303030303303030303030303030') == '0a3030303030303030303303030303030303030'

# positive
assert scrubber.scrub_text('3030303030303030303030303030303030303030') == \
scrubber.placeholder_hash
assert scrubber.scrub_text('hash:3030303030303030303030303030303030303030') == \
f'hash:{scrubber.placeholder_hash}'
assert scrubber.scrub_text('3030303030303030303030303030303030303030') == scrubber.placeholder_hash
assert scrubber.scrub_text('hash:3030303030303030303030303030303030303030') == f'hash:{scrubber.placeholder_hash}'

assert scrubber.user_name is None
assert not scrubber.sensitive_occurrences


def test_scrub_text_complex_string(scrubber):
source = 'this is a string that have been sent from ' \
'192.168.1.1(3030303030303030303030303030303030303030) ' \
'located at usr/someuser/path at ' \
'someuser machine(someuserany)'
source = (
'this is a string that have been sent from '
'192.168.1.1(3030303030303030303030303030303030303030) '
'located at usr/someuser/path at '
'someuser machine(someuserany)'
)

actual = scrubber.scrub_text(source)

assert actual == f'this is a string that have been sent from ' \
f'{scrubber.placeholder_ip}({scrubber.placeholder_hash}) ' \
f'located at usr/{scrubber.placeholder_user}/path at ' \
f'{scrubber.placeholder_user} machine(someuserany)'
assert (
actual == f'this is a string that have been sent from '
f'{scrubber.placeholder_ip}({scrubber.placeholder_hash}) '
f'located at usr/{scrubber.placeholder_user}/path at '
f'{scrubber.placeholder_user} machine(someuserany)'
)

assert scrubber.user_name == 'someuser'
assert 'someuser' in scrubber.sensitive_occurrences
assert scrubber.scrub_text('someuser') == scrubber.placeholder_user


Expand All @@ -132,60 +140,64 @@ def test_scrub_event(scrubber):
CONTEXTS: {
REPORTER: {
OS_ENVIRON: {
'PATH': '/users/username/apps'
'USERNAME': 'User Name',
'PATH': '/users/username/apps',
'TMP_WIN': r'C:\Users\USERNAM~1\AppData\Local\Temp',
'USERDOMAIN': 'a',
},
STACKTRACE: ['Traceback (most recent call last):',
'File "/Users/username/Tribler/tribler/src/tribler-gui/tribler_gui/"'],
SYSINFO: {'sys.path': ['/Users/username/Tribler/',
'/Users/username/',
'.']},
STACKTRACE: [
'Traceback (most recent call last):',
'File "/Users/username/Tribler/tribler/src/tribler-gui/tribler_gui/"',
],
SYSINFO: {'sys.path': ['/Users/username/Tribler/', '/Users/username/', '.']},
}
},
EXTRA: {
SYS_ARGV: ['/Users/username/Tribler']
},
LOGENTRY: {
'message': 'Exception with username',
'params': ['Traceback File: /Users/username/Tribler/']
},
EXTRA: {SYS_ARGV: ['/Users/username/Tribler']},
LOGENTRY: {'message': 'Exception with username', 'params': ['Traceback File: /Users/username/Tribler/']},
BREADCRUMBS: {
'values': [
{'type': 'log', 'message': 'Traceback File: /Users/username/Tribler/', 'timestamp': '1'},
{'type': 'log', 'message': 'Traceback File: /Users/username/Tribler/', 'timestamp': '1'},
{'type': 'log', 'message': 'IP: 192.168.1.1', 'timestamp': '2'}
{'type': 'log', 'message': 'IP: 192.168.1.1', 'timestamp': '2'},
]
},

}

assert scrubber.scrub_event(event) == {
CONTEXTS: {
REPORTER: {
OS_ENVIRON: {
'PATH': f'/users/{scrubber.placeholder_user}/apps'
'USERNAME': '<USERNAME>',
'PATH': f'/users/{scrubber.placeholder_user}/apps',
'TMP_WIN': f'C:\\Users\\{scrubber.placeholder_user}\\AppData\\Local\\Temp',
'USERDOMAIN': '<USERDOMAIN>',
},
STACKTRACE: [
'Traceback (most recent call last):',
f'File "/Users/{scrubber.placeholder_user}/Tribler/tribler/src/tribler-gui/tribler_gui/"',
],
SYSINFO: {
'sys.path': [
f'/Users/{scrubber.placeholder_user}/Tribler/',
f'/Users/{scrubber.placeholder_user}/',
'.',
]
},
STACKTRACE: ['Traceback (most recent call last):',
f'File "/Users/{scrubber.placeholder_user}/Tribler/tribler/src/tribler-gui/tribler_gui/"'],
SYSINFO: {'sys.path': [f'/Users/{scrubber.placeholder_user}/Tribler/',
f'/Users/{scrubber.placeholder_user}/',
'.']},
},
},
LOGENTRY: {
'message': f'Exception with {scrubber.placeholder_user}',
'params': [f'Traceback File: /Users/{scrubber.placeholder_user}/Tribler/']
},
EXTRA: {
SYS_ARGV: [f'/Users/{scrubber.placeholder_user}/Tribler']
'params': [f'Traceback File: /Users/{scrubber.placeholder_user}/Tribler/'],
},
EXTRA: {SYS_ARGV: [f'/Users/{scrubber.placeholder_user}/Tribler']},
BREADCRUMBS: {
'values': [
{'type': 'log',
'message': f'Traceback File: /Users/{scrubber.placeholder_user}/Tribler/',
'timestamp': '1'},
{'type': 'log',
'message': f'IP: {scrubber.placeholder_ip}',
'timestamp': '2'}
{
'type': 'log',
'message': f'Traceback File: /Users/{scrubber.placeholder_user}/Tribler/',
'timestamp': '1',
},
{'type': 'log', 'message': f'IP: {scrubber.placeholder_ip}', 'timestamp': '2'},
]
},
}
Expand All @@ -201,7 +213,8 @@ def test_entities_recursively(scrubber):

event = {'some': {'value': [{'path': '/Users/username/Tribler'}]}}
assert scrubber.scrub_entity_recursively(event) == {
'some': {'value': [{'path': f'/Users/{scrubber.placeholder_user}/Tribler'}]}}
'some': {'value': [{'path': f'/Users/{scrubber.placeholder_user}/Tribler'}]}
}
# stop on depth

assert scrubber.scrub_entity_recursively(event) != event
Expand All @@ -215,41 +228,39 @@ def test_scrub_unnecessary_fields(scrubber):
# custom
custom_scrubber = SentryScrubber()
custom_scrubber.event_fields_to_cut = ['new', 'default']
assert custom_scrubber.scrub_event(
{
'default': 'event',
'new': 'field',
'modules': {}

}) == {
'modules': {}
}
assert custom_scrubber.scrub_event({'default': 'event', 'new': 'field', 'modules': {}}) == {'modules': {}}


def test_scrub_text_none(scrubber):
    """Scrubbing ``None`` must hand ``None`` back."""
    scrubbed = scrubber.scrub_text(None)
    assert scrubbed is None


def test_scrub_text_some(scrubber):
def test_scrub_some_text(scrubber):
assert scrubber.scrub_text('some text') == 'some text'
assert scrubber.user_name is None
assert not scrubber.sensitive_occurrences


def test_scrub_dict(scrubber):
assert scrubber.scrub_entity_recursively(None) is None
assert scrubber.scrub_entity_recursively({}) == {}

assert scrubber.scrub_entity_recursively({'PATH': '/home/username/some/',
'USER': 'username'}) \
== {'PATH': f'/home/{scrubber.placeholder_user}/some/',
'USER': scrubber.placeholder_user}
assert scrubber.user_name == 'username'
assert scrubber.scrub_entity_recursively(
{'PATH': '/home/username/some/', 'USERDOMAIN': 'UD', 'USERNAME': 'U', 'REPEATED': 'user username UD U'}
) == {
'PATH': f'/home/{scrubber.placeholder_user}/some/',
'USERDOMAIN': '<USERDOMAIN>',
'USERNAME': '<USERNAME>',
'REPEATED': f'user {scrubber.placeholder_user} <USERDOMAIN> <USERNAME>',
}

assert 'username' in scrubber.sensitive_occurrences.keys()
assert 'UD' in scrubber.sensitive_occurrences.keys()
assert 'U' in scrubber.sensitive_occurrences.keys()


def test_scrub_list(scrubber):
assert scrubber.scrub_entity_recursively(None) is None
assert scrubber.scrub_entity_recursively([]) == []

assert scrubber.scrub_entity_recursively(['/home/username/some/']) == \
[f'/home/{scrubber.placeholder_user}/some/']
assert scrubber.user_name == 'username'
assert scrubber.scrub_entity_recursively(['/home/username/some/']) == [f'/home/{scrubber.placeholder_user}/some/']
assert 'username' in scrubber.sensitive_occurrences

0 comments on commit 176ba77

Please sign in to comment.