
Commit

Improve the leak enhancement with tasks group and retry. Disable TruffleHog
rotemplay committed Jun 28, 2022
1 parent 6f1be4f commit 5a3b2e4
Showing 8 changed files with 131 additions and 72 deletions.
1 change: 1 addition & 0 deletions .env.example
@@ -116,6 +116,7 @@ export ES_PORT=9200
export ES_USER=elastic
export ES_PASS=changeme
export CLONES_DIR=/data/
export REPO_MAX_CLONE_SIZE=100000

# Cron Interval in seconds, default of 15 minutes.
export CRON_INTERVAL=900
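
For context, a minimal sketch (not part of the commit) of how the new `REPO_MAX_CLONE_SIZE` setting is meant to be consumed: the environment value arrives as a string and GitHub reports repository sizes in KB, so both sides are cast to int before comparing, mirroring the enhancer code further down.

```python
import os

# Value read from the environment is a string; GitHub's "size" field is in KB.
REPO_MAX_CLONE_SIZE = os.environ.get('REPO_MAX_CLONE_SIZE', 100000)

def is_too_large(repo_size_kb):
    # Hypothetical helper for illustration; the real check lives in leak_enhancer.py.
    return int(repo_size_kb) > int(REPO_MAX_CLONE_SIZE)
```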
6 changes: 2 additions & 4 deletions README.md
@@ -36,7 +36,6 @@ Based on the **Code C.A.I.N** framework:
- **Enhance results with IOLs** (Indicators Of Leak):
  - Secrets in the found sources (including Git repos commits history):
    - With [Shhgit][1] (using a customized rules list).
    - With [TruffleHog][2].
  - URIs (Including indication of your organization's domains)
  - Emails (Including indication of your organization's email addresses)
  - Contributors
@@ -119,8 +118,7 @@ Make sure that you are not exposing it to the world, and doing your best to **re
## Contributing
Contributions are very welcome.

Please follow our [contribution guidelines and documentation][3].
Please follow our [contribution guidelines and documentation][2].

[1]: <https://github.com/eth0izzle/shhgit>
[2]: <https://github.com/trufflesecurity/trufflehog>
[3]: <https://github.com/Playtika/leaktopus/blob/main/CONTRIBUTING.md>
[2]: <https://github.com/Playtika/leaktopus/blob/main/CONTRIBUTING.md>
3 changes: 3 additions & 0 deletions leaktopus_backend/leaktopus/common/db_handler.py
@@ -289,8 +289,11 @@ def delete_leak_by_url(url):
    cur = db.cursor()

    cur.execute('''DELETE FROM leak WHERE url REGEXP ?''', (url,))
    # @todo Get the leak id and delete by it.
    cur.execute('''DELETE FROM secret WHERE url REGEXP ?''', (url,))
    cur.execute('''DELETE FROM domain WHERE url REGEXP ?''', (url,))
    # cur.execute('''DELETE FROM contributors WHERE url REGEXP ?''', (url,))
    # cur.execute('''DELETE FROM sensitive_keywords WHERE url REGEXP ?''', (url,))

    db.commit()
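
A side note on the REGEXP usage above: Python's sqlite3 module does not define the REGEXP operator by default, so statements like these DELETEs only work once a regexp function has been registered on the connection. A minimal sketch of such a registration (the project presumably already does something equivalent where it opens the database; the path and pattern below are hypothetical, for illustration only):

```python
import re
import sqlite3

def regexp(pattern, value):
    # SQLite rewrites "X REGEXP Y" as regexp(Y, X), i.e. (pattern, value).
    return value is not None and re.search(pattern, value) is not None

db = sqlite3.connect("leaktopus.db")  # hypothetical path
db.create_function("REGEXP", 2, regexp)

# With the function registered, delete_leak_by_url()-style statements work:
db.execute("DELETE FROM leak WHERE url REGEXP ?", (r"github\.com/acme/.*",))
```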

110 changes: 83 additions & 27 deletions leaktopus_backend/leaktopus/common/leak_enhancer.py
@@ -1,62 +1,118 @@
import os
import datetime
import shutil
from git.repo.base import Repo
import subprocess

from leaktopus.app import create_celery_app

celery = create_celery_app()


@celery.task
def leak_enhancer(repos_full_names, scan_id, organization_domains=[], sensitive_keywords=[]):
# How many times to retry the analysis task before failing.
ANALYSIS_MAX_RETRIES = 5
# Interval between analysis task retry.
RETRY_INTERVAL = 30
# Maximum size (in KB) of repository to clone. Repos bigger than that will be skipped.
# @todo Increase and allow to control via environment variable.
REPO_MAX_SIZE = os.environ.get('REPO_MAX_CLONE_SIZE', 100000)


def is_repo_max_size_exceeded(repo_name):
    import requests
    res = requests.get(f"https://api.github.com/repos/{repo_name}")
    if res.status_code == 200:
        repo_metadata = res.json()
        if "size" in repo_metadata:
            return int(repo_metadata["size"]) > int(REPO_MAX_SIZE)

    # Fallback to true so the repository won't be tested.
    return True


@celery.task(bind=True, max_retries=ANALYSIS_MAX_RETRIES)
def enhance_repo(self, repo_name, scan_id, clones_base_dir, organization_domains, sensitive_keywords):
    import datetime
    import leaktopus.common.scans as scans
    from leaktopus.common.secrets_scanner import scan as secrets_scan
    from leaktopus.common.domains_scanner import scan as domains_scan
    from leaktopus.common.contributors_extractor import scan as contributors_extractor
    from leaktopus.common.sensitive_keywords_extractor import scan as sensitive_keywords_extractor

    # Skip step if abort was requested.
    import leaktopus.common.scans as scans
    from leaktopus.models.scan_status import ScanStatus
    if scans.is_scan_aborting(scan_id):
        return repos_full_names
        return True

    # Exit if repos_full_names is empty(failure in previous steps).
    if not repos_full_names:
        return []
    if is_repo_max_size_exceeded(repo_name):
        print(f"Skipped {repo_name} since max size exceeded")
        return True

    # Update the status, since aborting wasn't requested.
    scans.update_scan_status(scan_id, ScanStatus.SCAN_ANALYZING)

    clones_base_dir = os.environ.get('CLONES_DIR', '/tmp/leaktopus-clones/')
    ts = datetime.datetime.now().timestamp()
    repo_path = "https://github.com/" + repo_name + ".git"
    clone_dir = os.path.join(clones_base_dir, str(ts), repo_name.replace("/", "_"))

    for repo_name in repos_full_names:
        if scans.is_scan_aborting(scan_id):
            continue

        repo_path = "https://github.com/" + repo_name + ".git"
        clone_dir = os.path.join(clones_base_dir, str(ts), repo_name.replace("/", "_"))

    try:
        # Now, clone the repo.
        Repo.clone_from(repo_path, clone_dir)

        # Prepare the full Git diff for secrets scan.
        subprocess.call(['sh', '/app/secrets/git-extract-diff'], cwd=clone_dir)
        # Run the secrets scanning tool (shhgit)
        # Extract the commits history from the repository.
        full_diff_dir = os.path.join(clone_dir, 'commits_data')

        domains_scan(repo_path, full_diff_dir, organization_domains)
        sensitive_keywords_extractor(repo_path, full_diff_dir, sensitive_keywords)
        contributors_extractor(repo_path, full_diff_dir, organization_domains)
        secrets_scan(repo_path, full_diff_dir)
    except Exception as e:
        print(f'Exception raised on the analysis of {repo_name}, it would be retried soon.')

        # Cleanup of repo clone.
        # @todo Cleanup even in case of an error.
        shutil.rmtree(clone_dir, ignore_errors=True)
        shutil.rmtree(os.path.join(clones_base_dir, str(ts)), ignore_errors=True)

        # Cleanup of entire analysis directory.
        raise self.retry(exc=e, countdown=RETRY_INTERVAL)

    # Cleanup of repo clone.
    shutil.rmtree(os.path.join(clones_base_dir, str(ts)), ignore_errors=True)


@celery.task
def leak_enhancer(repos_full_names, scan_id, organization_domains=[], sensitive_keywords=[]):
    from celery import group
    import leaktopus.common.scans as scans
    from leaktopus.models.scan_status import ScanStatus

    # Skip step if abort was requested.
    if scans.is_scan_aborting(scan_id):
        return repos_full_names

    # Exit if repos_full_names is empty(failure in previous steps).
    if not repos_full_names:
        return []

    # Update the status, since aborting wasn't requested.
    scans.update_scan_status(scan_id, ScanStatus.SCAN_ANALYZING)

    clones_base_dir = os.environ.get('CLONES_DIR', '/tmp/leaktopus-clones/')

    enhance_tasks = []
    for repo_name in repos_full_names:
        # Create the group of enhancement tasks, one per repository.
        enhance_tasks.append(enhance_repo.s(
            repo_name=repo_name,
            scan_id=scan_id,
            clones_base_dir=clones_base_dir,
            organization_domains=organization_domains,
            sensitive_keywords=sensitive_keywords)
        )

    # Run the enhance in async
    task_group = group(enhance_tasks)
    result_group = task_group.apply_async()

    # Waiting for all analysis tasks to finish.
    while result_group.waiting():
        continue

    if result_group.successful():
        print('Done analyzing leaks.')
    else:
        print('Error in one of the enhancement tasks.')

    return repos_full_names
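
For readers less familiar with the Celery primitives this commit leans on, here is a minimal, self-contained sketch of the same pattern: a bound task that retries itself with a countdown, dispatched as a group whose results are then polled. Task names and the broker URL are illustrative, not Leaktopus code.

```python
from celery import Celery, group

app = Celery('sketch', broker='redis://localhost:6379/0')  # hypothetical broker URL

MAX_RETRIES = 5
RETRY_INTERVAL = 30

@app.task(bind=True, max_retries=MAX_RETRIES)
def analyze_repo(self, repo_name):
    try:
        ...  # clone and scan the repository
    except Exception as exc:
        # Re-queue this task with a delay; Celery gives up after MAX_RETRIES attempts.
        raise self.retry(exc=exc, countdown=RETRY_INTERVAL)
    return True

def analyze_all(repo_names):
    # One sub-task per repository, executed concurrently by the workers.
    result_group = group(analyze_repo.s(name) for name in repo_names).apply_async()
    while result_group.waiting():  # busy-wait, as leak_enhancer() does above
        continue
    return result_group.successful()
```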
2 changes: 0 additions & 2 deletions leaktopus_backend/leaktopus/common/scanner_async.py
@@ -1,12 +1,10 @@
from flask import abort
import os
from github import Github, RateLimitExceededException, BadCredentialsException, GithubException
from datetime import datetime
import re
import json
import leaktopus.common.db_handler as dbh
from leaktopus.app import create_celery_app
from leaktopus.common.leak_handler import leaks_result

celery = create_celery_app()

71 changes: 36 additions & 35 deletions leaktopus_backend/leaktopus/common/secrets_scanner.py
@@ -54,38 +54,38 @@ def parse_secrets_results(repo_path, csv_path):
    return secrets


def trufflehog_parse_output(url, output):
    secrets = []

    results = []
    # Limit the number of secrets to handle.
    for foundIssue in output["foundIssues"]:
        with open(foundIssue, "r") as issue_file:
            results.append(json.loads(issue_file.read()))
    # Structure
    # dict_keys(['date', 'path', 'branch', 'commit'-msg, 'diff', 'stringsFound', 'printDiff', 'commitHash', 'reason'])

    # Iterate over the results, and separate the strings found.
    for res in results:
        for string_found in res["stringsFound"]:
            secrets.append({
                "signature_name": res["reason"],
                "commit_sha": res["commitHash"],
                "match_string": string_found,
                "html_url": url[:-4] + "/commit/" + res["commitHash"]
            })

    return secrets


def scan_git(url):
    output = truffleHog.find_strings(url, printJson=True, surpress_output=True, do_regex=False, do_entropy=True)
    if output["foundIssues"]:
        return trufflehog_parse_output(url, output)

    truffleHog.clean_up(output)

    return []
# def trufflehog_parse_output(url, output):
# secrets = []
#
# results = []
# # Limit the number of secrets to handle.
# for foundIssue in output["foundIssues"]:
# with open(foundIssue, "r") as issue_file:
# results.append(json.loads(issue_file.read()))
# # Structure
# # dict_keys(['date', 'path', 'branch', 'commit'-msg, 'diff', 'stringsFound', 'printDiff', 'commitHash', 'reason'])
#
# # Iterate over the results, and separate the strings found.
# for res in results:
# for string_found in res["stringsFound"]:
# secrets.append({
# "signature_name": res["reason"],
# "commit_sha": res["commitHash"],
# "match_string": string_found,
# "html_url": url[:-4] + "/commit/" + res["commitHash"]
# })
#
# return secrets


# def scan_git(url):
# output = truffleHog.find_strings(url, printJson=True, surpress_output=True, do_regex=False, do_entropy=True)
# if output["foundIssues"]:
# return trufflehog_parse_output(url, output)
#
# truffleHog.clean_up(output)
#
# return []


def scan(url, full_diff_dir):
@@ -103,6 +103,7 @@ def scan(url, full_diff_dir):
    ])

    base_secrets = parse_secrets_results(url, results_path)
    git_secrets = scan_git(url)
    total_secrets = base_secrets + git_secrets
    store_secrets(url, total_secrets)
    # git_secrets = scan_git(url)
    # total_secrets = base_secrets + git_secrets
    # store_secrets(url, total_secrets)
    store_secrets(url, base_secrets)
7 changes: 4 additions & 3 deletions leaktopus_backend/leaktopus/common/sensitive_keywords_extractor.py
@@ -1,4 +1,4 @@
from subprocess import run, CalledProcessError, STDOUT
from subprocess import run, CalledProcessError
import hashlib
from leaktopus.common.sensitive_keywords import add_sensitive_keyword, get_sensitive_keywords
from leaktopus.common.leak_handler import get_leak_by_url
@@ -58,6 +58,7 @@ def parse_sensitive_keywords_results(url, output):
    existing_sensitive_keywords_checksums = get_existing_sensitive_keywords_checksums(leak)

    for row in output.splitlines():
        # @todo Support the case where there is ":" in the keyword.
        commit_hash, keyword = row.lstrip('./').split(': ')
        sensitive_keyword = {
            'keyword': keyword.strip('"'),
@@ -75,8 +76,8 @@ def scan(url, full_diff_dir, sensitive_keywords):
        return False

    # Add the -e prefix to all keywords for our grep.
    grep_keywords = [f'-e "{keyword}"' for keyword in sensitive_keywords]
    grep_cmd = ['grep', '-IroF']
    grep_keywords = [f'-e {keyword}' for keyword in sensitive_keywords]
    grep_cmd = ['grep', '-IiroF']
    grep_cmd.extend(grep_keywords)
    grep_cmd.append('.')
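
As a side note on the @todo above about keywords that contain ": ", one possible hardening (an illustration, not part of the commit) is to split only on the first separator and to pass `-e` and each keyword as separate argv elements, so no quoting is needed when the command runs without a shell:

```python
def build_grep_cmd(sensitive_keywords):
    # -I skip binaries, -i case-insensitive, -r recursive, -o only matches, -F fixed strings.
    cmd = ['grep', '-IiroF']
    for keyword in sensitive_keywords:
        cmd.extend(['-e', keyword])  # separate argv items instead of '-e "kw"'
    cmd.append('.')
    return cmd

def parse_row(row):
    # Split on the first ": " only, so keywords containing ": " stay intact.
    commit_hash, keyword = row.lstrip('./').split(': ', 1)
    return commit_hash, keyword.strip('"')
```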

3 changes: 2 additions & 1 deletion leaktopus_backend/requirements.txt
@@ -21,7 +21,8 @@ celery==5.0.5
PyGithub>=1.55
elasticsearch==7.17.4
GitPython
truffleHog==2.2.1
# Disabled.
# truffleHog==2.2.1
pyjwt>=2.4.0 # not directly required, pinned to avoid a vulnerability

# Documentation libs
