
Commit

Improve the leak enhancement with tasks group and retry. Disable TruffleHog
rotemplay committed Jun 28, 2022
1 parent 6f1be4f commit 5a3b2e4
Showing 8 changed files with 131 additions and 72 deletions.
1 change: 1 addition & 0 deletions .env.example
@@ -116,6 +116,7 @@ export ES_PORT=9200
export ES_USER=elastic
export ES_PASS=changeme
export CLONES_DIR=/data/
export REPO_MAX_CLONE_SIZE=100000

# Cron Interval in seconds, default of 15 minutes.
export CRON_INTERVAL=900
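
For context, a minimal sketch (not part of the commit) of how the new `REPO_MAX_CLONE_SIZE` setting is meant to be consumed: the environment value arrives as a string and GitHub reports repository sizes in KB, so both sides are cast to int before comparing, mirroring the enhancer code further down.

```python
import os

# Value read from the environment is a string; GitHub's "size" field is in KB.
REPO_MAX_CLONE_SIZE = os.environ.get('REPO_MAX_CLONE_SIZE', 100000)

def is_too_large(repo_size_kb):
    # Hypothetical helper for illustration; the real check lives in leak_enhancer.py.
    return int(repo_size_kb) > int(REPO_MAX_CLONE_SIZE)
```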
6 changes: 2 additions & 4 deletions README.md
@@ -36,7 +36,6 @@ Based on the **Code C.A.I.N** framework:
- **Enhance results with IOLs** (Indicators Of Leak):
  - Secrets in the found sources (including Git repos commits history):
    - With [Shhgit][1] (using a customized rules list).
    - With [TruffleHog][2].
  - URIs (Including indication of your organization's domains)
  - Emails (Including indication of your organization's email addresses)
  - Contributors
@@ -119,8 +118,7 @@ Make sure that you are not exposing it to the world, and doing your best to **re
## Contributing
Contributions are very welcome.

Please follow our [contribution guidelines and documentation][3].
Please follow our [contribution guidelines and documentation][2].

[1]: <https://github.com/eth0izzle/shhgit>
[2]: <https://github.com/trufflesecurity/trufflehog>
[3]: <https://github.com/Playtika/leaktopus/blob/main/CONTRIBUTING.md>
[2]: <https://github.com/Playtika/leaktopus/blob/main/CONTRIBUTING.md>
3 changes: 3 additions & 0 deletions leaktopus_backend/leaktopus/common/db_handler.py
@@ -289,8 +289,11 @@ def delete_leak_by_url(url):
    cur = db.cursor()

    cur.execute('''DELETE FROM leak WHERE url REGEXP ?''', (url,))
    # @todo Get the leak id and delete by it.
    cur.execute('''DELETE FROM secret WHERE url REGEXP ?''', (url,))
    cur.execute('''DELETE FROM domain WHERE url REGEXP ?''', (url,))
    # cur.execute('''DELETE FROM contributors WHERE url REGEXP ?''', (url,))
    # cur.execute('''DELETE FROM sensitive_keywords WHERE url REGEXP ?''', (url,))

    db.commit()
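
A side note on the REGEXP usage above: Python's sqlite3 module does not define the REGEXP operator by default, so statements like these DELETEs only work once a regexp function has been registered on the connection. A minimal sketch of such a registration (the project presumably already does something equivalent where it opens the database; the path and pattern below are hypothetical, for illustration only):

```python
import re
import sqlite3

def regexp(pattern, value):
    # SQLite rewrites "X REGEXP Y" as regexp(Y, X), i.e. (pattern, value).
    return value is not None and re.search(pattern, value) is not None

db = sqlite3.connect("leaktopus.db")  # hypothetical path
db.create_function("REGEXP", 2, regexp)

# With the function registered, delete_leak_by_url()-style statements work:
db.execute("DELETE FROM leak WHERE url REGEXP ?", (r"github\.com/acme/.*",))
```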

110 changes: 83 additions & 27 deletions leaktopus_backend/leaktopus/common/leak_enhancer.py
@@ -1,62 +1,118 @@
import os
import datetime
import shutil
from git.repo.base import Repo
import subprocess

from leaktopus.app import create_celery_app

celery = create_celery_app()


@celery.task
def leak_enhancer(repos_full_names, scan_id, organization_domains=[], sensitive_keywords=[]):
# How many times to retry the analysis task before failing.
ANALYSIS_MAX_RETRIES = 5
# Interval between analysis task retry.
RETRY_INTERVAL = 30
# Maximum size (in KB) of repository to clone. Repos bigger than that will be skipped.
# @todo Increase and allow to control via environment variable.
REPO_MAX_SIZE = os.environ.get('REPO_MAX_CLONE_SIZE', 100000)


def is_repo_max_size_exceeded(repo_name):
    import requests
    res = requests.get(f"https://api.github.com/repos/{repo_name}")
    if res.status_code == 200:
        repo_metadata = res.json()
        if "size" in repo_metadata:
            return int(repo_metadata["size"]) > int(REPO_MAX_SIZE)

    # Fallback to true so the repository won't be tested.
    return True


@celery.task(bind=True, max_retries=ANALYSIS_MAX_RETRIES)
def enhance_repo(self, repo_name, scan_id, clones_base_dir, organization_domains, sensitive_keywords):
    import datetime
    import leaktopus.common.scans as scans
    from leaktopus.common.secrets_scanner import scan as secrets_scan
    from leaktopus.common.domains_scanner import scan as domains_scan
    from leaktopus.common.contributors_extractor import scan as contributors_extractor
    from leaktopus.common.sensitive_keywords_extractor import scan as sensitive_keywords_extractor

    # Skip step if abort was requested.
    import leaktopus.common.scans as scans
    from leaktopus.models.scan_status import ScanStatus
    if scans.is_scan_aborting(scan_id):
        return repos_full_names
        return True

    # Exit if repos_full_names is empty(failure in previous steps).
    if not repos_full_names:
        return []
    if is_repo_max_size_exceeded(repo_name):
        print(f"Skipped {repo_name} since max size exceeded")
        return True

    # Update the status, since aborting wasn't requested.
    scans.update_scan_status(scan_id, ScanStatus.SCAN_ANALYZING)

    clones_base_dir = os.environ.get('CLONES_DIR', '/tmp/leaktopus-clones/')
    ts = datetime.datetime.now().timestamp()
    repo_path = "https://github.com/" + repo_name + ".git"
    clone_dir = os.path.join(clones_base_dir, str(ts), repo_name.replace("/", "_"))

    for repo_name in repos_full_names:
        if scans.is_scan_aborting(scan_id):
            continue

        repo_path = "https://github.com/" + repo_name + ".git"
        clone_dir = os.path.join(clones_base_dir, str(ts), repo_name.replace("/", "_"))

    try:
        # Now, clone the repo.
        Repo.clone_from(repo_path, clone_dir)

        # Prepare the full Git diff for secrets scan.
        subprocess.call(['sh', '/app/secrets/git-extract-diff'], cwd=clone_dir)
        # Run the secrets scanning tool (shhgit)
        # Extract the commits history from the repository.
        full_diff_dir = os.path.join(clone_dir, 'commits_data')

        domains_scan(repo_path, full_diff_dir, organization_domains)
        sensitive_keywords_extractor(repo_path, full_diff_dir, sensitive_keywords)
        contributors_extractor(repo_path, full_diff_dir, organization_domains)
        secrets_scan(repo_path, full_diff_dir)
    except Exception as e:
        print(f'Exception raised on the analysis of {repo_name}, it would be retried soon.')

        # Cleanup of repo clone.
        # @todo Cleanup even in case of an error.
        shutil.rmtree(clone_dir, ignore_errors=True)
        shutil.rmtree(os.path.join(clones_base_dir, str(ts)), ignore_errors=True)

        # Cleanup of entire analysis directory.
        raise self.retry(exc=e, countdown=RETRY_INTERVAL)

    # Cleanup of repo clone.
    shutil.rmtree(os.path.join(clones_base_dir, str(ts)), ignore_errors=True)


@celery.task
def leak_enhancer(repos_full_names, scan_id, organization_domains=[], sensitive_keywords=[]):
    from celery import group
    import leaktopus.common.scans as scans
    from leaktopus.models.scan_status import ScanStatus

    # Skip step if abort was requested.
    if scans.is_scan_aborting(scan_id):
        return repos_full_names

    # Exit if repos_full_names is empty(failure in previous steps).
    if not repos_full_names:
        return []

    # Update the status, since aborting wasn't requested.
    scans.update_scan_status(scan_id, ScanStatus.SCAN_ANALYZING)

    clones_base_dir = os.environ.get('CLONES_DIR', '/tmp/leaktopus-clones/')

    enhance_tasks = []
    for repo_name in repos_full_names:
        # Create the group of enhancement tasks, one per repository.
        enhance_tasks.append(enhance_repo.s(
            repo_name=repo_name,
            scan_id=scan_id,
            clones_base_dir=clones_base_dir,
            organization_domains=organization_domains,
            sensitive_keywords=sensitive_keywords)
        )

    # Run the enhance in async
    task_group = group(enhance_tasks)
    result_group = task_group.apply_async()

    # Waiting for all analysis tasks to finish.
    while result_group.waiting():
        continue

    if result_group.successful():
        print('Done analyzing leaks.')
    else:
        print('Error in one of the enhancement tasks.')

    return repos_full_names
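
For readers less familiar with the Celery primitives this commit leans on, here is a minimal, self-contained sketch of the same pattern: a bound task that retries itself with a countdown, dispatched as a group whose results are then polled. Task names and the broker URL are illustrative, not Leaktopus code.

```python
from celery import Celery, group

app = Celery('sketch', broker='redis://localhost:6379/0')  # hypothetical broker URL

MAX_RETRIES = 5
RETRY_INTERVAL = 30

@app.task(bind=True, max_retries=MAX_RETRIES)
def analyze_repo(self, repo_name):
    try:
        ...  # clone and scan the repository
    except Exception as exc:
        # Re-queue this task with a delay; Celery gives up after MAX_RETRIES attempts.
        raise self.retry(exc=exc, countdown=RETRY_INTERVAL)
    return True

def analyze_all(repo_names):
    # One sub-task per repository, executed concurrently by the workers.
    result_group = group(analyze_repo.s(name) for name in repo_names).apply_async()
    while result_group.waiting():  # busy-wait, as leak_enhancer() does above
        continue
    return result_group.successful()
```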
2 changes: 0 additions & 2 deletions leaktopus_backend/leaktopus/common/scanner_async.py
@@ -1,12 +1,10 @@
from flask import abort
import os
from github import Github, RateLimitExceededException, BadCredentialsException, GithubException
from datetime import datetime
import re
import json
import leaktopus.common.db_handler as dbh
from leaktopus.app import create_celery_app
from leaktopus.common.leak_handler import leaks_result

celery = create_celery_app()

71 changes: 36 additions & 35 deletions leaktopus_backend/leaktopus/common/secrets_scanner.py
@@ -54,38 +54,38 @@ def parse_secrets_results(repo_path, csv_path):
    return secrets


def trufflehog_parse_output(url, output):
    secrets = []

    results = []
    # Limit the number of secrets to handle.
    for foundIssue in output["foundIssues"]:
        with open(foundIssue, "r") as issue_file:
            results.append(json.loads(issue_file.read()))
    # Structure
    # dict_keys(['date', 'path', 'branch', 'commit'-msg, 'diff', 'stringsFound', 'printDiff', 'commitHash', 'reason'])

    # Iterate over the results, and separate the strings found.
    for res in results:
        for string_found in res["stringsFound"]:
            secrets.append({
                "signature_name": res["reason"],
                "commit_sha": res["commitHash"],
                "match_string": string_found,
                "html_url": url[:-4] + "/commit/" + res["commitHash"]
            })

    return secrets


def scan_git(url):
    output = truffleHog.find_strings(url, printJson=True, surpress_output=True, do_regex=False, do_entropy=True)
    if output["foundIssues"]:
        return trufflehog_parse_output(url, output)

    truffleHog.clean_up(output)

    return []
# def trufflehog_parse_output(url, output):
# secrets = []
#
# results = []
# # Limit the number of secrets to handle.
# for foundIssue in output["foundIssues"]:
# with open(foundIssue, "r") as issue_file:
# results.append(json.loads(issue_file.read()))
# # Structure
# # dict_keys(['date', 'path', 'branch', 'commit'-msg, 'diff', 'stringsFound', 'printDiff', 'commitHash', 'reason'])
#
# # Iterate over the results, and separate the strings found.
# for res in results:
# for string_found in res["stringsFound"]:
# secrets.append({
# "signature_name": res["reason"],
# "commit_sha": res["commitHash"],
# "match_string": string_found,
# "html_url": url[:-4] + "/commit/" + res["commitHash"]
# })
#
# return secrets


# def scan_git(url):
# output = truffleHog.find_strings(url, printJson=True, surpress_output=True, do_regex=False, do_entropy=True)
# if output["foundIssues"]:
# return trufflehog_parse_output(url, output)
#
# truffleHog.clean_up(output)
#
# return []


def scan(url, full_diff_dir):
@@ -103,6 +103,7 @@ def scan(url, full_diff_dir):
    ])

    base_secrets = parse_secrets_results(url, results_path)
    git_secrets = scan_git(url)
    total_secrets = base_secrets + git_secrets
    store_secrets(url, total_secrets)
    # git_secrets = scan_git(url)
    # total_secrets = base_secrets + git_secrets
    # store_secrets(url, total_secrets)
    store_secrets(url, base_secrets)
7 changes: 4 additions & 3 deletions leaktopus_backend/leaktopus/common/sensitive_keywords_extractor.py
@@ -1,4 +1,4 @@
from subprocess import run, CalledProcessError, STDOUT
from subprocess import run, CalledProcessError
import hashlib
from leaktopus.common.sensitive_keywords import add_sensitive_keyword, get_sensitive_keywords
from leaktopus.common.leak_handler import get_leak_by_url
@@ -58,6 +58,7 @@ def parse_sensitive_keywords_results(url, output):
    existing_sensitive_keywords_checksums = get_existing_sensitive_keywords_checksums(leak)

    for row in output.splitlines():
        # @todo Support the case where there is ":" in the keyword.
        commit_hash, keyword = row.lstrip('./').split(': ')
        sensitive_keyword = {
            'keyword': keyword.strip('"'),
@@ -75,8 +76,8 @@ def scan(url, full_diff_dir, sensitive_keywords):
        return False

    # Add the -e prefix to all keywords for our grep.
    grep_keywords = [f'-e "{keyword}"' for keyword in sensitive_keywords]
    grep_cmd = ['grep', '-IroF']
    grep_keywords = [f'-e {keyword}' for keyword in sensitive_keywords]
    grep_cmd = ['grep', '-IiroF']
    grep_cmd.extend(grep_keywords)
    grep_cmd.append('.')
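
As a side note on the @todo above about keywords that contain ": ", one possible hardening (an illustration, not part of the commit) is to split only on the first separator and to pass `-e` and each keyword as separate argv elements, so no quoting is needed when the command runs without a shell:

```python
def build_grep_cmd(sensitive_keywords):
    # -I skip binaries, -i case-insensitive, -r recursive, -o only matches, -F fixed strings.
    cmd = ['grep', '-IiroF']
    for keyword in sensitive_keywords:
        cmd.extend(['-e', keyword])  # separate argv items instead of '-e "kw"'
    cmd.append('.')
    return cmd

def parse_row(row):
    # Split on the first ": " only, so keywords containing ": " stay intact.
    commit_hash, keyword = row.lstrip('./').split(': ', 1)
    return commit_hash, keyword.strip('"')
```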

3 changes: 2 additions & 1 deletion leaktopus_backend/requirements.txt
@@ -21,7 +21,8 @@ celery==5.0.5
PyGithub>=1.55
elasticsearch==7.17.4
GitPython
truffleHog==2.2.1
# Disabled.
# truffleHog==2.2.1
pyjwt>=2.4.0 # not directly required, pinned to avoid a vulnerability

# Documentation libs
