ML expanded and retrained #2276

Workflow file for this run

.github/workflows/benchmark.yml at 9b2006d

	# This workflow runs the benchmark.
	# Splitting the work into separate jobs keeps the data cached even if the benchmark fails.

	name: Benchmark

	# Triggered by pushes to main and by pull requests that target main.
	on:
	  push:
	    branches: [ main ]
	  pull_request:
	    branches: [ main ]

	jobs:

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

	  # Builds the `data` directory from Samsung/CredData and stores it in the
	  # Actions cache. Every Python-related step is skipped on a cache hit.
	  download_data:

	    runs-on: ubuntu-latest

	    steps:

	      - name: Checkout CredData
	        uses: actions/checkout@v4
	        with:
	          repository: Samsung/CredData

	      # checksums.md5 is the input of the cache key below. The same commands
	      # are repeated in the dependent jobs so that they compute the same key.
	      - name: Markup hashing
	        run: |
	          md5sum snapshot.yaml >checksums.md5
	          for f in $(find meta -type f|sort); do md5sum $f; done >>checksums.md5
	          for f in $(find . -maxdepth 1 -type f -name "*.py"|sort); do md5sum $f; done >>checksums.md5
	          cat checksums.md5
	          sha256sum checksums.md5

	      - name: Cache data
	        id: cache-data
	        uses: actions/cache@v4
	        with:
	          path: data
	          key: cred-data-${{ hashFiles('checksums.md5') }}

	      - name: Set up Python 3.8
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        uses: actions/setup-python@v4
	        with:
	          python-version: "3.8"

	      # Gated like the neighbouring steps: pip is only needed when the data
	      # has to be regenerated.
	      - name: Update PIP
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: python -m pip install --upgrade pip

	      - name: Install requirements of CredData
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: python -m pip install --requirement requirements.txt

	      - name: Generate Data Asset
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: python download_data.py --data_dir data --jobs $(nproc)


	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

	  # Pull-request only: scans the cached data with the CredSweeper from the PR
	  # head, runs the CredData benchmark on the report and compares the output
	  # with cicd/benchmark.txt from the PR.
	  run_benchmark:

	    if: ${{ 'pull_request' == github.event_name }}

	    needs: [ download_data ]

	    runs-on: ubuntu-latest

	    steps:

	      - name: Checkout CredData
	        uses: actions/checkout@v4
	        with:
	          repository: Samsung/CredData

	      # Must stay identical to the hashing in download_data so that the
	      # cache key matches.
	      - name: Markup hashing
	        run: |
	          md5sum snapshot.yaml >checksums.md5
	          for f in $(find meta -type f|sort); do md5sum $f; done >>checksums.md5
	          for f in $(find . -maxdepth 1 -type f -name "*.py"|sort); do md5sum $f; done >>checksums.md5
	          cat checksums.md5
	          sha256sum checksums.md5

	      - name: Cache data
	        id: cache-data
	        uses: actions/cache@v4
	        with:
	          path: data
	          key: cred-data-${{ hashFiles('checksums.md5') }}

	      # The data is never regenerated in this job; a cache miss is an error.
	      - name: Failure in case when cache missed
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: exit 1

	      - name: Check Data Asset - DEBUG
	        if: steps.cache-data.outputs.cache-hit == 'true'
	        run: ls -al . && ls -al data

	      - name: Set up Python 3.8
	        uses: actions/setup-python@v4
	        with:
	          python-version: "3.8"

	      - name: Update PIP
	        run: python -m pip install --upgrade pip

	      - name: Install requirements of CredData
	        run: python -m pip install --requirement requirements.txt

	      - name: Checkout CredSweeper
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}
	          path: temp/CredSweeper

	      # Rewrites the CREDSWEEPER constant to a dummy:// URL; grep then fails
	      # the step if the substitution did not take effect.
	      - name: Patch benchmark for PR work
	        run: |
	          sed -i 's|CREDSWEEPER = "https://github.com/Samsung/CredSweeper.git"|CREDSWEEPER = "dummy://github.com/Samsung/CredSweeper.git"|' benchmark/common/constants.py
	          grep --with-filename --line-number 'dummy://github.com/Samsung/CredSweeper.git' benchmark/common/constants.py

	      - name: Install CredSweeper
	        run: python -m pip install temp/CredSweeper

	      # pipefail: without it the step status would be that of `tee`, and a
	      # failing scan would go unnoticed.
	      - name: Run CredSweeper tool
	        run: |
	          set -o pipefail
	          credsweeper --banner --jobs $(nproc) --path data --save-json report.${{ github.event.pull_request.head.sha }}.json | tee credsweeper.${{ github.event.pull_request.head.sha }}.log

	      - name: Run Benchmark
	        run: |
	          set -o pipefail
	          python -m benchmark --scanner credsweeper --load report.${{ github.event.pull_request.head.sha }}.json | tee benchmark.${{ github.event.pull_request.head.sha }}.log

	      # Artifacts are uploaded even when an earlier step failed.
	      - name: Upload CredSweeper log
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: credsweeper
	          path: credsweeper.${{ github.event.pull_request.head.sha }}.log

	      - name: Upload CredSweeper report
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: report
	          path: report.${{ github.event.pull_request.head.sha }}.json

	      - name: Upload benchmark output
	        if: always()
	        uses: actions/upload-artifact@v4
	        with:
	          name: benchmark
	          path: benchmark.${{ github.event.pull_request.head.sha }}.log

	      # diff exits non-zero when the scores differ from the reference file.
	      - name: Verify benchmark scores of the PR
	        run: |
	          diff --ignore-all-space --ignore-blank-lines temp/CredSweeper/cicd/benchmark.txt benchmark.${{ github.event.pull_request.head.sha }}.log

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

	  # Times three CredSweeper installations (released package, PR base, PR head)
	  # on the same data set and fails when the head is more than THRESHOLD
	  # (in tenths of a percent) slower than either of the other two.
	  performance_benchmark:
	    # put the benchmark in single job to keep constant environment during test
	    needs: [ download_data ]

	    runs-on: ubuntu-latest
	    strategy:
	      fail-fast: false
	      matrix:
	        python-version: [ "3.8", "3.9", "3.10", "3.11" ]

	    steps:

	      - name: Checkout CredData
	        uses: actions/checkout@v4
	        with:
	          repository: Samsung/CredData

	      # Must stay identical to the hashing in download_data so that the
	      # cache key matches.
	      - name: Markup hashing
	        run: |
	          md5sum snapshot.yaml >checksums.md5
	          for f in $(find meta -type f|sort); do md5sum $f; done >>checksums.md5
	          for f in $(find . -maxdepth 1 -type f -name "*.py"|sort); do md5sum $f; done >>checksums.md5
	          cat checksums.md5
	          sha256sum checksums.md5

	      - name: Cache data
	        id: cache-data
	        uses: actions/cache@v4
	        with:
	          path: data
	          key: cred-data-${{ hashFiles('checksums.md5') }}

	      # The data is never regenerated in this job; a cache miss is an error.
	      - name: Failure in case when cache missed
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: exit 1

	      - name: Exclude very huge data
	        if: steps.cache-data.outputs.cache-hit == 'true'
	        run: rm -rf data/0* data/2* data/7* data/8* data/a* data/b* data/d* data/e* data/f*

	      - name: Set up Python ${{ matrix.python-version }}
	        uses: actions/setup-python@v4
	        with:
	          python-version: ${{ matrix.python-version }}

	      # Writes 7654321 lines of 8-digit hex counters into data/test.text.
	      - name: Add synthetic huge data
	        if: steps.cache-data.outputs.cache-hit == 'true'
	        run: python -c "for n in range(7654321):print(f'{n:08x}')" >data/test.text

	      - name: Update PIP
	        run: python -m pip install --upgrade pip

	      - name: Install released CredSweeper
	        run: |
	          python -m pip install credsweeper
	          # check the banner
	          credsweeper --banner

	      # Wall-clock seconds are exported through GITHUB_ENV for the
	      # "Compare results" step; a zero or negative duration fails the step.
	      - name: Run performance benchmark RELEASE
	        run: |
	          START_TIME=$(date +%s)
	          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
	          FINISH_TIME=$(date +%s)
	          RELEASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
	          if [ 0 -lt ${RELEASE_TIME} ]; then
	            echo Elapsed $(date -ud "@${RELEASE_TIME}" +"%H:%M:%S")
	          else
	            echo "Wrong result '${RELEASE_TIME}'"
	            exit 1
	          fi
	          echo "RELEASE_TIME=${RELEASE_TIME}" >> "$GITHUB_ENV"

	      - name: Uninstall released CredSweeper
	        run: |
	          python -m pip uninstall -y credsweeper

	      # NOTE(review): on push events github.event.pull_request is not set, so
	      # `ref` is empty here and in the head checkout below; presumably the
	      # checkout action then falls back to the triggering commit - confirm.
	      - name: Checkout base CredSweeper
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.base.sha }}
	          path: temp/CredSweeper.base

	      - name: Install base CredSweeper
	        run: |
	          python -m pip install temp/CredSweeper.base
	          # check the banner
	          credsweeper --banner

	      - name: Run performance benchmark BASE
	        run: |
	          START_TIME=$(date +%s)
	          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
	          FINISH_TIME=$(date +%s)
	          BASE_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
	          if [ 0 -lt ${BASE_TIME} ]; then
	            echo Elapsed $(date -ud "@${BASE_TIME}" +"%H:%M:%S")
	          else
	            echo "Wrong result '${BASE_TIME}'"
	            exit 1
	          fi
	          echo "BASE_TIME=${BASE_TIME}" >> "$GITHUB_ENV"

	      - name: Checkout current CredSweeper
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}
	          path: temp/CredSweeper.head

	      - name: Install current CredSweeper
	        run: |
	          python -m pip install temp/CredSweeper.head
	          # check the banner
	          credsweeper --banner

	      - name: Run performance benchmark CURRENT
	        run: |
	          START_TIME=$(date +%s)
	          /usr/bin/time --verbose credsweeper --log error --path data --save-json /dev/null
	          FINISH_TIME=$(date +%s)
	          HEAD_TIME=$(( ${FINISH_TIME} - ${START_TIME} ))
	          if [ 0 -lt ${HEAD_TIME} ]; then
	            echo Elapsed $(date -ud "@${HEAD_TIME}" +"%H:%M:%S")
	          else
	            echo "Wrong result '${HEAD_TIME}'"
	            exit 1
	          fi
	          echo "HEAD_TIME=${HEAD_TIME}" >> "$GITHUB_ENV"

	      # `d` is the relative difference in tenths of a percent (integer maths).
	      # Only a slowdown above THRESHOLD sets a failing exit code; speed-ups are
	      # reported but never fail the job.
	      - name: Compare results
	        run: |
	          exit_code=0
	          LOW_DELTA=10
	          THRESHOLD=250

	          # RELEASE
	          if [ ${RELEASE_TIME} -le ${HEAD_TIME} ]; then
	            d=$(( 1000 * ( ${HEAD_TIME} - ${RELEASE_TIME} ) / ${RELEASE_TIME} ))
	            echo "RELEASE_TIME (sec) = ${RELEASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
	            if [ $LOW_DELTA -ge ${d} ]; then
	              echo "Almost the same."
	            elif [ $THRESHOLD -lt ${d} ]; then
	              echo "Significantly Slowdown."
	              exit_code=1
	            else
	              echo "Slowdown."
	            fi
	          else
	            d=$(( 1000 * ( ${RELEASE_TIME} - ${HEAD_TIME} ) / ${RELEASE_TIME} ))
	            echo "RELEASE_TIME (sec) = ${RELEASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
	            if [ $LOW_DELTA -ge ${d} ]; then
	              echo "Almost the same."
	            elif [ $THRESHOLD -lt ${d} ]; then
	              echo "Significantly speed-up."
	            else
	              echo "Speed-up."
	            fi
	          fi

	          # BASE
	          if [ ${BASE_TIME} -le ${HEAD_TIME} ]; then
	            d=$(( 1000 * ( ${HEAD_TIME} - ${BASE_TIME} ) / ${BASE_TIME} ))
	            echo "BASE_TIME (sec) = ${BASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
	            if [ $LOW_DELTA -ge ${d} ]; then
	              echo "Almost the same."
	            elif [ $THRESHOLD -lt ${d} ]; then
	              echo "Significantly Slowdown."
	              exit_code=1
	            else
	              echo "Slowdown."
	            fi
	          else
	            d=$(( 1000 * ( ${BASE_TIME} - ${HEAD_TIME} ) / ${BASE_TIME} ))
	            echo "BASE_TIME (sec) = ${BASE_TIME}, current (sec) = ${HEAD_TIME}. Diff (% * 10): ${d}"
	            if [ $LOW_DELTA -ge ${d} ]; then
	              echo "Almost the same."
	            elif [ $THRESHOLD -lt ${d} ]; then
	              echo "Significantly speed-up."
	            else
	              echo "Speed-up."
	            fi
	          fi

	          exit ${exit_code}

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
	  # Smoke test of the ML training pipeline: trains for 2 epochs on a reduced
	  # data set, converts the saved model to ONNX in place of the packaged model
	  # and checks that a scan of tests/samples still reports at least 100 items.
	  experiment:
	    # the ml train test is placed here to use cached data set
	    needs: [ download_data ]

	    runs-on: ubuntu-latest

	    steps:

	      - name: Checkout CredData
	        uses: actions/checkout@v4
	        with:
	          repository: Samsung/CredData

	      # Must stay identical to the hashing in download_data so that the
	      # cache key matches.
	      - name: Markup hashing
	        run: |
	          md5sum snapshot.yaml >checksums.md5
	          for f in $(find meta -type f|sort); do md5sum $f; done >>checksums.md5
	          for f in $(find . -maxdepth 1 -type f -name "*.py"|sort); do md5sum $f; done >>checksums.md5
	          cat checksums.md5
	          sha256sum checksums.md5

	      - name: Cache data
	        id: cache-data
	        uses: actions/cache@v4
	        with:
	          path: data
	          key: cred-data-${{ hashFiles('checksums.md5') }}

	      # The data is never regenerated in this job; a cache miss is an error.
	      - name: Failure in case when cache missed
	        if: steps.cache-data.outputs.cache-hit != 'true'
	        run: exit 1

	      # data/ and meta/ are moved under <workspace>/CredData, the directory
	      # later passed to main.py via --data.
	      - name: Exclude some sets for speed-up
	        run: |
	          rm -rf data/2* data/8* data/b*
	          rm -rf meta/2* meta/8* meta/b*
	          mkdir -vp ${{ github.workspace }}/CredData
	          mv data ${{ github.workspace }}/CredData/
	          mv meta ${{ github.workspace }}/CredData/

	      # Runs unconditionally: the job has already failed on a cache miss, so a
	      # "cache missed" condition here would mean Python 3.8 is never set up.
	      - name: Set up Python 3.8
	        uses: actions/setup-python@v4
	        with:
	          python-version: "3.8"

	      - name: Update PIP
	        run: python -m pip install --upgrade pip

	      - name: Checkout current CredSweeper
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}
	          path: CredSweeper.head

	      - name: Install development packages
	        run: python -m pip install --requirement CredSweeper.head/requirements.txt

	      - name: Install experimental packages
	        # some versions will be changed for compatibility
	        run: python -m pip install --requirement CredSweeper.head/experiment/requirements.txt

	      - name: dbg
	        run: echo ${{ github.workspace }} && ls -al ${{ github.workspace }} && tree ${{ github.workspace }}

	      - name: Run the experiment
	        run: |
	          cd CredSweeper.head
	          ls -al #dbg
	          pwd #dbg
	          export PYTHONPATH=$(pwd):${PYTHONPATH}
	          cd experiment
	          # check whether credsweeper is available as module
	          python -m credsweeper --banner
	          # use only 2 epochs for the test
	          sed -i 's/epochs=42,/epochs=2,/' main.py
	          python main.py --data ${{ github.workspace }}/CredData -j $(( 2 * $(nproc) ))
	          ls -al results #dbg
	          python -m tf2onnx.convert --saved-model $(find results -mindepth 1 -maxdepth 1 -type d) --output ../credsweeper/ml_model/ml_model.onnx --verbose
	          # dbg
	          git diff
	          # crc32 should be changed
	          python -m credsweeper --banner
	          # run quick scan
	          python -m credsweeper --log debug --path ../tests/samples --save-json
	          # the report is read back from output.json; fewer than 100 entries fails the step
	          NEW_MODEL_FOUND_SAMPLES=$(jq '.|length' output.json)
	          if [ 100 -gt ${NEW_MODEL_FOUND_SAMPLES} ]; then
	            echo "Failure: found ${NEW_MODEL_FOUND_SAMPLES} credentials"
	            exit 1
	          fi

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

	  # Posts a "[BMT Request]" message with the PR head commit URL to the webhook
	  # stored in the SLACK_URL secret. Runs only for pull requests whose head
	  # branch lives in Samsung/CredSweeper itself.
	  run_doc_benchmark:
	    runs-on: ubuntu-latest
	    if: ${{ 'Samsung/CredSweeper' == github.event.pull_request.head.repo.full_name }}
	    steps:
	      - name: Checkout CredSweeper
	        if: ${{ 'pull_request' == github.event_name }}
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}

	      - name: Send cURL request with the commit SHA
	        if: ${{ 'pull_request' == github.event_name }}
	        # The secret is handed over through the environment and quoted, so
	        # the shell cannot split or glob the URL.
	        env:
	          SLACK_URL: ${{ secrets.SLACK_URL }}
	        run: |
	          COMMIT_SHA=$(git rev-parse HEAD)
	          curl -X POST "${SLACK_URL}" \
	            --data-urlencode \
	            "payload={'text':'[BMT Request] ${{ github.event.repository.html_url }}/commit/${COMMIT_SHA}'}"

	# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

ML expanded and retrained #2276

Workflow file

ML expanded and retrained #2276

Jobs

Run details

Workflow file for this run