diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts
index b84d8e1f2..165602f38 100644
--- a/Build/build-reject-domainset.ts
+++ b/Build/build-reject-domainset.ts
@@ -11,7 +11,6 @@ import createKeywordFilter from './lib/aho-corasick';
 import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
 import { sortDomains } from './lib/stable-sort-domain';
 import { task } from './trace';
-import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
 import * as tldts from 'tldts';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { getPhishingDomains } from './lib/get-phishing-domains';
diff --git a/Build/lib/get-phishing-domains.test.ts b/Build/lib/get-phishing-domains.test.ts
new file mode 100644
index 000000000..07a45906a
--- /dev/null
+++ b/Build/lib/get-phishing-domains.test.ts
@@ -0,0 +1,11 @@
+// eslint-disable-next-line import-x/no-unresolved -- bun
+import { describe, expect, it } from 'bun:test';
+
+import { calcDomainAbuseScore } from './get-phishing-domains';
+
+describe('calcDomainAbuseScore', () => {
+  it('nmdj.pl', () => {
+    // 1 (base) + 2.5 (line is 41 characters long); any subdomain bonus only adds to this
+    expect(calcDomainAbuseScore('.01462ccca801fed55370d79231c876e5.nmdj.pl')).toBeGreaterThanOrEqual(3.5);
+  });
+});
diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts
index 4c1254b8f..80efa3cff 100644
--- a/Build/lib/get-phishing-domains.ts
+++ b/Build/lib/get-phishing-domains.ts
@@ -103,21 +103,23 @@ const BLACK_TLD = new Set([
 ]);
 
 export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
-  const [domainSet, domainSet2, gorhill] = await Promise.all([
-    processDomainLists(span, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
-    isCI
-      ? processDomainLists(span, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
-      : null,
-    getGorhillPublicSuffixPromise()
-  ]);
-  if (domainSet2) {
+  const gorhill = await getGorhillPublicSuffixPromise();
+
+  const domainSet = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
+    const [domainSet, domainSet2] = await Promise.all([
+      processDomainLists(curSpan, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
+      processDomainLists(curSpan, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
+    ]);
+
     SetAdd(domainSet, domainSet2);
-  }
 
-  span.traceChildSync('whitelisting phishing domains', (parentSpan) => {
-    const trieForRemovingWhiteListed = parentSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
+    return domainSet;
+  });
 
-    return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
+  span.traceChildSync('whitelisting phishing domains', (curSpan) => {
+    const trieForRemovingWhiteListed = curSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
+
+    return curSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
       for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
         const white = WHITELIST_DOMAIN[i];
         domainSet.delete(white);
@@ -134,68 +136,28 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
     const domainArr = Array.from(domainSet);
 
     for (let i = 0, len = domainArr.length; i < len; i++) {
-      const line = processLine(domainArr[i]);
-      if (!line) continue;
-
-      const apexDomain = gorhill.getDomain(line);
-      if (!apexDomain) continue;
-
-      domainCountMap[apexDomain] ||= 0;
+      const line = domainArr[i];
 
-      const isPhishingDomainMockingCoJp = line.includes('-co-jp');
-      if (isPhishingDomainMockingCoJp) {
-        domainCountMap[apexDomain] += 0.5;
-      }
+      const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;
 
-      if (line.startsWith('.amaz')) {
-        domainCountMap[apexDomain] += 0.5;
-
-        if (line.startsWith('.amazon-')) {
-          domainCountMap[apexDomain] += 4.5;
-        }
-        if (isPhishingDomainMockingCoJp) {
-          domainCountMap[apexDomain] += 4;
-        }
-      } else if (line.startsWith('.customer')) {
-        domainCountMap[apexDomain] += 0.25;
+      const apexDomain = gorhill.getDomain(safeGorhillLine);
+      if (!apexDomain) {
+        console.log({ line });
+        continue;
       }
 
-      const tld = gorhill.getPublicSuffix(line[0] === '.' ? line.slice(1) : line);
+      const tld = gorhill.getPublicSuffix(safeGorhillLine);
       if (!tld || !BLACK_TLD.has(tld)) continue;
 
-      // Only when tld is black will this 1 weight be added
-      domainCountMap[apexDomain] += 1;
-
-      const lineLen = line.length;
-
-      if (lineLen > 19) {
-        // Add more weight if the domain is long enough
-        if (lineLen > 44) {
-          domainCountMap[apexDomain] += 3.5;
-        } else if (lineLen > 34) {
-          domainCountMap[apexDomain] += 2.5;
-        } else if (lineLen > 29) {
-          domainCountMap[apexDomain] += 1.5;
-        } else if (lineLen > 24) {
-          domainCountMap[apexDomain] += 0.75;
-        } else {
-          domainCountMap[apexDomain] += 0.25;
-        }
-
-        if (domainCountMap[apexDomain] < 5) {
-          const subdomain = tldts.getSubdomain(line, { detectIp: false });
-          if (subdomain?.includes('.')) {
-            domainCountMap[apexDomain] += 1.5;
-          }
-        }
-      }
+      domainCountMap[apexDomain] ||= 0;
+      domainCountMap[apexDomain] += calcDomainAbuseScore(line);
     }
   });
 
   const results = span.traceChildSync('get final phishing results', () => {
     const res: string[] = [];
     for (const domain in domainCountMap) {
-      if (domainCountMap[domain] >= 5) {
+      if (domainCountMap[domain] >= 8) {
         res.push(`.${domain}`);
       }
     }
@@ -204,3 +166,70 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
 
   return [results, domainSet] as const;
 });
+
+/**
+ * Heuristic abuse score for one phishing-list entry.
+ *
+ * Starts at 1 and adds weight for `-co-jp` / `.amaz` / `.amazon-` / `.customer`
+ * patterns, for an overall length above 19 characters, and for long or nested subdomains.
+ *
+ * @param line - domain as it appears in the list; a leading `.` is counted by the length checks
+ * @returns the score that getPhishingDomains accumulates per apex domain
+ */
+export function calcDomainAbuseScore(line: string): number {
+  let weight = 1;
+
+  const isPhishingDomainMockingCoJp = line.includes('-co-jp');
+  if (isPhishingDomainMockingCoJp) {
+    weight += 0.5;
+  }
+
+  if (line.startsWith('.amaz')) {
+    weight += 0.5;
+
+    if (line.startsWith('.amazon-')) {
+      weight += 4.5;
+    }
+    if (isPhishingDomainMockingCoJp) {
+      weight += 4;
+    }
+  } else if (line.includes('.customer')) {
+    weight += 0.25;
+  }
+
+  const lineLen = line.length;
+
+  if (lineLen > 19) {
+    // Add more weight if the domain is long enough
+    if (lineLen > 44) {
+      weight += 3.5;
+    } else if (lineLen > 34) {
+      weight += 2.5;
+    } else if (lineLen > 29) {
+      weight += 1.5;
+    } else if (lineLen > 24) {
+      weight += 0.75;
+    } else {
+      weight += 0.25;
+    }
+  }
+
+  const subdomain = tldts.getSubdomain(line, { detectIp: false });
+
+  if (subdomain) {
+    if (subdomain.slice(1).includes('.')) {
+      weight += 1;
+    }
+    if (subdomain.length > 40) {
+      weight += 3;
+    } else if (subdomain.length > 30) {
+      weight += 1.5;
+    } else if (subdomain.length > 20) {
+      weight += 1;
+    } else if (subdomain.length > 10) {
+      weight += 0.1;
+    }
+  }
+
+  return weight;
+}
diff --git a/Source/domainset/reject_sukka.conf b/Source/domainset/reject_sukka.conf
index 3328fe4a5..4ed68d893 100644
--- a/Source/domainset/reject_sukka.conf
+++ b/Source/domainset/reject_sukka.conf
@@ -302,6 +302,7 @@ inst.360safe.com
 .pages.net.br
 .myenotice.com
 .eu5.net
+.jdie.pl
 
 # --- AD Block ---
 
@@ -733,6 +734,8 @@ comments.gazo.space
 .footprintdns.com
 .measure.office.com
 
+.opinionjet.com
+
 # >> Tracking
 .mktg.tags.f5.com
 .trk.caseads.com