Skip to content

Commit

Permalink
Improve Regex
Browse files Browse the repository at this point in the history
Now uses the third-party 'regex' module instead of the built-in 're' module, because 'regex' supports \p{...} Unicode category matching
  • Loading branch information
ThioJoe committed Feb 16, 2022
1 parent 9ea7f50 commit 0988c29
Show file tree
Hide file tree
Showing 6 changed files with 31 additions and 34 deletions.
32 changes: 16 additions & 16 deletions Scripts/filter_variables.py

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion Scripts/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from unicodedata import category as unicode_category
from datetime import datetime

import re
import rtfunicode
import os
import requests
Expand Down
23 changes: 13 additions & 10 deletions Scripts/operations.py
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ def check_against_filter(current, filtersDict, miscData, config, currentCommentD
# Receive Variables
compiledRegexDict = smartFilter['compiledRegexDict']
compiledObfuRegexDict = smartFilter['compiledObfuRegexDict']
basicFilterDict = smartFilter['basicFilterDict']
preciseRegexDict = smartFilter['preciseRegexDict']
numberFilterSet = smartFilter['spammerNumbersSet']
compiledNumRegex = smartFilter['compiledNumRegex']
Expand Down Expand Up @@ -859,9 +858,13 @@ def findObf(expression, chars, stringToSearch):
else:
for match in result:
lowerChars = chars.lower()
# Strips off buffer characters and specified unicode categories
for bufferChar in compiledRegexDict['bufferChars']:
match = match.strip(bufferChar)
#if match.lower() != lowerWord and match.lower() != lowerWord.translate(ignoredConfusablesConverter):
while unicodedata.category(match[0]) in smartFilter['unicodeCategoriesStrip']:
match = match[1:]
while unicodedata.category(match[-1]) in smartFilter['unicodeCategoriesStrip']:
match = match[:-1]
if any(char not in lowerChars for char in match) and any(char not in lowerChars.translate(ignoredConfusablesConverter) for char in match):
return True

Expand All @@ -880,10 +883,10 @@ def check_if_only_link(string):
# ------------------------------------------------------------------------

# Normalize usernames and text, remove multiple whitespace and invisible chars
commentText = re.sub(' +', ' ', commentText)
commentTextNormalized = re.sub(' +', ' ', commentText)
# https://stackoverflow.com/a/49695605/17312053
commentText = "".join(k if k in bufferChars else "".join(v) for k,v in itertools.groupby(commentText, lambda c: c))
commentText = remove_unicode_categories(commentText)
commentTextNormalized = "".join(k if k in bufferChars else "".join(v) for k,v in itertools.groupby(commentText, lambda c: c))
commentTextNormalized = remove_unicode_categories(commentText)

authorChannelName = re.sub(' +', ' ', authorChannelName)
authorChannelName = remove_unicode_categories(authorChannelName)
Expand Down Expand Up @@ -915,19 +918,19 @@ def check_if_only_link(string):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif compiledRegexDict['blackAdWords'].search(authorChannelName):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif compiledRegexDict['textBlackWords'].search(commentText):
elif compiledRegexDict['textBlackWords'].search(commentTextNormalized):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif any(findObf(expressionPair[0], expressionPair[1], commentText) for expressionPair in compiledObfuRegexDict['textObfuBlackWords']):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif preciseRegexDict['textExactBlackWords'].search(commentText.lower()):
elif preciseRegexDict['textExactBlackWords'].search(commentTextNormalized.lower()):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif preciseRegexDict['textUpLowBlackWords'].search(commentText) and not upLowTextSet.intersection(lowAlSet):
elif preciseRegexDict['textUpLowBlackWords'].search(commentTextNormalized) and not upLowTextSet.intersection(lowAlSet):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif any(findObf(expressionPair[0], expressionPair[1], authorChannelName) for expressionPair in compiledObfuRegexDict['usernameObfuBlackWords']):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif spamListCombinedRegex.search(combinedString.lower()):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif config['detect_link_spam'] and check_if_only_link(commentText.strip()):
elif config['detect_link_spam'] and check_if_only_link(commentTextNormalized.strip()):
add_spam(current, config, miscData, currentCommentDict, videoID)
elif sensitive and re.search(smartFilter['usernameConfuseRegex'], authorChannelName):
add_spam(current, config, miscData, currentCommentDict, videoID)
Expand Down Expand Up @@ -960,7 +963,7 @@ def check_if_only_link(string):
if yellowAdEmojiSet.intersection(combinedSet):
yellowCount += 1

if not sensitive and any(emoji in commentText for emoji in spamGenEmojiSet):
if not sensitive and any(emoji in commentTextNormalized for emoji in spamGenEmojiSet):
yellowCount += 1

if not sensitive and any(emoji in authorChannelName for emoji in spamGenEmojiSet):
Expand Down
6 changes: 0 additions & 6 deletions Scripts/prepare_modes.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,6 @@ def prepare_filter_mode_smart(scanMode, config, miscData, sensitive=False):
'usernameObfuBlackWords': filter.usernameObfuBlackWordsCompiledPairs
}

basicFilterDict = {
'usernameRedWords': filter.usernameRedWordsCompiled,
'exactRedAdWords': filter.exactRedAdWords,
}

# General Settings
unicodeCategoriesStrip = ["Mn", "Cc", "Cf", "Cs", "Co", "Cn", "Sk"] # Categories of unicode characters to strip during normalization

Expand Down Expand Up @@ -430,7 +425,6 @@ def prepare_filter_mode_smart(scanMode, config, miscData, sensitive=False):
'rootDomainRegex': rootDomainRegex,
'compiledRegexDict': compiledRegexDict,
'compiledObfuRegexDict': compiledObfuRegexDict,
'basicFilterDict': basicFilterDict,
'preciseRegexDict': preciseRegexDict,
'usernameConfuseRegex': usernameConfuseRegex,
'languages': languages,
Expand Down
2 changes: 1 addition & 1 deletion Scripts/shared_imports.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import sys
import traceback
import re
import regex as re

from colorama import init, Fore as F, Back as B, Style as S

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ rtfunicode==2.0
certifi>=2021.10.8
six>=1.16.0
python-Levenshtein>=0.12.2
regex>=2022.1.18

0 comments on commit 0988c29

Please sign in to comment.