Improve Regex

Now uses the third-party 'regex' module instead of the built-in 're' module, because it supports \p{...} Unicode category matching, which 're' does not.
ThioJoe · Feb 16, 2022 · 0988c29
1 parent 9ea7f50
commit 0988c29
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 34 deletions.
diff --git a/Scripts/filter_variables.py b/Scripts/filter_variables.py
diff --git a/Scripts/logging.py b/Scripts/logging.py
@@ -7,7 +7,6 @@
 from unicodedata import category as unicode_category
 from datetime import datetime
 
-import re
 import rtfunicode
 import os
 import requests

diff --git a/Scripts/operations.py b/Scripts/operations.py
@@ -823,7 +823,6 @@ def check_against_filter(current, filtersDict, miscData, config, currentCommentD
       # Receive Variables
       compiledRegexDict = smartFilter['compiledRegexDict']
       compiledObfuRegexDict = smartFilter['compiledObfuRegexDict']
-      basicFilterDict = smartFilter['basicFilterDict']
       preciseRegexDict = smartFilter['preciseRegexDict']
       numberFilterSet = smartFilter['spammerNumbersSet']
       compiledNumRegex = smartFilter['compiledNumRegex']
@@ -859,9 +858,13 @@ def findObf(expression, chars, stringToSearch):
         else:
           for match in result:
             lowerChars = chars.lower()
+            # Strips off buffer characters and specified unicode categories
             for bufferChar in compiledRegexDict['bufferChars']:
               match = match.strip(bufferChar)
-            #if match.lower() != lowerWord and match.lower() != lowerWord.translate(ignoredConfusablesConverter):
+            while unicodedata.category(match[0]) in smartFilter['unicodeCategoriesStrip']:
+              match = match[1:]
+            while unicodedata.category(match[-1]) in smartFilter['unicodeCategoriesStrip']:
+              match = match[:-1]
             if any(char not in lowerChars for char in match) and any(char not in lowerChars.translate(ignoredConfusablesConverter) for char in match):
               return True
 
@@ -880,10 +883,10 @@ def check_if_only_link(string):
       # ------------------------------------------------------------------------
 
       # Normalize usernames and text, remove multiple whitespace and invisible chars
-      commentText = re.sub(' +', ' ', commentText)
+      commentTextNormalized = re.sub(' +', ' ', commentText)
       # https://stackoverflow.com/a/49695605/17312053
-      commentText = "".join(k if k in bufferChars else "".join(v) for k,v in itertools.groupby(commentText, lambda c: c))
-      commentText = remove_unicode_categories(commentText)
+      commentTextNormalized = "".join(k if k in bufferChars else "".join(v) for k,v in itertools.groupby(commentText, lambda c: c))
+      commentTextNormalized = remove_unicode_categories(commentText)
 
       authorChannelName = re.sub(' +', ' ', authorChannelName)
       authorChannelName = remove_unicode_categories(authorChannelName)
@@ -915,19 +918,19 @@ def check_if_only_link(string):
         add_spam(current, config, miscData, currentCommentDict, videoID)
       elif compiledRegexDict['blackAdWords'].search(authorChannelName):
         add_spam(current, config, miscData, currentCommentDict, videoID)
-      elif compiledRegexDict['textBlackWords'].search(commentText):
+      elif compiledRegexDict['textBlackWords'].search(commentTextNormalized):
         add_spam(current, config, miscData, currentCommentDict, videoID)
       elif any(findObf(expressionPair[0], expressionPair[1], commentText) for expressionPair in compiledObfuRegexDict['textObfuBlackWords']):
         add_spam(current, config, miscData, currentCommentDict, videoID)
-      elif preciseRegexDict['textExactBlackWords'].search(commentText.lower()):
+      elif preciseRegexDict['textExactBlackWords'].search(commentTextNormalized.lower()):
         add_spam(current, config, miscData, currentCommentDict, videoID)
-      elif preciseRegexDict['textUpLowBlackWords'].search(commentText) and not upLowTextSet.intersection(lowAlSet):
+      elif preciseRegexDict['textUpLowBlackWords'].search(commentTextNormalized) and not upLowTextSet.intersection(lowAlSet):
         add_spam(current, config, miscData, currentCommentDict, videoID)
       elif any(findObf(expressionPair[0], expressionPair[1], authorChannelName) for expressionPair in compiledObfuRegexDict['usernameObfuBlackWords']):  
         add_spam(current, config, miscData, currentCommentDict, videoID)
       elif spamListCombinedRegex.search(combinedString.lower()):
         add_spam(current, config, miscData, currentCommentDict, videoID)
-      elif config['detect_link_spam'] and check_if_only_link(commentText.strip()):
+      elif config['detect_link_spam'] and check_if_only_link(commentTextNormalized.strip()):
         add_spam(current, config, miscData, currentCommentDict, videoID)
       elif sensitive and re.search(smartFilter['usernameConfuseRegex'], authorChannelName):
         add_spam(current, config, miscData, currentCommentDict, videoID)
@@ -960,7 +963,7 @@ def check_if_only_link(string):
         if yellowAdEmojiSet.intersection(combinedSet):
           yellowCount += 1
 
-        if not sensitive and any(emoji in commentText for emoji in spamGenEmojiSet):
+        if not sensitive and any(emoji in commentTextNormalized for emoji in spamGenEmojiSet):
           yellowCount += 1
 
         if not sensitive and any(emoji in authorChannelName for emoji in spamGenEmojiSet):

diff --git a/Scripts/prepare_modes.py b/Scripts/prepare_modes.py
@@ -317,11 +317,6 @@ def prepare_filter_mode_smart(scanMode, config, miscData, sensitive=False):
     'usernameObfuBlackWords': filter.usernameObfuBlackWordsCompiledPairs
   }
 
-  basicFilterDict = {
-    'usernameRedWords': filter.usernameRedWordsCompiled,
-    'exactRedAdWords': filter.exactRedAdWords,
-  }    
-
   # General Settings
   unicodeCategoriesStrip = ["Mn", "Cc", "Cf", "Cs", "Co", "Cn", "Sk"] # Categories of unicode characters to strip during normalization
 
@@ -430,7 +425,6 @@ def prepare_filter_mode_smart(scanMode, config, miscData, sensitive=False):
     'rootDomainRegex': rootDomainRegex,
     'compiledRegexDict': compiledRegexDict,
     'compiledObfuRegexDict': compiledObfuRegexDict,
-    'basicFilterDict': basicFilterDict,
     'preciseRegexDict': preciseRegexDict,
     'usernameConfuseRegex': usernameConfuseRegex,
     'languages': languages,

diff --git a/Scripts/shared_imports.py b/Scripts/shared_imports.py
@@ -1,7 +1,7 @@
 import os
 import sys
 import traceback
-import re
+import regex as re
 
 from colorama import init, Fore as F, Back as B, Style as S
 

diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ rtfunicode==2.0
 certifi>=2021.10.8
 six>=1.16.0
 python-Levenshtein>=0.12.2
+regex>=2022.1.18