In [2]:
import hashlib
import re
import time
from langdetect import detect
from nltk import PorterStemmer
from nltk import downloader
from nltk import data

In [61]:
OnionDir = 'D:/Wesam/Onion_Dataset/{0}/{0}.txt'

class OnionTextPreprocessor(object):
    """
    Class to pre-process the text sample
    """
    def __init__(self):

        self.FLAGS = re.MULTILINE | re.DOTALL
        self.URL = ' <url>'
        self.IMG = ' <image>'
        self.NUMBER = ' <number> '
        self.EMAIL = ' <email> '

        with open('stops.txt', 'r', encoding='utf-8')as ins:
            stop = ins.readlines()
        stop = [PorterStemmer().stem_word(wrd.strip().lower()) for wrd in stop]
        stop = list(set(stop))
        self.stop = stop

        # Download NLTK tokenizer package
        try:
            data.find("tokenizers/punkt")
        except LookupError:
            downloader.download("punkt")

    def clean_onion(self, document, max_text_len=2000, min_text_len=5):
        """

        :param document:
        :param max_text_len: Maximum document length to cut ( to avoid large text)
        :param min_text_len: Minimum document length to cut ( to avoid short text)
        :return: cleaned onion document
        """
        start_time = time.time()

        # Decode the text from binary.
        text_content = document#.decode(errors='replace').strip()

        # Count the document words by space
        text_list = text_content.split(' ')

        # If the text length is more than max_text_len words, take only the first max_text_len words
        if len(text_list) > max_text_len:
            text_content = ' '.join(text_list[:max_text_len])

        # Start cleaning the Text
        text_content = [self.clean_text_with_reg(ln) for ln in text_content.split('\n')]

        # Remove Ref part in the text ( old scrap version)
        if 'references' in text_content:
            idx = text_content.index('references')
            text_content = text_content[:idx]

        # Remove duplicated lines
        text_content = ' '.join(list(set(text_content)))

        # Calculate the text hash
        text_hash = hashlib.md5(text_content.encode()).hexdigest()

        # Check if we still have text after the pre-processing (level_1)

        if text_content:
            # If the text is not empty, start pre-processing (Level_2)

            # 1- Detect the language
            lang = detect(text_content)

            # 2- Stem text:
            text_content = ' '.join([PorterStemmer().stem_word(word) for word in text_content.split()])

            # 3- Remove stop words
            text_content = re.sub(r'\b(' + r'|'.join(self.stop) + r')\b\s*', ' ', text_content).strip()

            # 4-  Remove extra space from the text:
            text_content = re.sub(r'\s+', ' ', text_content).strip()

            # 5- Count the document words by space after cleaning it.
            # If less than 5 words (after second level of preprocessing) , then set it to empty
            text_list = text_content.split(' ')
            if len(text_list) < min_text_len:
                # If the text is empty, return empty text
                text_content = ''
                # By default, english
                lang = 'en'
        else:
            # If the text is empty, return empty text
            text_content = ''

            # By default, english
            lang = 'en'

        spend_time = time.time() - start_time

        return text_content

    def clean_text_with_reg(self, text):
        """
        This function prepossess the text before classifying it with regular expression.
        :param text: input text of onion to clean
        :return: cleaned text
        """
        re_sub = lambda pattern, _: re.sub(pattern, _, text, flags=self.FLAGS)

        # Remove PGP line:
        text = re_sub(r'[^----]*(BEGIN|END) [^-----]*', ' ')

        # Replace some S.C:
        text = re_sub(r'[<>]', ' ')

        # Replace Email:
        text = re_sub(r'[\w\.-]+@[\w\.-]+', self.EMAIL)

        # Replace some spacial chars.
        text = re_sub('&', 'and')

        # Replace all types or URLs
        url_regex = r'^(?:http|ftp)s?://'\
        # domain name
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'\
        # localhost...
        r'localhost|'\
        # ...or ipv4
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'\
        # ...or ipv6
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'\
        # optional port
        r'(?::\d+)?'\
        r'(?:/?|[/?]\S+)$'
        text = re_sub(url_regex, self.URL)

        # Replace ordered list  [1] by <ol>
        text = re_sub(r'(\[)[\d|\w](\])', ' ')  # ,self.OL)

        # Replace @user by <user>
        text = re_sub(r"@\w+", ' ')  # , self.USER)

        # Replace IMG:
        text = re_sub(r'(\[*)(\w*)(\.)(png|jpg|jpeg|gif|GIF)(\]*)', r'\2 ')

        # Replace Currency:
        text = re_sub(r'([$€¢£¥฿]|euro|usd|USD|EUR|Euro|Dolar|dolar|BTC|GBP|gbp|btc)', ' ')

        # Replace repeated text
        text = re_sub(r'([!?\.\-:]){2,}', r'\1 ')

        # Replace some - with space or leave it:
        text = re_sub(r'(^\-)|(\s*)(\-|\^|\+|\*|\–)(\s)|(\s)(\-|\^|\+|\*|\–)(\s*)', ' ')

        # Replace some S.C:
        text = re_sub(r'[(_)\u00A9•\|#%\^~*\[\]{}\.,:!";\?\\\'@“”]', ' ')

        # Replace - with space:
        text = re_sub(r'-', ' ')

        # Remove text ends with number
        text = re_sub(r'(\w{3,})([0-9])', r'\1')

        # Remove long words
        text = ' '.join([word for word in text.split() if 15 > len(word) > 2])

        # Replace NUM
        text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]", ' ')

        # Replace some S.C:
        text = re_sub(r'[+=\/]', ' ')

        # Replace Single letters:
        text = re_sub(r'(?<!\S).(?!\S)\s*', '')

        # Lower Text:
        text = text.strip().lower()

        return text


In [64]:
url = 'diamachdwhqp7pem.onion'
with open(OnionDir.format(url), 'r', encoding ='utf-8') as fin:
    text = fin.readlines()
text = ' '.join(text)

In [65]:
textpreprocessor = OnionTextPreprocessor ()

In [66]:
textpreprocessor.clean_onion(text)

'марок биохаразд доб марок биохазард доб 1ое продукции случае возникновения вопроса читайте купить img кокаин classic 1gr кокаин vip гр 3ое продукции так другие утешительные призы всем кто участвовал всем lsd stamp hoffman bicycl microdot вопросам находа уточнение опта писать jabber кокаина целое множество различных наркотиков всех сортов расцветок упаковок кислоты разных кислых лсд мео дип доб пол солонки meo mipt триптамин новости http obmen либо пишите jabber дсков ена дисков фунт добро пожаловать 2ое продукции dob stamp biohazard идёт конкурс лучшие сиськи подробности тут ветка продавца микродотов мгк лсд микродотов мгк лсд руб руб mdma pill также текила ром еще вкусностей mephedron type гр амнезия амнезии спасибо что выбрали нас самый качественный товар доступным ценам идёт конкурс лучший рассказ своих ощущений подробности тут hash diamond shop moscow citi москва <email> график мск субботу приз рассказ чемодан пакета травы гр таблеток перед покупкой обязательно прочитайте правила 