In [32]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
import bs4
from langdetect import detect
import chardet

In [15]:
class Scraping:
    """
        A class that implements several methods and whose purpose is to retrieve the content of web pages.
    """
    
    def url_content(self, url):
        """
        return de content of web page
        
        Args:
            url (str): url for the web page

        Returns:
            Series: content of the web page
        """
        
        # bypass browser restrictions
        headers = { "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"}
        try:
            response = requests.get(url, timeout=60, headers=headers)

            if response.status_code != 200:
                result = {
                    "lang": "None",
                    "url": "None",
                    "website_name": "None",
                    "content_text": "None"
                }
            else:
                soup = BeautifulSoup(response.content, "html.parser", from_encoding="iso-8859-1")


                result = {
                    "lang": self.get_language(soup),
                    "url": url,
                    "website_name": self.get_website_name(url),
                    "content_text": self.get_title(soup)+ self.get_meta(soup) + self.get_header(soup) + self.get_content(soup)
                }

            return pd.Series(result)
        except requests.exceptions.RequestException:
            result = {
                "lang": "None",
                "url": url,
                "website_name": "None",
                "content_text": "None"
            }
            return pd.Series(result)
    
    def get_website_name(self, url):
        """
        this function allows to obtain the name which is in url, for example for this url https://www.specshop.pl, the name will be specshop

        Args:
            url (url): url for the web page

        Returns:
            str: the name that located of the url
        """
        
        return "".join(urlparse(url).netloc.split(".")[-2])
    
    def get_title(self, soup):
        """

        Args:
            soup (BeautifulSoup): get the whole html parse document of the web page 

        Returns:
            str: title of the page
        """
        return " ".join(soup.title.contents) if soup.title is not None else ""

    
    def get_language(self, soup):
        """

        Args:
            soup (BeautifulSoup): get the whole html parse document of the web page 

        Returns:
            str: language of the page
        """
        try:
            language = detect(soup.get_text())
        except Exception:
            language = "en"

        return language

 
 
    def get_meta(self, soup):
        """
        gets some meta data from the web page header
        
        Args:
            soup (BeautifulSoup): get the whole html parse document of the web page

        Returns:
            str: the description of our page in the header
        """
        
        tags = soup.find_all(lambda tag: (tag.name=='meta') & (tag.has_attr('name') & tag.has_attr('content')))
        
        content = [str(tag['content']) for tag in tags if tag['name'] in ['keywords', 'description']]
        return " ".join(content)
    
    
    def get_header(self, soup):
        """

        Args:
            soup (BeautifulSoup): get the whole html parse document of the web page

        Returns:
            str: all titles from h1 to h6 of the web page
        """
        
        tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        if not tags:
            return ""
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return " ".join(content)
    
    def get_content(self, soup):
        """
        
        Args:
            soup (BeautifulSoup): get the whole html parse document of the web page


        Returns:
            str: any other element of our html that is not a title, css style, etc...
        """
        
        tags_to_ignore = ["h1", "h2", "h3", "h4", "h5","h6", "noscript", "style", "script", "head", "title", "meta", "[document]"]
        contends = soup.find_all(text=True)
        result = []
        for word in contends:
            stripped_word = word.strip()
            if (
                word.parent.name in tags_to_ignore
                or isinstance(word, bs4.element.Comment)
                or stripped_word.isnumeric()
                or len(stripped_word) <= 0
            ):
                return ""
            result.append(stripped_word)
        return " ".join(result)
        
        

In [43]:
import spacy

class TextCleaner:
    """
    it is the class that will allow us to normalise our texts by deleting unused words
    """
    
    def __init__(self):
        self.nlp_dict = {}
        
    def clean_text(self, document, lang):
        """
        Cleans and normalizes a text according to the given language

        Args:
            document (str): a text to clean
            lang (str): the language of the document

        Returns:
            str: the normalized text
        """
        
        # load the appropriate template if you have not already done it
        if lang not in self.nlp_dict:
            lang_dict = {
                'en': 'en_core_web_sm',
                'fr': 'fr_core_news_sm',
                'de': 'de_core_news_sm',
                # add other languages if necessary
            }

            self.nlp_dict[lang] = spacy.load(lang_dict.get(lang, 'en_core_web_sm'))

        # normalizes the text with the loaded template
        doc = self.nlp_dict[lang](document)
        tokens = []
        exclusion_list = ["nan"]

        for token in doc:
            if token.is_stop or token.is_punct or token.text.isnumeric() or (token.text.isalnum() == False) or token.text in exclusion_list:
                continue

            # Normalization of the token to lowercase lemmas
            token = str(token.lemma_.lower().strip())
            tokens.append(token)
        return " ".join(tokens)
    
 

    def decode_text(self, document):
        """
        detects the encoding of the document and decodes it

        Args:
            document (str): the normalized text

        Returns:
            str: plain text
        """
        if document is None:
            return ""
        if isinstance(document, str):
            document = document.encode('utf-8')
        if detected_encoding := chardet.detect(document)['encoding']:
            return document.decode(detected_encoding)
        else:
            return document.decode('utf-8', 'ignore')




In [49]:
import json
import pandas as pd
from concurrent.futures import ThreadPoolExecutor  # module for running functions in parallel
from tqdm import tqdm  # module to create a progress bar


def extract_data(domaine, scrap, cleaner):
    """
    (upgrade)Function to extract data from a line in our json file

    Args:
        domaine (dict): Line of the json file
        scrap (Scraping): instance of the Scraping class

    Returns:
        dict: dictionary containing the extracted information
    """
    
    try:
        dict_domaine = json.loads(domaine)
        name = dict_domaine['name']
        category = dict_domaine['category']
        address = dict_domaine['address']
        
        content = dict(scrap.url_content(address))
        if content["lang"] != "None":
            content['content_text'] = content['content_text'].strip()
            text = cleaner.clean_text(content['content_text'], content['lang'])  # clean up the extracted text
            # return a dictionary containing the extracted information
            return {'name': name, 'category': category, 'address': address, 'words': text}
    except Exception as e:
        print(f"Error when extracting data for the domain {address}: {e}")
        return None

def get_dataSet(file_name: str):
    """
    (upgrade)Read a JSON file containing domain names, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    Args:
        file_name (str): json file 

    Returns:
        Dataframe: Panda Dataframe with extracted data
    """
    
    
    # create an instance of the Scraping class and TextCleaner to extract data from each domain
    scrap = Scraping()
    cleaner = TextCleaner()
    
    # open the JSON file and extract the information
    with open(file_name, "r") as f:
        domaines = f.readlines()

    with ThreadPoolExecutor(max_workers=60) as executor:  # execute functions in parallel with 40 threads
        futures = [executor.submit(extract_data, domaine, scrap, cleaner) for domaine in domaines[:20]]  # submit tasks
        results = []  # initialise the list to store the results
        
        with tqdm(total=len(futures)) as pbar:  # create a progress bar to display progress
            for future in futures:
                if future.result():  # if the task has been executed successfully, add the result to the list
                    results.append(future.result())
                pbar.update(1)  # update of the progress bar
                
    # Create a pandas Dataframe containing the extracted data
    data = pd.DataFrame(results)
    data['indice'] = range(len(data))
    data = data.set_index('indice')
    return data

if __name__ == '__main__':
    df = get_dataSet('urls.txt') 
    df 

100%|██████████| 20/20 [00:22<00:00,  1.12s/it]


In [45]:
df

Unnamed: 0_level_0,name,category,address,words
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Criminal Activities,Warez / Software Piracy,https://3mp3.ru,âåóðìáôîï óëáþáôø mp3 íõúùëõ mp3 music free do...
1,Criminal Activities,Political Extreme / Hate / Discrimination,https://algerie-francaise.org,algerie francaise verite guerre algã rieapre a...
2,Criminal Activities,Illegal Activities,https://alfasex.net,
3,Criminal Activities,Political Extreme / Hate / Discrimination,https://anaporn.com,warn kffn ktkt kcmt fm kmxz fm klpx fm kfma fm...
4,Criminal Activities,Illegal Activities,https://ahoj.sk,ahojahoj main page
5,Criminal Activities,Illegal Activities,https://ageplayteens.com,age play phone sex age play teen age play phon...
6,Criminal Activities,Illegal Activities,https://4jokerscasino.com,joker gaming joker casinojoker gaming joker ca...
7,Criminal Activities,Illegal Activities,https://2girls1cup.ca,watch girls cup uncensored original videothe o...
8,Criminal Activities,Illegal Activities,https://adultvideotop.com,adult porn video adult movies adult sex videos...
9,Criminal Activities,Illegal Activities,https://amateurspankings.com,amateur spanking


In [36]:
df['words']

indice
0    âåóðìáôîï óëáþáôø mp3 íõúùëõ mp3 music free do...
1    algerie francaise verite guerre algã rieapre a...
2                                                     
3    warn kffn ktkt kcmt fm kmxz fm klpx fm kfma fm...
4                                   ahojahoj main page
Name: words, dtype: object

In [None]:
#first without threads
import json
import pandas as pd
from tqdm import tqdm
from typing import List

def get_dataSet(file_name: str):
    """
    Read a JSON file containing domain names, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    Args:
        file_name (str): json file 

    Returns:
        Dataframe: Panda Dataframe with extracted data
    """
       
    # creates an instance of the Scraping class   
    scrap = Scraping()

    # open the JSON file and extract the information
    with open(file_name, "r") as f:
        domaines = f.readlines()

    # Initialise lists to store data
    indices = []
    names = []
    categories = []
    addresses = []
    keywords = []

    # Iterate through each domain name, extract the keywords associated with the address and store the data in the lists
    for index, domaine in tqdm(enumerate(domaines[:50]), total=len(domaines)):
        try:
            dict_domaine = json.loads(domaine)
            name = dict_domaine['name'] 
            category = dict_domaine['category']
            address = dict_domaine['address']
            content = dict(scrap.url_content(address))
            
            if content["lang"] != "None":
                text = clean_text(content['content_text'], content['lang'])
                keywords.append(text)

            indices.append(index)
            names.append(name)
            categories.append(category)
            addresses.append(address)
        except Exception as e:
            print(f"Erreur lors de l'extraction des données pour le domaine à l'index {index}: {e}")

    # Create a pandas dataframe containing the extracted data
    data = {'indice': indices, 'name': names, 'category': categories, 'address': addresses, 'words': keywords}
    return pd.DataFrame(data)
if __name__ == '__main__':
    df = get_dataSet('urls.txt')
    df


In [40]:
scrap = Scraping()
url = "https://alfasex.net"

content = dict(scrap.url_content(url))

#print(content)
print()

if content["lang"] != "None":
    text = clean_text(content['content_text'], content['lang'])
    print(text)





