In [2]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
import pandas as pd
import bs4
from langdetect import detect

In [73]:
class Scraping:
    """
    Download a web page and flatten its textual content.

    Each ``web_*`` helper extracts one facet of a parsed page (site name,
    title, language, meta description/keywords, headings, body text);
    ``url_contents`` fetches a URL and combines them into a ``pandas.Series``.
    """

    # Browser-like User-Agent header sent with every request.
    USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
    # Timeout, in seconds, passed to ``requests.get``.
    REQUEST_TIMEOUT = 60
    # Only the first N text nodes of the document are considered by web_contents.
    MAX_TEXT_NODES = 50
    HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    META_NAMES = frozenset({"keywords", "description"})
    # Text whose direct parent is one of these tags is not body content.
    IGNORED_PARENTS = frozenset(HEADING_TAGS) | frozenset(
        {"noscript", "style", "script", "head", "title", "meta", "[document]"}
    )

    @staticmethod
    def _failure_record(url):
        """Build the placeholder record returned when a page cannot be used."""
        return {
            "lang": "None",
            "url": url,
            "website_name": "None",
            "content_text": "None",
        }

    def web_name(self, url):
        """
        Return the name embedded in a URL, i.e. the second-to-last label of
        the network location; for https://www.specshop.pl this is "specshop".

        Multi-part suffixes are not recognised: "www.bbc.co.uk" yields "co".

        Args:
            url (str): url of the web page

        Returns:
            str: the name found in the url; the bare host name when the host
            has a single label (e.g. "localhost"), or "" when the url has no
            network location at all
        """
        parsed = urlparse(url)
        labels = parsed.netloc.split(".")
        if len(labels) >= 2:
            return labels[-2]
        # Single-label host or no netloc: fall back to the host without its port.
        return parsed.hostname or ""

    def web_title(self, soup):
        """
        Return the text of the page's <title> element.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: title of the page, or " " when the page has no <title>
        """
        title = soup.title
        if title is None:
            return " "
        return title.get_text(" ", strip=True)

    def web_language(self, soup):
        """
        Detect the language of the page from its visible text.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: language code reported by ``langdetect.detect``; "en" when
            detection fails
        """
        try:
            language = detect(soup.get_text())
        except Exception:
            # Best effort: any detection failure falls back to English.
            language = "en"

        return language

    def web_meta(self, soup):
        """
        Get the keywords and description meta data from the web page header.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: space-joined ``content`` of every <meta> tag whose ``name``
            is "keywords" or "description" (compared case-insensitively)
        """

        def is_named_meta(tag):
            return tag.name == "meta" and tag.has_attr("name") and tag.has_attr("content")

        return " ".join(
            str(tag["content"])
            for tag in soup.find_all(is_named_meta)
            if str(tag["name"]).lower() in self.META_NAMES
        )

    def web_header(self, soup):
        """
        Collect the text of every heading of the page.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: all titles from h1 to h6 of the web page, space-joined
            ("" when the page has no heading)
        """
        headings = soup.find_all(list(self.HEADING_TAGS))
        return " ".join(" ".join(tag.stripped_strings) for tag in headings)

    def web_contents(self, soup):
        """
        Collect body text that is not a heading, script, style, etc.

        Only the first ``MAX_TEXT_NODES`` text nodes of the document are
        inspected; comments, purely numeric strings and blank strings are
        dropped.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: the remaining text fragments, space-joined
        """
        result = []
        for node in soup.find_all(string=True)[: self.MAX_TEXT_NODES]:
            stripped = node.strip()
            if (
                node.parent.name not in self.IGNORED_PARENTS
                and not isinstance(node, bs4.element.Comment)
                and not stripped.isnumeric()
                and len(stripped) > 0
            ):
                result.append(stripped)
        return " ".join(result)

    def url_contents(self, url):
        """
        Fetch a web page and return its content.

        Args:
            url (str): url of the web page

        Returns:
            Series: with keys "lang", "url", "website_name" and
            "content_text". When the request raises a ``requests`` exception
            or the status code is not 200, "lang", "website_name" and
            "content_text" hold the literal string "None" and "url" holds the
            requested url.
        """
        try:
            response = requests.get(
                url,
                timeout=self.REQUEST_TIMEOUT,
                headers={"User-Agent": self.USER_AGENT},
            )
        except requests.exceptions.RequestException:
            return pd.Series(self._failure_record(url))

        if response.status_code != 200:
            return pd.Series(self._failure_record(url))

        soup = BeautifulSoup(response.content, "html.parser")
        sections = (
            self.web_title(soup),
            self.web_meta(soup),
            self.web_header(soup),
            self.web_contents(soup),
        )
        # Join with a space so the last word of one section does not fuse
        # with the first word of the next.
        content_text = " ".join(part.strip() for part in sections if part.strip())

        return pd.Series(
            {
                "lang": self.web_language(soup),
                "url": url,
                "website_name": self.web_name(url),
                "content_text": content_text,
            }
        )
        

In [72]:
import spacy

class TextCleaner:
    """
    Normalize raw text into a string of lowercase lemmas using spaCy.

    Pipelines are loaded lazily. ``nlp_dict`` maps each language code seen so
    far to its pipeline; languages that resolve to the same spaCy model share
    a single loaded pipeline.
    """

    # spaCy model to load for each supported language code
    # (add other languages if necessary).
    LANG_MODELS = {
        'en': 'en_core_web_sm',
        'fr': 'fr_core_news_sm',
        'de': 'de_core_news_sm',
    }
    # Model used for any language code missing from LANG_MODELS.
    DEFAULT_MODEL = 'en_core_web_sm'
    # Token texts that are always dropped.
    EXCLUDED_TOKENS = frozenset({"nan", "vml", "endif"})

    def __init__(self):
        self.nlp_dict = {}   # language code -> loaded pipeline
        self._models = {}    # model name -> loaded pipeline

    def _pipeline_for(self, lang):
        """Return the pipeline for ``lang``, loading its model at most once."""
        if lang not in self.nlp_dict:
            model_name = self.LANG_MODELS.get(lang, self.DEFAULT_MODEL)
            if model_name not in self._models:
                self._models[model_name] = spacy.load(model_name)
            # Unsupported languages all point at the same default pipeline
            # instead of each loading its own copy.
            self.nlp_dict[lang] = self._models[model_name]
        return self.nlp_dict[lang]

    def clean_text(self, document, lang):
        """
        Cleans and normalizes a text according to the given language

        Stop words, punctuation, numeric tokens, tokens that are not purely
        alphanumeric and the tokens in ``EXCLUDED_TOKENS`` are removed; the
        remaining tokens are replaced by their lowercase lemma.

        Args:
            document (str): a text to clean
            lang (str): the language of the document

        Returns:
            str: the normalized text (space-joined lemmas)
        """
        doc = self._pipeline_for(lang)(document)
        tokens = []

        for token in doc:
            text = token.text
            if (
                token.is_stop
                or token.is_punct
                or text.isnumeric()
                or not text.isalnum()
                or text in self.EXCLUDED_TOKENS
            ):
                continue

            # Normalization of the token to its lowercase lemma.
            lemma = str(token.lemma_.lower().strip())
            if lemma:
                tokens.append(lemma)
        return " ".join(tokens)
    
 



In [78]:
import concurrent.futures # module for running functions in parallel
import json
import pandas as pd 
from tqdm import tqdm  # module to create a progress bar



def extract_data(domaine, scrap, cleaner):
    """
    (upgrade)Function to extract data from a line in our json file

    The line is parsed as JSON, its ``address`` is fetched through ``scrap``
    and the retrieved text is normalized with ``cleaner``.

    Args:
        domaine (str): one line of the json file, holding an object with the
            keys ``name``, ``category`` and ``address``
        scrap (Scraping): instance of the Scraping class
        cleaner (TextCleaner): instance of the TextCleaner class

    Returns:
        dict | None: dictionary with the keys ``name``, ``category``,
        ``address``, ``language`` and ``words``; ``None`` when the page could
        not be retrieved, when its stripped text is shorter than 20
        characters, or when any error occurs (the error is printed)
    """
    # Placeholder so the error message below is always printable, even when
    # the failure happens before the address has been read from the line.
    address = "<unknown>"
    try:
        dict_domaine = json.loads(domaine)
        name = dict_domaine['name']
        category = dict_domaine['category']
        address = dict_domaine['address']

        content = dict(scrap.url_contents(address))
        if content["lang"] == "None":
            return None

        raw_text = content['content_text'].strip()
        if len(raw_text) < 20:
            return None

        text = cleaner.clean_text(raw_text, content['lang'])  # clean up the extracted text
        # return a dictionary containing the extracted information
        return {'name': name, 'category': category, 'address': address, 'language': content['lang'], 'words': text}
    except Exception as e:
        print(f"Error when extracting data for the domain {address}: {e}")
        return None
    
    
def get_dataSet(file_name: str, max_workers: int = 60):
    """
    (upgrade)Read a file with one JSON object per line describing a domain, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    Each non-blank line is handed to ``extract_data`` on a thread pool; lines
    for which it returns ``None`` or raises are left out of the result.

    Args:
        file_name (str): path of the file, one JSON object per line
        max_workers (int): number of threads used to process the lines

    Returns:
        Dataframe: Panda Dataframe with extracted data, indexed by a 0-based
        counter named ``indice``; rows appear in completion order
    """

    # create an instance of the Scraping class and TextCleaner to extract data from each domain
    scrap = Scraping()
    cleaner = TextCleaner()

    # open the file and keep its non-blank lines (a trailing empty line
    # would otherwise be submitted as a task and reported as an error)
    with open(file_name, "r", encoding="utf-8") as f:
        domaines = [line for line in f if line.strip()]

    results = []  # rows successfully extracted
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_data, domaine, scrap, cleaner) for domaine in domaines]  # submit tasks

        with tqdm(total=len(futures)) as pbar:  # progress bar over the submitted tasks
            for future in concurrent.futures.as_completed(futures):
                try:
                    result = future.result()
                    if result is not None:  # keep only the tasks that produced a row
                        results.append(result)
                except Exception as e:
                    print(f"Error when extracting data: {e}")
                pbar.update(1)  # update of the progress bar

    # Create a pandas Dataframe containing the extracted data
    data = pd.DataFrame(results)
    data['indice'] = range(len(data))
    data = data.set_index('indice')
    return data

# Run the threaded scrape over urls.txt when the module is executed as a
# script; the resulting DataFrame is kept in the module-level name `df`.
if __name__ == '__main__':
    df = get_dataSet('urls.txt') 

  0%|          | 9/119321 [00:33<124:14:17,  3.75s/it]


In [None]:

# Save the scraped DataFrame to data.csv, leaving the index out of the file.
df.to_csv("data.csv", index=False)

In [76]:
# Load a previously saved dataset into `df`.
# NOTE(review): the file name is "DataSete.csv" while the save cell writes "data.csv" - confirm this is the intended file.
df = pd.read_csv("DataSete.csv")

In [None]:
# Show the distinct values of the "category" column.
df["category"].unique()

In [None]:
# First version: sequential, without threads
import json
import pandas as pd
from tqdm import tqdm
from typing import List

def get_dataSet(file_name: str, limit: int = 50):
    """
    Read a file with one JSON object per line describing a domain, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    Lines are processed one after the other. A line is skipped when its page
    cannot be retrieved or when processing it raises (the error is printed),
    so every returned row has all four columns filled.

    Args:
        file_name (str): path of the file, one JSON object per line
        limit (int): number of leading lines to process; ``None`` processes
            the whole file

    Returns:
        Dataframe: Panda Dataframe with the columns ``name``, ``category``,
        ``address`` and ``words``
    """

    # creates an instance of the Scraping class and of the TextCleaner class
    scrap = Scraping()
    clean = TextCleaner()

    # open the file and read its lines
    with open(file_name, "r", encoding="utf-8") as f:
        domaines = f.readlines()

    selected = domaines[:limit]
    rows = []

    # Iterate through each domain, extract the keywords associated with the address and store one row per success
    for index, domaine in tqdm(enumerate(selected), total=len(selected)):
        try:
            dict_domaine = json.loads(domaine)
            name = dict_domaine['name']
            category = dict_domaine['category']
            address = dict_domaine['address']
            content = dict(scrap.url_contents(address))

            # "None" marks a page that could not be retrieved.
            if content["lang"] == "None":
                continue

            text = clean.clean_text(content['content_text'], content['lang'])
            rows.append({'name': name, 'category': category, 'address': address, 'words': text})
        except Exception as e:
            print(f"Erreur lors de l'extraction des données pour le domaine à l'index {index}: {e}")

    # Create a pandas Dataframe containing the extracted data
    return pd.DataFrame(rows, columns=['name', 'category', 'address', 'words'])


# Build the dataset from urls.txt with the get_dataSet defined in this cell.
df = get_dataSet('urls.txt')



In [75]:
# Manual check of the pipeline on a single URL: fetch the page, then print
# its detected language and its cleaned text.
scrap = Scraping()
clean = TextCleaner()

url = "https://1kino.in"

# url_contents returns a pandas Series; convert it to a plain dict.
content = dict(scrap.url_contents(url))

#print(content)
print()

# "None" is the placeholder url_contents uses when the page could not be fetched.
if content["lang"] != "None":
    print(content["lang"])

    text = clean.clean_text(content['content_text'], content['lang'])
    print(text)




en

