In [8]:
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
from urllib.parse import urlparse
import requests
import pandas as pd
import bs4
from langdetect import detect
import warnings
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from selenium.common.exceptions import NoSuchElementException

In [9]:
class Scraping:
    """
    Download a web page and extract its textual content.

    Each ``web_*`` method pulls one kind of text out of a parsed
    ``BeautifulSoup`` document; ``url_contents`` fetches a URL and combines
    them into a single record.
    """

    # Sent with every HTTP request and also handed to Chrome.
    USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"

    # Exact text of the first <p> on pages that refuse to render without
    # JavaScript; seeing it triggers the browser fallback in url_contents.
    JS_REQUIRED_NOTICE = "You must enable javascript in your browser in order to use this site. You can click the link below for instructions."

    # Text nodes whose parent is one of these tags are not body content:
    # headings are collected by web_header, the rest is markup or metadata.
    _IGNORED_PARENTS = frozenset([
        "h1", "h2", "h3", "h4", "h5", "h6", "noscript", "style", "script",
        "head", "title", "meta", "[document]",
    ])

    # <meta name=...> values whose content is kept by web_meta.
    _META_NAMES = frozenset(["keywords", "description"])

    def web_name(self, url):
        """
        Return the label just before the last dot of the URL's host,
        e.g. ``specshop`` for ``https://www.specshop.pl``.

        Args:
            url (str): url of the web page

        Returns:
            str: the second-to-last dot-separated label of the host. When
                the host has no dot (``localhost``) or the URL has no host
                at all, the whole host (possibly empty) is returned
                instead of raising ``IndexError``.
        """
        labels = urlparse(url).netloc.split(".")
        return labels[-2] if len(labels) > 1 else labels[0]

    def web_title(self, soup):
        """
        Return the text of the ``<title>`` found inside ``<head>``.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: title of the page, or "" when there is no head or title
        """
        title_text = ""
        if head_tag := soup.find('head'):
            if title_tag := head_tag.find('title'):
                title_text = title_tag.text
        return title_text

    def web_language(self, soup):
        """
        Detect the language of the page from all of its text.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: language code reported by ``langdetect``; "en" when
                detection raises (deliberate best-effort default)
        """
        try:
            return detect(soup.get_text())
        except Exception:
            # Any detection failure falls back to English so that one odd
            # page never aborts a scraping run.
            return "en"

    def web_meta(self, soup):
        """
        Collect the ``keywords`` and ``description`` meta tags.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: the ``content`` of the matching meta tags, space-separated
        """
        def is_named_meta(tag):
            return tag.name == 'meta' and tag.has_attr('name') and tag.has_attr('content')

        # Meta names are matched case-insensitively so "Description" and
        # "KEYWORDS" are picked up as well.
        return " ".join(
            str(tag['content'])
            for tag in soup.find_all(is_named_meta)
            if str(tag['name']).lower() in self._META_NAMES
        )

    def web_header(self, soup):
        """
        Collect the text of every heading of the page.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: text of all h1 to h6 tags, space-separated ("" if none)
        """
        headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        return " ".join(" ".join(tag.stripped_strings) for tag in headings)

    def web_contents(self, soup):
        """
        Collect the visible body text of the page.

        Args:
            soup (BeautifulSoup): the parsed html document of the web page

        Returns:
            str: every non-empty, non-numeric text node that is not an html
                comment and whose parent is not a heading, script, style or
                head-related tag, space-separated
        """
        pieces = []
        # `string=True` is the current name of the deprecated `text=True`.
        for node in soup.find_all(string=True):
            if isinstance(node, bs4.element.Comment):
                continue
            if node.parent.name in self._IGNORED_PARENTS:
                continue
            text = node.strip()
            if text and not text.isnumeric():
                pieces.append(text)
        return " ".join(pieces)

    @staticmethod
    def _failed_record(url):
        """Return the placeholder record used when a page cannot be scraped."""
        return {
            "lang": "None",
            "url": url,
            "website_name": "None",
            "content_text": "None"
        }

    @staticmethod
    def _configure_warnings():
        """Ignore warnings in general but raise the two that url_contents handles."""
        # Filters are inserted at the front of the list and the first match
        # wins, so the blanket "ignore" has to be registered BEFORE the
        # "error" filters; the other way round it silences them.
        warnings.simplefilter('ignore')
        warnings.filterwarnings("error", category=MarkupResemblesLocatorWarning)
        warnings.filterwarnings("error", category=UnicodeWarning)

    def _render_with_browser(self, url):
        """
        Open the page in Chrome, click through the JavaScript notice and
        parse the page that is shown afterwards.

        Args:
            url (str): url of the web page

        Returns:
            tuple: (text of the <p> elements of the notice page, parsed
                page after the click). When the button is not found the
                tuple is ("", None).
        """
        chrome_options = Options()
        for argument in (
            "--disable-gpu",
            "--disable-extensions",
            "--disable-popup-blocking",
            "--disable-default-apps",
            f"--user-agent={self.USER_AGENT}",
            '--enable-javascript',
        ):
            chrome_options.add_argument(argument)
        service = Service("/usr/bin/chromedriver")
        driver = webdriver.Chrome(service=service, options=chrome_options)

        cover_page, soup = "", None
        try:
            # Navigation happens inside the try so the browser is always
            # closed, even when loading the page itself fails.
            driver.get(url)
            bouton = driver.find_element(By.CSS_SELECTOR, ".RaisedButton-primary-0-2-33")

            elements = driver.find_elements(By.TAG_NAME, "p")
            cover_page = " ".join(el.text for el in elements)
            bouton.click()

            # Fixed pause to let the page behind the button load.
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, "html.parser")
        except NoSuchElementException:
            print(f'The button cannot be found for {url}')
        finally:
            driver.quit()
        return cover_page, soup

    def url_contents(self, url):
        """
        Fetch a web page and return its language, name and text.

        Args:
            url (str): url of the web page

        Returns:
            Series: keys ``lang``, ``url``, ``website_name`` and
                ``content_text``. When the request fails, the status is not
                200 or the markup cannot be parsed, ``url`` is still the
                requested url and the three other values are the string
                "None".
        """
        try:
            headers = {
                "User-Agent": self.USER_AGENT,
                "Connection": "keep-alive"
            }
            response = requests.get(url, timeout=60, headers=headers)

            if response.status_code != 200:
                return pd.Series(self._failed_record(url))

            self._configure_warnings()

            # Without a declared encoding the bytes are read as latin-1.
            soup = BeautifulSoup(
                response.content,
                "html.parser",
                from_encoding=response.encoding or 'iso-8859-1',
            )

            cover_page = ""
            first_paragraph = soup.find('p')
            if first_paragraph is not None and first_paragraph.text == self.JS_REQUIRED_NOTICE:
                cover_page, rendered = self._render_with_browser(url)
                if rendered is not None:
                    soup = rendered

            # Join with spaces so the last word of one part does not fuse
            # with the first word of the next.
            parts = (
                cover_page,
                self.web_meta(soup),
                self.web_title(soup),
                self.web_header(soup),
                self.web_contents(soup),
            )
            result = {
                "lang": self.web_language(soup),
                "url": url,
                "website_name": self.web_name(url),
                "content_text": " ".join(part for part in parts if part)
            }
            return pd.Series(result)

        except (requests.exceptions.RequestException, MarkupResemblesLocatorWarning,
                UnicodeWarning, UnicodeDecodeError):
            return pd.Series(self._failed_record(url))
        

In [41]:
# Smoke test: scrape a single URL and, when its detected language is one of
# the languages we keep, print the language, the length of the cleaned text
# and the cleaned text itself.
# NOTE: TextCleaner is defined in a cell that appears further down in this
# file; that cell has to be executed before this one.
scrap = Scraping()
clean = TextCleaner()
# NOTE(review): TextCleaner only maps 'en', 'fr' and 'de' to a spaCy model;
# 'ca' falls back to the English model there — confirm this is intended.
required_lang = ["en", "de", "fr", "ca"]

url = "https://toptracker.ru"

# url_contents returns a pandas Series; dict() gives plain key access.
content = dict(scrap.url_contents(url))



if content["lang"] in required_lang:
    print(content["lang"])

    text = clean.clean_text(content['content_text'], content['lang'])
    print(len(text))
    print(text)

12382


In [39]:
import spacy

class TextCleaner:
    """
    Normalize raw page text into a space-separated string of lowercase lemmas.

    spaCy pipelines are loaded lazily, one per language, and cached in
    ``nlp_dict`` for the lifetime of the instance.
    """

    # spaCy pipeline to load for each supported language code.
    MODEL_NAMES = {
        'en': 'en_core_web_sm',
        'fr': 'fr_core_news_sm',
        'de': 'de_core_news_sm',
        # add other languages if necessary
    }

    # Language whose pipeline is used for codes missing from MODEL_NAMES.
    DEFAULT_LANG = 'en'

    # Tokens dropped even though they pass the other filters.
    EXCLUDED_TOKENS = frozenset(["nan", "vml", "endif"])

    def __init__(self):
        # language code -> loaded spaCy pipeline
        self.nlp_dict = {}
        # longest piece of text handed to spaCy in a single call
        self.max_length_segment = 100000

    def _pipeline(self, lang):
        """Return the cached pipeline for ``lang``, loading it on first use."""
        if lang in self.nlp_dict:
            return self.nlp_dict[lang]
        # Unsupported codes share the default pipeline instead of each
        # loading (and keeping in memory) another copy of the same model.
        key = lang if lang in self.MODEL_NAMES else self.DEFAULT_LANG
        if key not in self.nlp_dict:
            self.nlp_dict[key] = spacy.load(self.MODEL_NAMES[key])
        return self.nlp_dict[key]

    def _segments(self, document):
        """Yield consecutive pieces of ``document`` no longer than ``max_length_segment``."""
        if not document:
            return
        limit = self.max_length_segment
        if limit < 1:
            raise ValueError("max_length_segment must be a positive integer")

        start, total = 0, len(document)
        while start < total:
            end = min(start + limit, total)
            if end < total and not document[end].isspace():
                # The cut would land inside a word: move it back to just
                # after the previous whitespace so the word stays whole.
                split_at = end
                while split_at > start and not document[split_at - 1].isspace():
                    split_at -= 1
                # No whitespace in the window: keep the hard cut at `end`.
                if split_at > start:
                    end = split_at
            yield document[start:end]
            start = end

    def _is_noise(self, token):
        """Tell whether a spaCy token must be left out of the result."""
        return (
            token.is_stop
            or token.is_punct
            or token.text.isnumeric()
            or not token.text.isalnum()
            or token.text in self.EXCLUDED_TOKENS
        )

    def clean_text(self, document, lang):
        """
        Cleans and normalizes a text according to the given language

        Stop words, punctuation, numbers, tokens that are not purely
        alphanumeric and the tokens in ``EXCLUDED_TOKENS`` are removed; the
        remaining tokens are replaced by their lowercase lemma.

        Args:
            document (str): a text to clean
            lang (str): the language of the document; codes without a
                dedicated model are processed with the English pipeline

        Returns:
            str: the normalized text

        Raises:
            ValueError: if ``max_length_segment`` is smaller than 1 and the
                document is not empty
        """
        nlp = self._pipeline(lang)
        tokens = []
        # Long documents are processed piece by piece; short ones yield a
        # single segment, so both cases share the same loop.
        for segment in self._segments(document):
            for token in nlp(segment):
                if self._is_noise(token):
                    continue
                # Normalization of the token to lowercase lemmas
                tokens.append(str(token.lemma_.lower().strip()))
        return " ".join(tokens)
    
 



In [17]:
import concurrent.futures # module for running functions in parallel
import json
import pandas as pd 
from tqdm import tqdm  # module to create a progress bar



def extract_data(domaine, scrap, cleaner):
    """
    (upgrade)Function to extract data from a line in our json file

    Args:
        domaine (str): Line of the json file; a JSON object with the keys
            'name', 'category' and 'address'
        scrap (Scraping): instance of the Scraping class
        cleaner (TextCleaner): instance used to normalize the scraped text

    Returns:
        dict: dictionary with the keys 'name', 'category', 'address',
            'language' and 'words', or None when the line cannot be parsed,
            the page language is not one of the required languages, the
            page text is shorter than 20 characters, or any error occurs
    """
    # NOTE(review): 'ca' has no dedicated model in TextCleaner and is
    # cleaned with the English pipeline — confirm it belongs here.
    required_lang = ("en", "de", "fr", "ca")

    # Bound before the try so the error message can always be formatted,
    # even when the line fails to parse before the address is known.
    address = None
    try:
        dict_domaine = json.loads(domaine)
        name = dict_domaine['name']
        category = dict_domaine['category']
        address = dict_domaine['address']

        content = dict(scrap.url_contents(address))
        if content["lang"] not in required_lang:
            return None

        raw_text = content['content_text'].strip()
        # Pages with almost no text carry no usable signal.
        if len(raw_text) < 20:
            return None

        text = cleaner.clean_text(raw_text, content['lang'])  # clean up the extracted text
        # return a dictionary containing the extracted information
        return {'name': name, 'category': category, 'address': address, 'language': content['lang'], 'words': text}
    except Exception as e:
        # Best-effort: a failing domain is reported and skipped so that it
        # never interrupts the whole run.
        label = address if address is not None else repr(domaine)
        print(f"Error when extracting data for the domain {label}: {e}")
        return None
    
    
def get_dataSet(file_name: str):
    """
    (upgrade)Read a JSON file containing domain names, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    The file holds one JSON object per line. Lines are processed in
    parallel by 10 threads, so the rows of the result follow completion
    order, not file order.

    Args:
        file_name (str): json file

    Returns:
        Dataframe: Panda Dataframe with extracted data, indexed by a
            0..n-1 index named 'indice'
    """

    # create an instance of the Scraping class and TextCleaner to extract data from each domain
    # NOTE(review): both instances are shared by all worker threads, and
    # TextCleaner fills its model cache without locking — confirm that an
    # occasional duplicate model load is acceptable.
    scrap = Scraping()
    cleaner = TextCleaner()

    # open the JSON file and extract the information; blank lines (such as
    # a trailing newline) are skipped because they can only fail to parse
    with open(file_name, "r", encoding="utf-8") as f:
        domaines = [line for line in f if line.strip()]

    results = []  # initialise the list to store the results
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:  # execute functions in parallel with 10 threads
        futures = [executor.submit(extract_data, domaine, scrap, cleaner) for domaine in domaines]  # submit tasks

        with tqdm(total=len(futures)) as pbar:  # create a progress bar to display progress
            for future in concurrent.futures.as_completed(futures):
                try:
                    result = future.result()
                    if result is not None:  # if the task has been executed successfully, add the result to the list
                        results.append(result)
                except Exception as e:
                    print(f"Error when extracting data: {e}")
                pbar.update(1)  # update of the progress bar

    # Create a pandas Dataframe containing the extracted data
    data = pd.DataFrame(results)
    data['indice'] = range(len(data))
    data = data.set_index('indice')
    return data

# Entry point: build the dataset from the domains listed in 'urls.txt'.
if __name__ == '__main__':
    df = get_dataSet('urls.txt') 

  0%|          | 86/119321 [00:49<8:27:23,  3.92it/s] Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
  0%|          | 148/119321 [01:11<10:20:57,  3.20it/s]Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
  0%|          | 155/119321 [01:13<8:45:26,  3.78it/s] 

In [5]:

# Persist the scraped dataset; index=False leaves the row index out of the file.
df.to_csv("data.csv", index=False)

recall 

In [59]:
# Load a previously saved dataset.
# NOTE(review): this reads "DataSet.csv" while the other cells write and
# read "data.csv" — confirm which file is intended.
df = pd.read_csv("DataSet.csv")

In [9]:
# Reload the saved dataset and count how many distinct languages were detected.
data = pd.read_csv("data.csv")
lang = data.language
lang_u = lang.unique()
len(lang_u)

55

In [16]:
# Column types and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108258 entries, 0 to 108257
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   name      108258 non-null  object
 1   category  108258 non-null  object
 2   address   108258 non-null  object
 3   language  108258 non-null  object
 4   words     107942 non-null  object
dtypes: object(5)
memory usage: 4.1+ MB


In [17]:
# Per-column count, number of unique values, most frequent value and its frequency.
#data["words"].value_counts()
data.describe()

Unnamed: 0,name,category,address,language,words
count,108258,108258,108258,108258,107942
unique,20,250,100791,55,95497
top,Unknown,General Business,https://www.homeless.co.il,en,enable javascript browser order use site click...
freq,40182,38665,5,68443,837


In [18]:
# Look at the rows of the address reported as most frequent by describe().
data_b = data[data.address == "https://www.homeless.co.il"]
data_b.head()

Unnamed: 0,name,category,address,language,words
3951,Finance / Investment,Financial Services / Insurance / Real Estate,https://www.homeless.co.il,he,דירות להשכרה דירות למכירה דירות שותפים נדלן מס...
8575,Finance / Investment,Financial Services / Insurance / Real Estate,https://www.homeless.co.il,he,דירות להשכרה דירות למכירה דירות שותפים נדלן מס...
12977,Finance / Investment,Financial Services / Insurance / Real Estate,https://www.homeless.co.il,he,דירות להשכרה דירות למכירה דירות שותפים נדלן מס...
18942,Finance / Investment,Financial Services / Insurance / Real Estate,https://www.homeless.co.il,he,דירות להשכרה דירות למכירה דירות שותפים נדלן מס...
27115,Finance / Investment,Financial Services / Insurance / Real Estate,https://www.homeless.co.il,he,דירות להשכרה דירות למכירה דירות שותפים נדלן מס...


In [19]:
# Remove rows that are exact duplicates across all columns.
data_unique = data.drop_duplicates()

In [20]:
# Summary after de-duplication.
data_unique.describe()

Unnamed: 0,name,category,address,language,words
count,101514,101514,101514,101514,101212
unique,20,250,100791,55,95497
top,Unknown,General Business,https://dandelife.com,en,enable javascript browser order use site click...
freq,40158,38641,4,63997,825


In [21]:
# Remove rows with a missing value in any column.
data_drop = data.dropna()

In [22]:
# Summary after removing rows with missing values.
data_drop.describe()

Unnamed: 0,name,category,address,language,words
count,107942,107942,107942,107942,107942
unique,20,250,100489,55,95497
top,Unknown,General Business,https://www.homeless.co.il,en,enable javascript browser order use site click...
freq,40004,38489,5,68383,837


In [23]:
# De-duplicate again, this time starting from the frame without missing
# values; this rebinds data_unique.
data_unique = data_drop.drop_duplicates()

In [24]:
# Summary of the frame without missing values and without duplicate rows.
data_unique.describe()

Unnamed: 0,name,category,address,language,words
count,101212,101212,101212,101212,101212
unique,20,250,100489,55,95497
top,Unknown,General Business,https://gp24.pl,en,enable javascript browser order use site click...
freq,39980,38465,4,63939,825


In [51]:
# liste_supr = ['ko', 'th', 'ru', 'tr', 'bg', 'uk', 'af', 'lv', 'el', 'te', 'fi', 'bn', 'hi', 'lt', 'tl',
#        'sw', 'mk', 'sq', 'mr', 'so', 'cy', 'zh-tw', 'ur', 'gu', 'ne',
#        'pa', 'kn', 'sv', 'ja', 'ar', 'zh-cn', 'vi', 'cs', 'hu', 'he',
#        'id', 'it', 'hr', 'da', 'pt', 'sk', 'no', 'ro', 'ml', 'ta', 'et']

# masque = data_unique["language"].isin(liste_supr)
# lignes_a_supprimer = data_unique[masque].index
# data_unique1 = data_unique.drop(lignes_a_supprimer)

In [73]:
# The same address survives de-duplication more than once: compare the
# lengths of its first two distinct cleaned texts.
word1= data_unique[data_unique.address == 'https://gp24.pl'].words.unique()[0] 
word2= data_unique[data_unique.address == 'https://gp24.pl'].words.unique()[1] 
print(len(word1))
print(len(word2))

16917
17041


In [75]:
# Most frequent value of the 'words' column.
data_unique.describe().words.top

'enable javascript browser order use site click link instruction'

In [25]:
# Remove the pages whose cleaned text is only the "enable javascript"
# notice, then display the rows that were removed.
cut = 'enable javascript browser order use site click link instruction'
masque = data_unique['words'] == cut
move_line = data_unique.loc[masque].index
data_unique1 = data_unique.drop(move_line)
# Not the last expression of the cell, so its result is not displayed.
data_unique1.describe()
data_unique.loc[masque]

Unnamed: 0,name,category,address,language,words
347,Criminal Activities,Illegal Activities,https://wildgirls.com,en,enable javascript browser order use site click...
1869,Entertainment / Culture,Cinema / Television,https://livecamsex.com,en,enable javascript browser order use site click...
2400,Entertainment / Culture,Cinema / Television,https://spermcams.com,en,enable javascript browser order use site click...
3318,Finance / Investment,"Banking, Financial Services / Insurance / Real...",https://camboys.com,en,enable javascript browser order use site click...
3368,Finance / Investment,"Banking, Financial Services / Insurance / Real...",https://cums.com,en,enable javascript browser order use site click...
...,...,...,...,...,...
58770,Pornograhpy / Nudity,Pornography,https://ypmate.com,en,enable javascript browser order use site click...
70091,Unknown,General Business,https://cameglelive.com,en,enable javascript browser order use site click...
70128,Unknown,General Business,https://camwhores-tv.com,en,enable javascript browser order use site click...
83495,Unknown,General Business,https://livehdcams.com,en,enable javascript browser order use site click...


In [107]:
# Remove the pages whose cleaned text is exactly this boilerplate text.
cut2 = "domain request domain security detailsthis domain digital publisher control access copyright content accordance digital millennium copyright act understand visitor access copyright content domain accept post request industry standard http port tcp tcp udp traffic originate consumer web browser request contain metric help site owner understanding authorize access site copyright content header return prevent cache discourage proxy intermediary cache store content domain utilize alive connection multiplex multiple request single connection order prevent have open multiple connection request domain use tls high possible certificate rotate frequently content serve domain consist javascript html css video image executable file available content routinely scan malware malicious software hostname fen hoothoot europe west1 w5gn datacenter gce europe west1"

masque1 = data_unique1['words'] == cut2
move_line = data_unique1.loc[masque1].index
data_unique2 = data_unique1.drop(move_line)
data_unique2.describe().words.top
# Sanity check: the mask selects nothing in the filtered frame, so the
# displayed table is empty.
data_unique2.loc[masque1]


Unnamed: 0,name,category,address,language,words


In [112]:
# Remove the pages whose cleaned text is only the default nginx welcome
# page, then show the next most frequent text.
cut3 = 'welcome page nginx web server successfully instal work configuration require online documentation support refer commercial support available thank nginx'
masque2 = data_unique2['words'] == cut3
move_line = data_unique2.loc[masque2].index
data_unique3 = data_unique2.drop(move_line)
data_unique3.describe().words.top

'index modify size description cgi'

In [117]:
# Remove the pages whose cleaned text is only this directory-listing
# boilerplate, then show the next most frequent text.
cut4 = 'index modify size description cgi'
masque3 = data_unique3['words'] == cut4
move_line = data_unique3.loc[masque3].index
data_unique4 = data_unique3.drop(move_line)
data_unique4.describe().words.top

'网站改版中our website construction'

In [123]:
# Remove the pages whose cleaned text is only this "website under
# construction" placeholder.
cut5 = '网站改版中our website construction'
# The mask is built from the frame being filtered (data_unique4), as in the
# other filtering cells, so that mask and frame share the same index.
masque4 = data_unique4['words'] == cut5
move_line = data_unique4.loc[masque4].index
data_unique5 = data_unique4.drop(move_line)
#data_unique4.loc[masque4] 

In [126]:
# Remove the pages whose cleaned text is only this LiteSpeed
# directory-listing boilerplate, then show the next most frequent text.
cut6 = 'index modified size cgi bin proudly serve litespeed web server port'
masque5 = data_unique5['words'] == cut6
move_line = data_unique5.loc[masque5].index
data_unique6 = data_unique5.drop(move_line)
data_unique6.describe().words.top

'erstell deinen kostenlos account model live sex webcams verfügbar tritt weltweit groß webcam community'

In [None]:
# Number of distinct categories in df.
categorie = df["category"].unique()
len(categorie)

257

In [None]:
#first without threads
import json
import pandas as pd
from tqdm import tqdm
from typing import List

def get_dataSet(file_name: str, limit=50):
    """
    Read a JSON file containing domain names, extract the keywords associated with each address and return a pandas Dataframe containing the extracted information

    Sequential version (no threads). The file holds one JSON object per
    line with the keys 'name', 'category' and 'address'.

    Args:
        file_name (str): json file
        limit (int | None): number of leading lines to process; None
            processes the whole file. Defaults to 50.

    Returns:
        Dataframe: Panda Dataframe with the columns 'name', 'category',
            'address' and 'words'. 'words' is None for pages that could not
            be scraped; lines that raise an error are reported and left out.
    """

    # creates an instance of the Scraping class
    scrap = Scraping()
    clean = TextCleaner()

    # open the JSON file and extract the information
    with open(file_name, "r", encoding="utf-8") as f:
        domaines = f.readlines()
    if limit is not None:
        domaines = domaines[:limit]

    # One dictionary per processed domain; building whole rows keeps the
    # columns the same length whatever happens to a single page.
    records = []

    # Iterate through each domain name, extract the keywords associated with the address and store the row
    # (the progress-bar total matches the number of lines actually processed)
    for index, domaine in tqdm(enumerate(domaines), total=len(domaines)):
        try:
            dict_domaine = json.loads(domaine)
            name = dict_domaine['name']
            category = dict_domaine['category']
            address = dict_domaine['address']
            content = dict(scrap.url_contents(address))

            # url_contents reports a failed page with the string "None"
            words = None
            if content["lang"] != "None":
                words = clean.clean_text(content['content_text'], content['lang'])

            records.append({'name': name, 'category': category, 'address': address, 'words': words})
        except Exception as e:
            print(f"Erreur lors de l'extraction des données pour le domaine à l'index {index}: {e}")

    # Create a pandas Dataframe containing the extracted data
    return pd.DataFrame(records, columns=['name', 'category', 'address', 'words'])


# Build the dataset from the domains listed in 'urls.txt'.
df = get_dataSet('urls.txt')



In [54]:
# Smoke test on a site that answers with the "enable javascript" notice:
# scrape it and print the detected language and the cleaned text.
scrap = Scraping()
clean = TextCleaner()

url = "https://wildgirls.com"

# url_contents returns a pandas Series; dict() gives plain key access.
content = dict(scrap.url_contents(url))

#print(content)
print(content["lang"])

# url_contents reports a failed page with the string "None".
if content["lang"] != "None":
    print(content["lang"])

    text = clean.clean_text(content['content_text'], content['lang'])
    print(text)



en
en
enable javascript browser order use site click link instruction
