In [1]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import requests
from collections import Counter
import pandas as pd
import os
import bs4

In [2]:
class Scraping:
    def url_content(self, url):
        '''
        return de content of web page
        
        '''
        headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"}


        
        # content = requests.get(url, timeout=60)
        # soup = BeautifulSoup(content, "html.parser")
        
        response = requests.get(url, timeout=60, headers= headers)

        if response.status_code != 200:
            print(response.status_code)
            result = {
                "lang": "None",
                "url": "None",
                "website_name": "None",
                "content_text": "None"
            }
            return pd.Series(result)
        else:
            soup = BeautifulSoup(response.text, "html.parser")
        
            result = {
                "lang": self.get_language(soup),
                "url": url,
                "website_name": self.get_website_name(url),
                "content_text": self.get_title(soup) + self.get_meta(soup) + self.get_header(soup) + self.get_content(soup)
            }
        
        # return a objet pandas
        return pd.Series(result)
    
    def get_website_name(self, url):
        '''return the name that located of the url'''
        
        return "".join(urlparse(url).netloc.split(".")[-2])
    
    def get_title(self, soup):
        
        return " ".join(soup.title.contents)
    
    def get_language(self, soup):
        return soup.html.attrs['lang']
    
    def get_meta(self, soup):
        
        tags = soup.find_all(lambda tag: (tag.name=='meta') & (tag.has_attr('name') & tag.has_attr('content')))
        
        content = [str(tag['content']) for tag in tags if tag['name'] in ['keywords', 'description']]
        return " ".join(content)
    
    def get_header(self, soup):
        
        tags = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        content = [" ".join(tag.stripped_strings) for tag in tags]
        return " ".join(content)
    
    def get_content(self, soup):
        tags_to_ignore = ["h1", "h2", "h3", "h4", "h5","h6", "noscript", "style", "script", "head", "title", "meta", "[document]"]
        tags = soup.find_all(text=True)
        result = []
        for tag in tags:
            stripped_tag = tag.strip()
            if (
                tag.parent.name not in tags_to_ignore
                and not isinstance(tag, bs4.element.Comment)
                and not stripped_tag.isnumeric()
                and len(stripped_tag) > 0
            ):
                result.append(stripped_tag)
                return " ".join(result)
        
        

In [3]:
import spacy as sp
from collections import Counter

import re
sp.prefer_gpu()
nlp_en = sp.load("en_core_web_sm")
nlp_fr = sp.load("fr_core_news_sm")
nlp_de = sp.load("de_core_news_sm")

def clean_text(document, lang):
    lang_franz = ["fr", "fr-be", "fr-ca", "fr-ch", "fr-lu", "fr-FR"]
    lang_deutsch = ["de", "de-ch", "de-at", "de-lu", "de-li"]
               

    if lang in lang_franz:
        doc = nlp_fr(document)
        print("****************** Francais ****************************")
    elif lang in lang_deutsch:
        doc = nlp_de(document)
        print("****************** Deutsch ****************************")
    else:
        doc = nlp_en(document)
        print("****************** Autres ****************************")
    
    
    tokens = []
    exclusion_list = ["nan"]
    
    
    for token in doc:
        if token.is_stop or token.is_punct or token.text.isnumeric() or (token.text.isalnum()==False) or token.text in exclusion_list:
            continue

        
        token = str(token.lemma_.lower().strip())
        tokens.append(token)
    return " ".join(tokens)



In [12]:
url = "https://www.dnaindia.com/lifestyle/photo-gallery-inside-photos-of-mukesh-ambani-nita-ambani-luxurious-dubai-villa-worth-rs-650-crore-swimming-pool-interiors-3027060"

scrap = Scraping()
content = dict(scrap.url_content(url))

#print(content)
print()

if content["lang"] != "None":
    text = clean_text(content['content_text'], content['lang'])
    print(text)




****************** Autres ****************************
step inside mukesh ambani sea face ultra luxurious dubai villa worth rs crorein business tycoon mukesh ambani buy luxurious property palm jumeirah dubai worth r crore mukesh ambani mukesh ambani news ambani news nita ambani isha ambani isha ambani news ambani home photo mukesh ambani home photo antilia home photo ambani dubai home photosstep inside mukesh ambani sea face ultra luxurious dubai villa worth rs crore accord report reliance industries limited chairman managing director mukesh ambani buy beachside villa dubai usd million dubai villa mukesh ambani locate northern palm jumeirah luxurious property bedroom private spa swimming pool accord villa furnish italian marble royal masterpiece luxurious property spread area sq ft plot report mukesh ambani villa exterior swimming pool huge pool inside property trend news topics popular stories aditi rao hydari recreate viral tum tum dance rumour boyfriend siddharth watch crow man ind