In [1]:
from bs4 import BeautifulSoup
import requests
import re

In [2]:
def url_add_http(url):
    """
    Ensure ``url`` starts with an explicit web scheme.

    Args:
        url: A URL or bare host name such as ``"example.com"``.

    Returns:
        ``url`` unchanged when it already begins with ``http://`` or
        ``https://`` (matched case-insensitively); otherwise ``url`` prefixed
        with ``https://``.
    """
    # Recognise both schemes: prefixing an "http://..." URL with "https://"
    # would produce the unusable "https://http://...".
    if re.match(r"https?://", url, flags=re.IGNORECASE):
        return url
    return f"https://{url}"

In [4]:
def url_test(url):
    """Tell whether ``url`` serves a usable page.

    The URL is normalised with ``url_add_http`` and fetched with a browser-like
    User-Agent and a 60 second timeout. If anything in that first attempt
    raises, the request is repeated once with redirects disabled; an exception
    from the retry propagates to the caller.

    Returns:
        ``True`` only when all of the following hold: the words of the page
        body differ (as a set, ignoring the last word) from the words of the
        known redirect notice, the status code is 200, and the document has at
        least one ``<head>`` element.
    """
    # Text shown by pages that merely forward the visitor to another site.
    redirect_notice = "If you are not redirected automatically, follow the www.ioam.de"

    request_headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
    }

    try:
        url = url_add_http(url)
        response = requests.get(url, timeout=60, headers=request_headers)
    except Exception:
        # Fallback intended for an exceeded redirect limit: fetch the first
        # response only, without following redirects.
        response = requests.get(
            url, timeout=60, headers=request_headers, allow_redirects=False
        )

    soup = BeautifulSoup(response.text, "html.parser")
    head_tags = soup.find_all("head")

    # Flatten the visible text of every <body> into one space-separated string.
    body_text = " ".join(
        " ".join(body.stripped_strings) for body in soup.find_all("body")
    )

    # Both word lists drop their final token before being compared as sets.
    page_words = set(body_text.split(" ")[:-1])
    notice_words = set(redirect_notice.split(" ")[:-1])

    if page_words == notice_words:
        return False
    if response.status_code != 200:
        return False
    return len(head_tags) > 0

In [3]:
def reformat_url(url):
    """Reduce a host name to its last two dot-separated labels.

    For example ``"www.example.com"`` becomes ``"example.com"``.

    Args:
        url: Host name (or URL) to shorten.

    Returns:
        The last two ``.``-separated parts of ``url`` joined by a dot. Input
        with fewer than two parts (no dot at all, e.g. ``"localhost"``) is
        returned unchanged instead of raising ``IndexError``.

    Note:
        Only the final two labels are kept, so multi-label suffixes are cut
        short (``"example.co.uk"`` yields ``"co.uk"``).
    """
    labels = url.split(".")
    if len(labels) < 2:
        # Nothing to strip: there is no sub-domain part to remove.
        return url
    return ".".join(labels[-2:])

In [4]:
def get_url(url):
    """Pick the URL whose page content will be parsed.

    When ``url_test`` accepts ``url`` it is used as given; otherwise it is
    shortened with ``reformat_url``. Either way the result is passed through
    ``url_add_http`` so it carries a scheme.
    """
    candidate = url if url_test(url) else reformat_url(url)
    return url_add_http(candidate)

In [51]:
def _extracted_from_get_url2(response, url):

    return len(response.content)>10 and response.status_code == 200 


def get_url2(domains, min_content_length=10, timeout=60):  # sourcery skip: use-contextlib-suppress
    """Sequentially probe ``domains`` and keep one URL per distinct page body.

    Each domain is stripped of surrounding whitespace, normalised with
    ``url_add_http`` and fetched once with a browser-like User-Agent. A URL is
    kept when the response has status 200, its body is longer than
    ``min_content_length`` bytes, and no earlier URL returned the same body.

    Args:
        domains: Iterable of domain strings.
        min_content_length: Bodies of this many bytes or fewer are ignored.
        timeout: Per-request timeout in seconds.

    Returns:
        List of accepted URLs in the order they were first seen.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
    }
    unique_urls = {}        # insertion-ordered set of accepted URLs
    seen_contents = set()   # bodies already attributed to an accepted URL

    for domain in domains:
        domain = domain.strip()
        try:
            url = url_add_http(domain)
            response = requests.get(url, timeout=timeout, headers=headers)
            # Reuse this response's body: fetching the URL a second time
            # (without timeout or headers) could hang forever and could
            # return a different body than the one validated below.
            content = response.content
        except Exception:
            # Best-effort crawl: an unreachable or malformed domain is skipped
            # so the remaining domains are still probed.
            continue

        if response.status_code != 200 or len(content) <= min_content_length:
            continue

        if content not in seen_contents:
            seen_contents.add(content)
            unique_urls.setdefault(url, None)

    return list(unique_urls)



In [71]:
import concurrent.futures
import requests

def get_url3(domains, min_content_length=500, timeout=60):    # sourcery skip: use-contextlib-suppress
    """Probe ``domains`` concurrently and keep one URL per distinct page body.

    Each domain is stripped of surrounding whitespace, normalised with
    ``url_add_http`` and fetched once, in a thread pool, with a browser-like
    User-Agent. A URL is kept when the response has status 200, its body is
    longer than ``min_content_length`` bytes, and no earlier domain (in input
    order) returned the same body.

    Args:
        domains: Iterable of domain strings.
        min_content_length: Bodies of this many bytes or fewer are ignored.
        timeout: Per-request timeout in seconds.

    Returns:
        List of accepted URLs, ordered like ``domains``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
    }

    def fetch(domain):
        """Return ``(url, body)`` for an acceptable page, otherwise ``None``."""
        try:
            url = url_add_http(domain.strip())
            response = requests.get(url, timeout=timeout, headers=headers)
            # Reuse this response's body rather than requesting the URL a
            # second time without a timeout.
            content = response.content
            if response.status_code == 200 and len(content) > min_content_length:
                return url, content
        except Exception:
            # Best-effort crawl: a failing domain must not abort the others.
            pass
        return None

    unique_urls = {}        # insertion-ordered set of accepted URLs
    seen_contents = set()   # bodies already attributed to an accepted URL

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Workers only fetch. De-duplication happens here, in the calling
        # thread, so no shared state is mutated concurrently, and
        # executor.map yields results in input order, making the output
        # deterministic.
        for fetched in executor.map(fetch, domains):
            if fetched is None:
                continue
            url, content = fetched
            if content not in seen_contents:
                seen_contents.add(content)
                unique_urls.setdefault(url, None)

    return list(unique_urls)

In [84]:
# Spot check: print the size in bytes of the body returned by a single host.
# NOTE(review): presumably used to calibrate the minimum-length threshold in
# get_url3 — confirm.
url = "https://accounts.indianexpress.com"
# A timeout keeps an unresponsive host from blocking the kernel indefinitely.
response = requests.get(url, timeout=60)
content = response.content
print(len(content))

12302


In [48]:
def single_url(urls, timeout=60):
    """Keep one URL per distinct page body.

    Every URL is fetched in order; a URL is kept only when no earlier URL
    returned exactly the same response body.

    Args:
        urls: Iterable of URLs to fetch.
        timeout: Per-request timeout in seconds, so that an unresponsive host
            cannot block the call forever.

    Returns:
        List of kept URLs in the order they were first seen.

    Raises:
        Whatever ``requests.get`` raises for a failing URL; errors are not
        suppressed here.
    """
    unique_urls = {}        # insertion-ordered set of kept URLs
    seen_contents = set()   # bodies already attributed to a kept URL

    for url in urls:
        content = requests.get(url, timeout=timeout).content

        # Set membership replaces a linear scan over all stored bodies.
        if content not in seen_contents:
            seen_contents.add(content)
            unique_urls.setdefault(url, None)

    return list(unique_urls)
    

In [81]:
# Read the whole domain list into memory; the file is closed as soon as the
# text has been read.
with open("news.txt",'r') as f:
    data = f.read()

# One entry per line, skipping the first two lines of the file.
domain = data.split('\n')[2:]

# Probe only the first 200 entries.
urls = get_url3(domain[:200])

#urls_news = single_url(urls)


In [83]:
# NOTE(review): the marker below presumably records the wall-clock time of the
# crawl cell (about 1 min 12.1 s) — confirm.
#1m12,1
print(urls)


['https://www.heise.de', 'https://api.livestrong.com', 'https://bilder1.n-tv.de', 'https://heise.de', 'https://g.msn.com', 'https://cdn.magazin.spiegel.de', 'https://assets.tagesspiegel.de', 'https://content.chip.de', 'https://css.etimg.com', 'https://de.euronews.com', 'https://de.yahoo.com', 'https://efahrer.chip.de', 'https://gutscheine.chip.de', 'https://bilder.t-online.de', 'https://en.wikinews.org', 'https://freemail.t-online.de', 'https://pur.familie.de', 'https://img.welt.de', 'https://mein.tagesspiegel.de', 'https://plus.tagesspiegel.de', 'https://pur.giga.de', 'https://pur.spieletipps.de', 'https://m.tagesspiegel.de', 'https://pur.t-online.de', 'https://news.google.com', 'https://spiele.spiegel.de', 'https://login.t-online.de', 'https://static.euronews.com', 'https://static.up.welt.de', 'https://vergleich.tagesspiegel.de', 'https://web.de', 'https://tarifbestellen.t-online.de', 'https://www.berliner-zeitung.de', 'https://www.ka-news.de', 'https://www.channelpartner.de', 'https

In [43]:
# Discard placeholder entries so that only real URL strings remain.
# NOTE(review): get_url3 as defined in this notebook never yields None, so this
# is presumably left over from an earlier crawler version — confirm.
urls = [candidate for candidate in urls if candidate is not None]
#urls_news = single_url(urls)

In [33]:
# Discard placeholder entries, then display the remaining URLs.
# NOTE(review): this repeats the filter from the previous cell — confirm
# whether both cells are still needed.
urls = [candidate for candidate in urls if candidate is not None]
urls

['https://www.heise.de',
 'https://heise.de',
 'https://g.msn.com',
 'https://api-app.wetteronline.de',
 'https://api.livestrong.com',
 'https://cdn.magazin.spiegel.de',
 'https://bilder1.n-tv.de',
 'https://css.etimg.com',
 'https://data-7462ea72ec.augsburger-allgemeine.de',
 'https://data-92cf33b2ed.faz.net',
 'https://data-e3d4300b49.n-tv.de',
 'https://data-f1e447fbcf.fr.de',
 'https://etelection.indiatimes.com',
 'https://apps-cloud.n-tv.de',
 'https://bilder.t-online.de',
 'https://freemail.t-online.de',
 'https://data-af9f3dfb33.zeit.de',
 'https://data-fb7f8b3ae8.heise.de',
 'https://de.euronews.com',
 'https://de.yahoo.com',
 'https://efahrer.chip.de',
 'https://gutscheine.chip.de',
 'https://en.wikinews.org',
 'https://flipboard.com',
 'https://img.welt.de',
 'https://plus.tagesspiegel.de',
 'https://login.t-online.de',
 'https://m.tagesspiegel.de',
 'https://mein.tagesspiegel.de']