In [1]:
import requests


In [1]:
# First version: filters a plain text list of domain names (one per line)
import requests
import concurrent.futures # to run multiple tasks in parallel
from tqdm import tqdm

def get_valid_domains(file_name, new_file_name):
    """Filter a list of domain names down to those accepted by ``test_domain``.

    Each line of the input is stripped; lines that start with 'www.' or that
    contain exactly one dot are turned into 'https://...' URLs and checked
    concurrently. Every URL that ``test_domain`` returns is written to the
    output file, one per line, in completion order.

    Args:
        file_name (str): file containing all domain names, one per line
        new_file_name (str): file receiving the accepted URLs
    """

    candidate_urls = set()  # a set, so a repeated line is only checked once
    with open(file_name) as source:
        for raw_line in source:
            host = raw_line.strip()
            # Skip anything that is neither a 'www.' host nor a one-dot name
            if not (host.startswith('www.') or host.count('.') == 1):
                continue
            candidate_urls.add(f"https://{host}")

    with open(new_file_name, 'w') as sink:
        # Thread pool: the checks are network-bound, so they run in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            pending = []
            for url in candidate_urls:
                pending.append(executor.submit(test_domain, url))
            with tqdm(total=len(candidate_urls)) as progress:
                for finished in concurrent.futures.as_completed(pending):
                    accepted = finished.result()
                    if accepted:  # None means the URL was rejected
                        sink.write(accepted + '\n')
                    progress.update()  # one tick per completed check
                    
def test_domain(domain):  
    """Definition of the function to test the validity of the domain name

    Args:
        domain (str): domain name

    Returns:
        str: valid domain name
    """
    
    if domain.startswith('https://www.'):
        return domain
    try:
        with requests.head(domain, timeout=5) as response: 
            if response.status_code == 200:
                return domain
    except Exception:
        pass

# Run the filter only when this code is executed directly (not imported).
if __name__ == "__main__":
    get_valid_domains(file_name='news.txt', new_file_name='urls_news.txt')


100%|██████████| 2531/2531 [04:18<00:00,  9.78it/s]


In [1]:
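# Updated version: reads pipe-separated records (name | category | address) and writes one JSON object per line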

import requests
import concurrent.futures 
from tqdm import tqdm 
import json

def get_valid_domains(file_name, new_file_name, first_line=2, last_line=1185697):
    """(Update) Extract the domain records whose address passes ``test_domain``.

    The input is read as pipe-separated rows (``name | category | address``).
    Rows whose address starts with 'www.' or contains exactly one dot are
    submitted to ``test_domain`` in a thread pool; every accepted row is
    written to the output as one JSON object per line, in completion order,
    with 'https://' prepended to the address when it is missing.

    Args:
        file_name (str): file containing all domain records
        new_file_name (str): file receiving the accepted records (JSON lines)
        first_line (int): 0-based index of the first line that is parsed.
            The default skips the first two lines (presumably a header and
            a separator row; confirm against the input file).
        last_line (int | None): 0-based index at which parsing stops
            (exclusive); ``None`` reads to the end of the file. The default
            is the bound the notebook was originally run with.
    """

    # Open the file containing the domain records and extract the information
    with open(file_name, "r") as f:
        lines = f.read().split("\n")[first_line:last_line]

    records = []
    for line in lines:
        columns = line.split("|")
        if len(columns) < 3:
            # Blank, footer or otherwise malformed line: skip it instead of
            # failing with IndexError part-way through the file
            continue
        records.append({
            "name": columns[0].strip(),
            "category": columns[1].strip(),
            "address": columns[2].strip(),
        })

    # Keep only 'www.' hosts and names containing exactly one dot
    valid_domains = [
        record for record in records
        if record['address'].startswith('www.') or record['address'].count('.') == 1
    ]

    with open(new_file_name, 'w') as f:
        # Create an execution pool to run the checks in parallel
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            # Remember which record each future belongs to. This replaces a
            # full scan of valid_domains per completed future (quadratic) and
            # makes rows that share an address each get written as themselves.
            future_to_record = {
                executor.submit(test_domain, record['address']): record
                for record in valid_domains
            }
            with tqdm(total=len(valid_domains)) as pbar:  # progress bar
                for future in concurrent.futures.as_completed(future_to_record):
                    result = future.result()
                    if result:  # None means the address was rejected
                        record = future_to_record[future]
                        # add 'https://' if the address does not start with it
                        if result.startswith('https://'):
                            address = result
                        else:
                            address = f'https://{result}'
                        # write the result to the file in JSON-lines format
                        json.dump(
                            {"name": record['name'], "category": record['category'], "address": address},
                            f,
                        )
                        f.write('\n')
                    pbar.update()

                    
def test_domain(domain):  
    """Definition of the function to test the validity of the domain name

    Args:
        domain (str): domain name

    Returns:
        str: valid domain name
    """
    
    # Check if the domain name starts with 'https://www.' and return it if it is valid
    if domain.startswith('https://www.'):
        return domain
    # Otherwise, test the validity of the domain name by making an HTTP HEAD request
    try:
        with requests.head(f"https://{domain}", timeout=5) as response: 
            if response.status_code == 200:
                return domain
    except Exception:
        pass

# Run the filter only when this code is executed directly (not imported).
if __name__ == "__main__":
    get_valid_domains(file_name='domaine.txt', new_file_name='urls_valid.txt')

100%|██████████| 544546/544546 [13:11:28<00:00, 11.47it/s]   
