In [None]:
""" Variáveis globais """

# Substituir posteriormente como um parâmetro enviado pelo comando.
# SEEDS = './Seeds/seeds-2024711370.txt'
# LIMIT = 1000
# DEBUG_MODE = True

# Variáveis de controle
# DELAY = 100
# MAX_THREADS = 10
# visited_URLs = set()
# frontier = {}
# trash_can = {}

In [None]:
""" Helper functions: Timestamp, Node, Minimal URLs, Debug """


def get_timestamp():
    """Return the current time as whole seconds since the Unix epoch (1970).

    Returns:
        int: The current timestamp, with the fractional part discarded.
    """
    now = datetime.datetime.now()
    seconds_since_epoch = now.timestamp()
    return int(seconds_since_epoch)


def get_node():
    """Build and return a fresh, empty node.

    Every call creates new container objects, so nodes never share state.

    Returns:
        dict: A node whose 'Timestamp' is the creation time (from
        ``get_timestamp``), whose 'quantity' is 0, whose four bookkeeping
        entries are empty dicts and whose remaining fields are ``None``.
    """
    created_at = get_timestamp()
    return {
        # Capitalised fields.
        'URL': None,
        'Title': None,
        'Timestamp': created_at,
        'Text': None,

        # Fetch details (NOTE(review): 'url' duplicates 'URL' above with a
        # different case; confirm which one consumers read).
        'time_elapsed': None,
        'raw_content': None,
        'quantity': 0,
        'url': None,

        # Per-node bookkeeping containers, all empty at creation.
        'visited': {},
        'frontier': {},
        'trash_can': {},
        'children': {},
    }


def get_minimal_url(URL):
    """Placeholder for URL minimisation; currently always returns an empty list.

    The intent is to find the minimal URL that leads to the same content,
    treating the leading ``http(s)://`` scheme and ``www.`` prefix as
    optional, i.e. prefixes of the form ``htt(p|ps)://(www.|)``.

    Args:
        URL: The address to minimise. It is not used yet.

    Returns:
        list: A new empty list on every call.
    """
    # TODO: build the candidate variants of URL (bare, 'www.', 'https://',
    # 'https://www.', 'http://', 'http://www.') and select the minimal one.
    return []


def debug_as_json(node):
    """Print ``node`` to stdout as JSON indented by 4 spaces (debugging aid).

    Values that ``json`` cannot serialise natively are rendered with
    ``str()`` instead of raising ``TypeError``, so dumping a structure that
    holds arbitrary objects does not abort the run.

    Args:
        node: The object to dump, e.g. a node dict.

    Returns:
        None. The JSON text is printed, not returned.
    """
    # default=str is deliberate: for a debug dump a lossy string is
    # preferable to a crash on a non-serialisable value.
    print(json.dumps(node, indent=4, default=str))

In [None]:
""" Visiting seeds """


def visit_seed(seed):
    """Fetch ``seed`` and store its HTML content in a new node.

    A page is accepted only when the server answers 200 with a
    ``text/html`` content type. Rejected responses are recorded in the
    global ``trash_can`` under the seed URL.

    Args:
        seed: URL to fetch.

    Returns:
        dict | None: The populated node ('url', 'quantity', 'raw_content'
        and 'time_elapsed' in microseconds are filled in) on success;
        ``None`` when the request fails or the response is rejected.
    """
    try:
        response = requests.get(seed, timeout=5)
    except requests.RequestException as e:
        if DEBUG_MODE:
            print(f'Error visiting {seed}: {e}')
        return None

    # Media types are case-insensitive and may carry padding before ';'.
    content_type = response.headers.get('Content-Type', '')
    mime = content_type.split(';')[0].strip().lower()

    if response.status_code != 200 or mime != 'text/html':
        trash_can[seed] = response
        if DEBUG_MODE:
            print(f'Failed to visit {seed}: {response.status_code}')
        return None

    new_node = get_node()
    new_node['url'] = seed
    new_node['quantity'] += 1
    new_node['raw_content'] = response.text
    # timedelta.microseconds is only the sub-second component; use the full
    # duration so requests that take longer than one second are not truncated.
    new_node['time_elapsed'] = round(
        response.elapsed.total_seconds() * 1_000_000)
    return new_node


# for seed in base_seeds:
#     if DEBUG_MODE:
#         print(f'Seed: {seed}')
#     visited_URLs.add(seed)
#     new_node = visit_seed(seed)
#     debug_as_json(new_node)

In [None]:
""" Processing sitemaps and robots.txt """


def get_sitemap(seed):
    """Download ``<seed>/sitemap.xml`` and return its text.

    Responses with a status other than 200 are recorded in the global
    ``trash_can`` under the sitemap URL.

    Args:
        seed: Base URL of the site; a trailing slash is tolerated.

    Returns:
        str | None: The sitemap body on HTTP 200, otherwise ``None``.
    """
    # Strip trailing slashes so 'https://host/' does not yield '//sitemap.xml'.
    sitemap_url = seed.rstrip('/') + '/sitemap.xml'
    try:
        response = requests.get(sitemap_url, timeout=5)
    except requests.RequestException as e:
        if DEBUG_MODE:
            print(f'Error getting sitemap for {seed}: {e}')
        return None

    if response.status_code == 200:
        return response.text

    trash_can[sitemap_url] = response
    if DEBUG_MODE:
        print(
            f'Failed to get sitemap for {seed}: {response.status_code}')
    return None


def get_robots_txt(seed):
    """Download the robots.txt of the site that hosts ``seed``.

    The file is always requested from the root of the seed's host
    (``scheme://host/robots.txt``), regardless of any path, query string or
    trailing slash in ``seed``. Responses with a status other than 200 are
    recorded in the global ``trash_can`` under the robots.txt URL.

    Args:
        seed: Absolute URL of the site or of any page on it.

    Returns:
        str | None: The robots.txt body on HTTP 200, otherwise ``None``.
    """
    from urllib.parse import urljoin

    # A leading '/' makes urljoin replace the seed's whole path, which is
    # where crawlers are expected to find robots.txt.
    robots_url = urljoin(seed, '/robots.txt')
    try:
        response = requests.get(robots_url, timeout=5)
    except requests.RequestException as e:
        if DEBUG_MODE:
            print(f'Error getting robots.txt for {seed}: {e}')
        return None

    if response.status_code == 200:
        return response.text

    trash_can[robots_url] = response
    if DEBUG_MODE:
        print(
            f'Failed to get robots.txt for {seed}: {response.status_code}')
    return None


def get_robots_rules(robots_txt):
    """Parse the text of a robots.txt file into a rules dictionary.

    Directive names are matched case-insensitively and ``#`` comments are
    ignored. Consecutive ``User-agent`` lines form one group, and the
    ``Allow``/``Disallow`` lines that follow apply to every agent in that
    group. Rules that appear before any ``User-agent`` line are ignored.
    ``Sitemap`` lines are collected regardless of the current group.

    Args:
        robots_txt: Content of the robots.txt file. Anything that is not a
            string (e.g. ``None`` after a failed download) yields empty
            rules instead of raising.

    Returns:
        dict: ``{'sitemaps': set_of_urls, 'user_agents': {agent:
        {'allow': [paths], 'disallow': [paths]}}}``.
    """
    rules = {'sitemaps': set(), 'user_agents': {}}
    if not isinstance(robots_txt, str):
        return rules

    current_agents = []
    # True while reading a run of consecutive User-agent lines.
    in_agent_header = False

    for raw_line in robots_txt.splitlines():
        line = raw_line.split('#', 1)[0].strip()
        if not line:
            continue

        # Split on the FIRST colon only, so values that contain ':' (URLs,
        # paths) stay intact and a missing space after ':' is harmless.
        field, separator, value = line.partition(':')
        if not separator:
            continue
        field = field.strip().lower()
        value = value.strip()

        if field == 'user-agent':
            if not in_agent_header:
                # A User-agent line after rules starts a new group.
                current_agents = []
                in_agent_header = True
            if value and value not in current_agents:
                current_agents.append(value)
                # setdefault keeps rules from an earlier group of this agent.
                rules['user_agents'].setdefault(
                    value, {'allow': [], 'disallow': []})
        elif field in ('allow', 'disallow'):
            in_agent_header = False
            for agent in current_agents:
                rules['user_agents'][agent][field].append(value)
        elif field == 'sitemap' and value:
            rules['sitemaps'].add(value)

    return rules


def process_sitemaps(robots_rules):
    """Download every sitemap listed in ``robots_rules`` and print it parsed.

    For each URL in ``robots_rules['sitemaps']`` the sitemap is fetched, its
    MIME type is printed and, on HTTP 200, the content is parsed with
    BeautifulSoup ('lxml' parser) and printed. Request errors are reported
    only when ``DEBUG_MODE`` is on and never interrupt the loop.

    Args:
        robots_rules: Rules dict holding the sitemap URLs under 'sitemaps'.

    Returns:
        None.
    """
    for sitemap in robots_rules['sitemaps']:
        try:
            response = requests.get(sitemap, timeout=5)
            content_type = response.headers.get('Content-Type', '')
            mime = content_type.split(';')[0]
            print(f'Processing sitemap: {sitemap}; mime: {mime}')
            # MIME types seen so far: text/xml, application/xml
            if response.status_code != 200:
                continue
            soup = bs.BeautifulSoup(response.content, 'lxml')
            print(soup)
            # TODO: extract the text of every <loc> tag, visit each URL not
            # yet in visited_URLs through visit_seed, and record responses
            # with a status other than 200 in trash_can.
        except requests.RequestException as e:
            if DEBUG_MODE:
                print(f'Error processing sitemap {sitemap}: {e}')


# print(base_seeds)

def process_seed(seed):
    """Process one seed: fetch its robots.txt and walk the sitemaps it lists.

    The original note says this is meant to run once per seed in its own
    thread; nothing in this function itself starts a thread.

    Args:
        seed: URL of the site to process.

    Returns:
        None.
    """
    if DEBUG_MODE:
        print(f'Processing seed: {seed}')

    robots_txt = get_robots_txt(seed)
    # get_robots_txt only yields the body text on HTTP 200. Any other result
    # is not parseable and would crash get_robots_rules, so stop here.
    if not isinstance(robots_txt, str):
        if DEBUG_MODE:
            print(f'No robots.txt available for {seed}; skipping sitemaps')
        return

    robots_rules = get_robots_rules(robots_txt)
    process_sitemaps(robots_rules)


# for seed in base_seeds:
#     process_seed(seed)
#     print(100*'=')