In [1]:
from selenium.webdriver.firefox.service import Service
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm

import funcs.ArticlePage as ArticlePage
import funcs.AuthorPage as AuthorPage
import funcs.Common as Common
import networkx as nx
import importlib
import time


In [2]:
firefox_service = Service()
web_driver = webdriver.Firefox(service = firefox_service)
url = "https://www.scopus.com//authid/detail.uri?authorId=10140494200"
url_prefix = "https://www.scopus.com/authid/detail.uri?authorId="

In [3]:
web_driver.get(url)

In [32]:
importlib.reload(ArticlePage)
_ = importlib.reload(AuthorPage)


In [33]:
# author_graph = nx.gml.read_gml("../gmls/140_authors.gml")
author_graph = nx.gml.read_gml("../gmls/gml_2_iterations.gml", destringizer=int)
nodes = dict(author_graph.nodes())

current_layer = set()
explored_ids = set()

for n in nodes:
	if('hIndex' in nodes[n]):
		explored_ids.add(nodes[n]['ID'])
	else:
		current_layer.add(n)

print(f"{len(explored_ids)} | {len(current_layer)}")


108 | 12171


In [34]:
MAX_TRIES = 2

progress_bar = tqdm(current_layer)
failed_articles = 0
failed_authors = 0

for author_id in progress_bar:
	if((len(explored_ids)%10) == 0):
		nx.write_gml(author_graph, f"../gmls/{len(explored_ids)}_authors.gml")

	progress_bar.set_description(f"Author {author_id} Articles {0:3d}% Fails {failed_authors:2d}|{failed_articles:<3d}", refresh=True)
	
	success = False
	attempts = 0
	while not success and attempts < MAX_TRIES: #Loop de tentativas para lidar com erros de carregamento (404, 500, etc.)
		try:
			web_driver.get(url_prefix + str(author_id)) #Acessa página do autor
			time.sleep(4)
			AuthorPage.add_author(web_driver, author_id, author_graph) #Adiciona autor e dados ao grafo
			explored_ids.add(author_id) #Adiciona autor aos visitados
			success = True
		except KeyboardInterrupt:
			raise
		except:
			attempts+=1

	if(not success): #Lida com falha de obter autor
		failed_authors+=1
		with open("Fails\\FailedList.txt", "a", encoding="utf-8") as log_file:
			log_file.write(f"Failed Author for {author_id}\n")

		try:
			soup=BeautifulSoup(web_driver.page_source,'lxml')
			file=open(f"Fails\\Author_{author_id}.html", "w", encoding="utf-8")
			file.write(str(soup))
			file.close()
		except KeyboardInterrupt:
			raise
		except:
			pass
		continue #Pula autor

	articles = AuthorPage.get_author_articles(web_driver, sleep_time=4) #Lista de artigos do pesquisador

	number_of_articles = len(articles)
	for current_article_index, article_url in enumerate(articles):
		progress_bar.set_description(f"Author:{author_id} Articles:{(100*current_article_index)//number_of_articles}% Fails {failed_authors:2d}|{failed_articles:<3d}", refresh=True)
		attempts = 0
		success = False
		while not success and attempts < MAX_TRIES: #Loop de tentativas para lidar com erros de carregamento (404, 500, etc.)
			try:
				web_driver.get(article_url) #Acessa página do Artigo
				time.sleep(0.8)
				ArticlePage.add_article_without_next_layer(web_driver, explored_ids, author_graph, author_id) #Adiciona conexões formadas pelo artigo
				success = True
			except KeyboardInterrupt:
				raise
			except:
				attempts+=1
			
			if(not success): #Lida com falha de obter artigo
				failed_articles+=1
				with open("Fails\\FailedList.txt", "a", encoding="utf-8") as log_file:
					log_file.write(f"Failed Author for {author_id}\n")
	
				try:
					soup=BeautifulSoup(web_driver.page_source,'lxml')
					file=open(f"Fails\\Article_{author_id}.html", "w", encoding="utf-8")
					file.write(str(soup))
					file.close()
				except KeyboardInterrupt:
					raise
				except:
					pass

Common.play_notification()


Author 7404685116 Articles   0% Fails 55|4481:   5%|▌         | 661/12171 [47:42:53<830:51:34, 259.87s/it]   


StaleElementReferenceException: Message: The element with the reference 2613d668-f601-44a4-b94b-9bb387bda9c8 is stale; either its node document is not the active document, or it is no longer connected to the DOM
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:183:5
StaleElementReferenceError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:495:5
element.getKnownElement@chrome://remote/content/marionette/element.sys.mjs:508:11
deserializeJSON@chrome://remote/content/marionette/json.sys.mjs:233:33
cloneObject/result<@chrome://remote/content/marionette/json.sys.mjs:50:52
cloneObject@chrome://remote/content/marionette/json.sys.mjs:50:25
deserializeJSON@chrome://remote/content/marionette/json.sys.mjs:244:16
cloneObject@chrome://remote/content/marionette/json.sys.mjs:56:24
deserializeJSON@chrome://remote/content/marionette/json.sys.mjs:244:16
json.deserialize@chrome://remote/content/marionette/json.sys.mjs:248:10
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:85:30


In [12]:
name = f"../gmls/main.gml"
nx.write_gml(author_graph, str(name))


In [29]:
web_driver.quit()