# Bibliotecas

In [0]:
from newsapi import NewsApiClient
from kafka import KafkaProducer
import pickle
import time
import datetime

# Variáveis

In [0]:
# Environment settings
apikey = "COLOCAR A APIKEY"  # placeholder text; must be replaced with a real API key before the client is built
create_folder = "/FileStore/Projeto/data"  # path handed to dbutils.fs.mkdirs further down

In [0]:
# Create the project folder.
# NOTE(review): `dbutils` is not imported in the visible cells; presumably it is
# injected by the Databricks notebook runtime -- confirm before running elsewhere.
dbutils.fs.mkdirs(create_folder)

True

# API

In [0]:
# Object that wraps the news API client
class APIClient:
    """Collect every available page of articles for a single query.

    The client is built once from ``key``; ``query`` is the search term sent
    on every request. After :meth:`get_articles` runs, ``articles`` holds the
    articles gathered by that run and ``all_data`` holds the last raw response
    (or the ``{"status": "end", ...}`` sentinel when the request failed).
    """

    def __init__(self, key, query) -> None:
        self.__client = NewsApiClient(api_key=key)
        self.__query = query
        self.__all_data = None
        self.__articles = []

    # Perform one request
    def search(self, page):
        """Request result page ``page`` and store the raw response.

        A failed request does not propagate: the stored response becomes
        ``{"status": "end", "error": <message>}`` so that pagination stops.
        Only ``Exception`` subclasses are converted; ``KeyboardInterrupt``
        and ``SystemExit`` are allowed to interrupt the program.
        """
        try:
            self.__all_data = self.__client.get_everything(
                q=self.__query, page=page
            )
        except Exception as exc:
            # Keep the reason so a failing run can be diagnosed via all_data.
            self.__all_data = {"status": "end", "error": str(exc)}

    # Gather the articles of every page
    def get_articles(self):
        """Fetch pages starting at 1 until a response is not ``"ok"`` or is empty.

        The stored article list is rebuilt from scratch on every call, so
        calling this method repeatedly never accumulates duplicates from
        earlier runs.
        """
        collected = []
        page = 1
        self.search(page=page)
        while self.__all_data.get("status") == "ok":
            batch = self.__all_data.get("articles") or []
            if not batch:
                # An "ok" page without articles means there is nothing left;
                # continuing would loop forever.
                break
            collected.extend(batch)
            page += 1
            self.search(page=page)
        self.__articles = collected

    @property
    def articles(self):
        """Articles collected by the most recent :meth:`get_articles` call."""
        return self.__articles

    @property
    def all_data(self):
        """Last raw response stored by :meth:`search` (``None`` before any request)."""
        return self.__all_data

# Critérios de Busca

In [0]:
# Object holding the search subject and its keywords
class Search_Criteria:
    """Search subject plus an editable list of keywords.

    Args:
        subject: Main search term, e.g. ``"genomics"``.
        keywords: Optional iterable of keywords, e.g.
            ``["DNA", "genetics", "treatment"]``. It is copied, so each
            instance owns its list and instances created without keywords
            never share one.
    """

    def __init__(self, subject, keywords=None):
        self.__subject = subject
        # A fresh list per instance; a literal [] default would be shared
        # by every instance created without keywords.
        self.__keywords = [] if keywords is None else list(keywords)

    def add_keyworkd(self, new_keyword):
        """Append ``new_keyword`` to the keyword list."""
        self.__keywords.append(new_keyword)

    def remove_keyworkd(self, old_keyword):
        """Remove the first occurrence of ``old_keyword``.

        Raises:
            ValueError: If ``old_keyword`` is not in the list.
        """
        self.__keywords.remove(old_keyword)

    # Correctly spelled aliases; the original misspelled names stay available
    # for existing callers.
    add_keyword = add_keyworkd
    remove_keyword = remove_keyworkd

    @property
    def subject(self):
        """Current search subject."""
        return self.__subject

    @subject.setter
    def subject(self, new_subject):
        self.__subject = new_subject

    @property
    def keywords(self):
        """The instance's keyword list."""
        return self.__keywords

# Producer

In [0]:
# Kafka producer and the topic the collected articles are published to.
# The broker address uses the plain "host:port" form (no space after the colon).
producer = KafkaProducer(bootstrap_servers=["localhost:9092"])
topic = "genomics-news"

# Rotina

In [0]:
# Search criteria for this run
search_object = Search_Criteria("genomics", ["DNA", "genetic", "treatment"])

# API client bound to the search subject
news_api = APIClient(apikey, search_object.subject)

# Minutes of the hour at which data is collected
specific_time = [0, 10, 20, 30, 40, 50]

while True:
    now = datetime.datetime.now()
    current_minute = now.minute

    if current_minute not in specific_time:
        # Not a collection minute: look at the clock again in 30 seconds
        time.sleep(30)
        continue

    # Pull every article for the subject
    news_api.get_articles()

    # Publish the article list as one pickled message.
    # NOTE(review): consumers must unpickle this payload; pickle is unsafe on
    # untrusted data, so the topic has to be trusted end to end.
    payload = pickle.dumps(news_api.articles)
    producer.send(topic, payload)

    print("Dados recolhidos da API")

    # Wait 60 seconds so the same minute is not processed twice
    time.sleep(60)