In [0]:
from pyspark.sql.functions import col, when, year, month, dayofmonth, sum

# Critérios de Busca

In [0]:
class Search_Criteria:
    """Search subject together with an editable list of keywords.

    Args:
        subject: Topic being searched, e.g. "genomics".
        keywords: Iterable of keyword strings, e.g.
            ["DNA", "genetics", "treatment"]. The items are copied into a
            new list, so later add/remove calls never touch the caller's
            own list.
    """

    def __init__(self, subject, keywords):
        self.__subject = subject
        # Defensive copy: storing the argument itself would let
        # add_keyword/remove_keyword mutate the list the caller passed in.
        self.__keywords = list(keywords)

    def add_keyword(self, new_keyword):
        """Append ``new_keyword`` to the end of the keyword list."""
        self.__keywords.append(new_keyword)

    def remove_keyword(self, old_keyword):
        """Remove the first occurrence of ``old_keyword``.

        Raises:
            ValueError: If ``old_keyword`` is not in the keyword list.
        """
        self.__keywords.remove(old_keyword)

    # The original method names were misspelled; keep them as aliases so
    # existing callers continue to work.
    add_keyworkd = add_keyword
    remove_keyworkd = remove_keyword

    @property
    def subject(self):
        """Current search subject."""
        return self.__subject

    @subject.setter
    def subject(self, new_subject):
        self.__subject = new_subject

    @property
    def keywords(self):
        """The internal keyword list (the live list object, not a copy)."""
        return self.__keywords

# Rotina

In [0]:
# Build the search-criteria object used by the keyword statistics below
criteria = Search_Criteria(
    subject="genomics",
    keywords=["DNA", "genetics", "treatment"],
)

In [0]:

# Location of the Delta table
delta_table_path = "/FileStore/Projeto/delta"

# Load the Delta table into a DataFrame
df = spark.read.load(delta_table_path, format="delta")

# Visualização

In [0]:
# Number of rows per publication day (year / month / day of "data_publicacao"),
# ordered from the highest count to the lowest
df_time_stats = (
    df.groupBy(
        year("data_publicacao").alias("year"),
        month("data_publicacao").alias("month"),
        dayofmonth("data_publicacao").alias("day"),
    )
    .count()
    .orderBy("count", ascending=False)
)

# Print the result
df_time_stats.show()

+----+-----+---+-----+
|year|month|day|count|
+----+-----+---+-----+
|2024|   10| 10|   39|
|2024|    9| 25|   29|
|2024|    9| 19|   29|
|2024|    9| 26|   27|
|2024|    9| 23|   25|
|2024|    9| 18|   25|
|2024|   10| 16|   25|
|2024|   10| 11|   23|
|2024|    9| 20|   22|
|2024|    9| 17|   22|
|2024|   10|  1|   21|
|2024|   10| 17|   20|
|2024|   10| 14|   20|
|2024|   10| 18|   19|
|2024|    9| 30|   18|
|2024|   10| 15|   18|
|2024|   10|  4|   18|
|2024|    9| 16|   17|
|2024|    9| 28|   17|
|2024|   10|  2|   16|
+----+-----+---+-----+
only showing top 20 rows



In [0]:
# Number of rows per (source, author) pair, ordered from the highest count
# to the lowest
df_source_author_stats = (
    df.groupBy(["fonte", "autor"])
    .count()
    .orderBy("count", ascending=False)
)

# Print the result
df_source_author_stats.show()

+--------------------+--------------------+-----+
|               fonte|               autor|count|
+--------------------+--------------------+-----+
|      ETF Daily News|     MarketBeat News|  189|
|       GlobeNewswire|Research and Markets|   39|
|       Investing.com|       Investing.com|   24|
|       Science Daily|                NULL|   18|
|          Biztoc.com|      marketbeat.com|   17|
|Investor's Busine...|Investor's Busine...|    9|
|      Financial Post|       GlobeNewswire|    5|
|National Institut...|                NULL|    5|
|             Cdc.gov|                NULL|    5|
|      Financial Post|       Business Wire|    4|
|       GlobeNewswire|Transparency Mark...|    4|
|           [Removed]|                NULL|    4|
|       GlobeNewswire|SkyQuest Technolo...|    3|
|        Stanford.edu|                NULL|    3|
|       GlobeNewswire|DelveInsight Busi...|    3|
|       GlobeNewswire|Dimension Market ...|    3|
|        Histalk2.com|         Mr. HIStalk|    3|


In [0]:
# Add one indicator column per keyword, named after the keyword itself:
# 1 when "conteudo" contains the keyword, 0 otherwise
for keyword in criteria.keywords:
    df = df.withColumn(
        keyword,
        when(col("conteudo").contains(keyword), 1).otherwise(0),
    )

# Group by publication year and month and total each keyword indicator.
# `sum` here is pyspark.sql.functions.sum (imported at the top of the
# notebook), not the Python builtin.
df_keyword_stats = df.groupBy(
    year("data_publicacao").alias("year"),
    month("data_publicacao").alias("month"),
).agg(
    *(sum(col(kw)).alias(f"sum_{kw}") for kw in criteria.keywords)
)

# Print the result
df_keyword_stats.show()

+----+-----+-------+------------+-------------+
|year|month|sum_DNA|sum_genetics|sum_treatment|
+----+-----+-------+------------+-------------+
|2024|    9|     12|           2|            1|
|2024|   10|     11|           3|            5|
+----+-----+-------+------------+-------------+

