In [None]:
!pip install lightfm scikit-learn numpy scipy tqdm transformers ipython

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jedi>=0.16 (from ipython)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp311-cp311-linux_x86_64.whl size=831160 sha256=e14a164ca09ea23262da65d587145ef95501975680b830a489e04e5a4477c123
  Stored in directory: /root/.cache/pip/wheels/b9/0d/8a/0729d2e6e3ca2a898ba55201f905da7db3f838a33df5b3fcdd
Successfully built lightfm
Installing collected packages: jedi, lightfm
Successfully installed jedi

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np

# Função para criar features de usuários
def make_user_features(dataframe):
    user_features_dict = {}

    for _, row in dataframe.iterrows():
        avg_time = np.mean(row['timeOnPageHistory'])
        avg_scroll = np.mean(row['scrollPercentageHistory'])
        avg_clicks = np.mean(row['numberOfClicksHistory'])
        total_visits = sum(row['pageVisitsCountHistory'])

        user_features_dict[row['userId']] = list(set([
            f"userType:{row['userType']}",
            f"avg_time_on_page:{avg_time:.2f}",
            f"avg_scroll:{avg_scroll:.2f}",
            f"avg_clicks:{avg_clicks:.2f}",
            f"total_visits:{total_visits}"
        ]))

    return user_features_dict

In [None]:
# Função para criar features de notícias
def make_news_features(dataframe):
    news_features_dict = {}

    pages = dataframe['page'].values
    titles = zip(
        dataframe['title_attention_mask'].values,
        dataframe['title_token_type_ids'].values,
        dataframe['title_input_ids'].values
    )
    captions = zip(
        dataframe['caption_attention_mask'].values,
        dataframe['caption_token_type_ids'].values,
        dataframe['caption_input_ids'].values
    )
    bodies = zip(
        dataframe['body_attention_mask'].values,
        dataframe['body_token_type_ids'].values,
        dataframe['body_input_ids'].values
    )

    for news_id, title, caption, body in zip(pages, titles, captions, bodies):
        arrays = [np.asarray(arr) for arr in (*title, *caption, *body)]

        # Se algum vetor for vazio, pula a iteração
        if any(arr.size == 0 for arr in arrays):
            continue

        flattened_embeddings = np.hstack(arrays)

        # Evita erro de divisão por zero
        norm_factor = np.clip(np.linalg.norm(flattened_embeddings), 1e-10, None)

        norm_embeddings = flattened_embeddings / norm_factor

        news_features_dict[news_id] = np.array2string(
            norm_embeddings, formatter={'float_kind': lambda x: f"{x:.4f}"}
        ).strip('[]').split()

    return news_features_dict

In [None]:
# Função para criar interações
def make_interactions_matrix(dataframe):
    interactions = []

    time_mean, time_std = dataframe['timeOnPageHistory'].apply(np.mean).mean(), dataframe['timeOnPageHistory'].apply(np.std).mean()
    scroll_mean, scroll_std = dataframe['scrollPercentageHistory'].apply(np.mean).mean(), dataframe['scrollPercentageHistory'].apply(np.std).mean()
    clicks_mean, clicks_std = dataframe['numberOfClicksHistory'].apply(np.mean).mean(), dataframe['numberOfClicksHistory'].apply(np.std).mean()
    visits_mean, visits_std = dataframe['pageVisitsCountHistory'].apply(np.mean).mean(), dataframe['pageVisitsCountHistory'].apply(np.std).mean()

    def normalize(value, mean, std):
        return (value - mean) / (std + 1e-10)

    for _, row in dataframe.iterrows():
        for i, news in enumerate(row['history']):
            interaction_strength = (
                normalize(row['numberOfClicksHistory'][i], clicks_mean, clicks_std) * 0.4 +
                normalize(row['timeOnPageHistory'][i], time_mean, time_std)  * 0.3 +
                normalize(row['scrollPercentageHistory'][i], scroll_mean, scroll_std)  * 0.2 +
                normalize(row['pageVisitsCountHistory'][i], visits_mean, visits_std)  * 0.1
            )
            interactions.append((row['userId'], news, interaction_strength))

    return interactions

In [None]:
def partition_dataframe(user_parts, news_parts, num_parts):

    num_parts = 19

    user_splits = np.array_split(user_parts, num_parts)

    user_partitions = []
    news_partitions = []

    for i, user_subset in enumerate(user_splits):
        news_ids = set(news_id for history in user_subset["history"] for news_id in history)

        news_subset = news_parts[news_parts["page"].isin(news_ids)]

        user_partitions.append(user_subset)
        news_partitions.append(news_subset)

        print(f"Parte {i+1}: {len(user_subset)} usuários, {len(news_subset)} notícias")

    return user_partitions, news_partitions

In [None]:
import pickle

with open("/content/drive/MyDrive/user_part_0.pkl", "rb") as user:
    user_partitions = pickle.load(user)

with open("/content/drive/MyDrive/news_part_0.pkl", "rb") as news:
    news_partitions = pickle.load(news)

## Teste parcial da seleção de dados

In [None]:
user_part_0 = user_partitions
news_part_0 = news_partitions

In [None]:
def get_user_history(userId: str):
    """
    Retorna o histórico de interações do usuário a partir do dataset user_part_0.

    Args:
        user_id (int): ID do usuário para recuperar o histórico.

    Returns:
        dict: Dicionário contendo as informações do usuário ou None se não encontrado.
    """
    user_data = user_part_0[['history']][user_part_0["userId"] == userId]

    if user_data.empty:
        return None  # Retorna None se o usuário não for encontrado

    user_data = user_data.values[0][0]
    user_data = [i.strip() for i in user_data]

    return user_data  # Retorna como lista de dicionários

hist_test = get_user_history('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377')

print(hist_test)
print(type(hist_test))

['50028008-aa11-4519-9d75-452c84dd27fb', 'd031af1b-f939-47c1-a589-e6d691b66d91', 'bf257382-74fb-4392-ad6a-143240e39f81', '66a9efac-fd43-4fd1-9824-c404b08efa5d', '7a349b09-badc-40a9-a194-83d959aeb50c', 'd0b5ac09-591f-4f39-a325-25c242fe84b2', '35b75714-6aed-44bf-9a87-1d8054d5be98', 'd22af4c4-3aac-4d42-9eb8-50ad449a8972', '7456c88d-6473-46ed-83a6-dfc406994162', '458bf0ec-efb4-4bfd-9446-c80295e6aa87', '89fa73f0-4341-4de4-bb2a-e429ef96bd43', '50f85bf5-c153-4611-bf35-2f781ae4e234']
<class 'list'>


In [None]:
user_part_0

Unnamed: 0,userId,userType,historySize,history,numberOfClicksHistory,timeOnPageHistory,scrollPercentageHistory,pageVisitsCountHistory
0,e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d...,Logged,12,"[50028008-aa11-4519-9d75-452c84dd27fb, d031af...","[4.0, 41.0, 6.0, 0.0, 0.0, 1.0, 1.0, 1.0, 21.0...","[10081.0, 70000.0, 12216.0, 90000.0, 161291.0,...","[16.34, 79.35, 13.36, 37.95, 73.83, 68.98, 51....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
1,3a3b7f25a30a5a17a530685545e3a0be38cee0c6904c42...,Logged,93,"[561850c2-9ade-4985-bf36-402abf02153d, c4ab52...","[6.0, 9.0, 19.0, 20.0, 14.0, 9.0, 13.0, 21.0, ...","[28139.0, 70000.0, 230000.0, 120000.0, 78858.0...","[21.82, 51.35, 93.4, 81.35, 78.47, 37.43, 27.3...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
2,ea6728ebc30782516a7593a1143c47dc59428e1649c048...,Logged,8,"[c7505bef-22eb-49e3-bbde-43281d36981e, 9a7695...","[0.0, 0.0, 3.0, 2.0, 223.0, 29.0, 8.0, 13.0]","[10000.0, 70000.0, 10928.0, 10112.0, 1481132.0...","[8.21, 81.2, 24.18, 16.33, 81.15, 44.95, 16.09...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0]"
3,f909c50558c01ab790636e1b1918e6a5965bdcc271a860...,Non-Logged,19,"[598ed114-fd5a-4d82-90d8-f1e893cb0892, 92165a...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[30007.0, 43153.0, 41626.0, 39675.0, 12100.0, ...","[12.5, 12.5, 12.5, 12.5, 12.5, 12.5, 12.5, 12....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,9725352f759a73aa977cb5eff2596a5f683fcd0c753200...,Logged,34,"[80753b6f-d65c-4962-9f2e-ef582cce0de0, 975936...","[6.0, 16.0, 0.0, 0.0, 0.0, 6.0, 1.0, 1.0, 0.0,...","[100000.0, 110000.0, 50000.0, 6455.0, 68291.0,...","[49.25, 66.86, 71.52, 17.58, 59.33, 37.27, 14....","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
...,...,...,...,...,...,...,...,...
30413,b48ab1d66661e76e5852595ec20786915d3c0621a47cc5...,Non-Logged,1,[ec0716fb-39e4-450c-a88a-12974aa3fde6],[31.0],[50000.0],[37.44],[1.0]
30414,f9c272b5c0e5629fb62ff877afb62a9238e85021951586...,Non-Logged,1,[2cb7ed29-19fa-43b9-bdcf-1f04f58a8b38],[49.0],[60000.0],[15.02],[1.0]
30415,609cffcfef9c180c6336997d221f57f1de37c2818a15b2...,Logged,1,[c9b993ac-ceba-492e-b4f8-79e41444221b],[46.0],[100000.0],[20.98],[1.0]
30416,add45903bd86d27cfe1d1535237adf0f1b16bbea10649e...,Non-Logged,1,[73c980ce-ed0e-43d6-8250-a173f920c917],[0.0],[10688.0],[77.28],[1.0]


In [None]:
import pandas as pd
from collections import Counter
import ast

def count_news_views(user_part_0: pd.DataFrame) -> dict:
    """
    Calcula o número de visualizações de cada notícia com base na frequência
    de ocorrência no histórico de cada usuário.

    Parâmetros:
        user_part_0 (pd.DataFrame): DataFrame com os dados dos usuários,
                                    contendo a coluna "history" que armazena
                                    o histórico de visualizações (listas ou strings).

    Retorna:
        dict: Dicionário onde as chaves são os IDs das notícias e os valores são
              o número de visualizações.
    """
    news_counter = Counter()

    # Itera sobre cada linha do dataset
    for idx, row in user_part_0.iterrows():
        history = row.get("history")
        if history:
            # Se o histórico estiver em formato string (ex: "['news1', 'news2']"),
            # converte-o para lista
            if isinstance(history, str):
                try:
                    history = ast.literal_eval(history)
                except Exception as e:
                    # Se ocorrer erro na conversão, tenta dividir por vírgula
                    history = history.split(",")
                    # Remove espaços e possíveis caracteres indesejados
                    history = [item.strip(" []'\"") for item in history if item.strip()]
            # Atualiza o contador com as ocorrências
            news_counter.update(history)

    return dict(news_counter)

# Exemplo de uso:
# Supondo que você tenha o DataFrame user_part_0 carregado:
# user_part_0 = pd.read_csv("user_part_0.csv")
# visualizacoes = count_news_views(user_part_0)
# print(visualizacoes)

count_news_views(user_part_0)

{'50028008-aa11-4519-9d75-452c84dd27fb': 33,
 ' d031af1b-f939-47c1-a589-e6d691b66d91': 130,
 ' bf257382-74fb-4392-ad6a-143240e39f81': 756,
 ' 66a9efac-fd43-4fd1-9824-c404b08efa5d': 458,
 ' 7a349b09-badc-40a9-a194-83d959aeb50c': 70,
 ' d0b5ac09-591f-4f39-a325-25c242fe84b2': 112,
 ' 35b75714-6aed-44bf-9a87-1d8054d5be98': 96,
 ' d22af4c4-3aac-4d42-9eb8-50ad449a8972': 85,
 ' 7456c88d-6473-46ed-83a6-dfc406994162': 247,
 ' 458bf0ec-efb4-4bfd-9446-c80295e6aa87': 655,
 ' 89fa73f0-4341-4de4-bb2a-e429ef96bd43': 753,
 ' 50f85bf5-c153-4611-bf35-2f781ae4e234': 356,
 '561850c2-9ade-4985-bf36-402abf02153d': 46,
 ' c4ab5277-4c11-444e-80e8-86ef7344d8a9': 92,
 ' b7e17c35-a165-4afc-9057-eab54a9036b0': 47,
 ' d7f17a3c-3885-4b47-95a8-c4204a3698f5': 392,
 ' 11cdab41-edc5-4af3-ba9a-bf53e5681263': 207,
 ' b7f30490-0b67-4ccc-b032-1eeeb179b464': 532,
 ' ce0811a0-778c-4740-949b-128f5fc33b31': 70,
 ' 2008f497-c05f-49e0-88dd-86aa1e395f15': 404,
 ' a16b85bf-08b2-4db5-9cd4-825d5226da28': 4,
 ' 23172c53-336a-4235-a08

In [None]:
news_label_0 = news_part_0[['page', 'title', 'body', 'caption']]
news_label_0

Unnamed: 0,page,title,body,caption
3,55ab912a-2bac-46d9-9fcf-8e9be376f1b3,Ato pela democracia recorda mortos na ditadur...,Carta em defesa da democracia foi lida nesta q...,Evento reuniu milhares dentro e fora da Faculd...
19,654e20df-fb61-4b7c-aa65-0be8c85ff8d8,"Brasil melhora acesso à escola, mas ainda prec...",Relatório da OCDE aponta avanços na educação b...,Análise sobre políticas adotadas nas últimas d...
28,253339a1-92b6-44d1-8fa2-59236c5251b1,Leia 8 exemplos de redações nota mil do Enem 2021,Exemplo de redação nota mil do Enem 2021\nRepr...,Inep divulgou as imagens digitalizadas das dis...
33,dfaad9a1-e0e8-475a-8cb8-7551441b8357,Leia exemplo de redação nota mil do Enem 2021 ...,Redação nota mil da candidata Evely Aparecida ...,"A potiguar Evely Aparecida Silva Lima, de Lago..."
38,66e39ec6-62c6-428e-b8fe-a097204ff00d,"Nobuhiro Watsuki, criador do mangá 'Samurai X'...","Personagens do mangá 'Rurouni Kenshin', conhec...",Autoridades japonesas encontraram DVDs com víd...
...,...,...,...,...
55393,2ddb14a0-3349-4f52-b69c-ff047bc0ed9d,Garçom registra boletim de ocorrência contra p...,Garçom registra boletim de ocorrência contra p...,"De acordo com o relato, o religioso se negou a..."
55464,abd928f7-b32e-433e-adbb-ee1487f81430,"Sessão de 1 minuto, acordo para manter texto e...","Lira abre nova sessão, mas com permissão de pr...",PEC prevê benefícios sociais a menos de três m...
55512,9b344221-0e16-48f4-a20b-85a654296004,Eduardo Suplicy é o vereador mais votado do pa...,Vereadores campeões de votos\nG1\nO vereador d...,Seis vereadores de São Paulo e quatro do Rio d...
55513,589c5ae9-7249-4e87-9fea-9e2ef3538caf,Ex-presidente da OAS reafirma que reforma do s...,Três ex-executivos da OAS prestam depoimento n...,"Léo Pinheiro, ex-presidente da empreiteira, e ..."


In [None]:
news_views = count_news_views(user_part_0)

news_count = pd.DataFrame(list(news_views.items()), columns=['page', 'count']).drop_duplicates()

news_label_0 = news_label_0.merge(news_count, on='page', how='left')

news_label_0.sort_values(by='count', inplace=True, ascending=False)
news_label_0

Unnamed: 0,page,title,body,caption,count
5252,83ebda93-e1cf-422d-ae6b-18298c55ed8e,"Salman Rushdie, autor de 'Versos satânicos' e ...",Salman Rushdie é atacado por homem nos EUA\nRe...,Escritor era apresentado em evento no estado d...,76
2097,83ebda93-e1cf-422d-ae6b-18298c55ed8e,"Salman Rushdie, autor de 'Versos satânicos' e ...",Salman Rushdie é atacado por homem nos EUA\nRe...,Escritor era apresentado em evento no estado d...,76
1459,3082a59f-4978-4d36-acf0-7d08804fb925,"Morre Paulo Roberto Costa, 1º delator da Lava ...","Paulo Roberto Costa, ex-diretor da Petrobras, ...",Engenheiro foi vítima de um câncer. Condenado ...,56
4614,3082a59f-4978-4d36-acf0-7d08804fb925,"Morre Paulo Roberto Costa, 1º delator da Lava ...","Paulo Roberto Costa, ex-diretor da Petrobras, ...",Engenheiro foi vítima de um câncer. Condenado ...,56
4840,b7b90e18-7613-4ca0-a8fc-fd69addfcd85,Jovem é encontrado morto após ser filmado send...,Vídeo mostra momento em que jovem é colocado e...,"Segundo a Polícia Civil, policiais militares a...",52
...,...,...,...,...,...
2964,b0447e49-52a4-4716-95eb-17b092844528,Banco de sangue de Itabuna tem estoque de O- z...,Bancos de sangue do sul da Bahia registram est...,"Em Ilhéus, cidade vizinha, a situação também é...",1
2963,fc258008-443a-499a-9ff1-5d00527f7d58,Homem morre e outros dois são baleados após cr...,Sirene de viatura da PM - 28/02/2021\nG1\nUm h...,Vítima fatal fugiu para dentro do estabelecime...,1
2961,4891a46e-5a33-4101-af69-646ce1845f8d,"'Era o orgulho da família', diz primo de enfer...",Ônibus tomba e deixa mortos e mais de 30 ferid...,Acidente aconteceu no quilômetro 301 da rodovi...,1
2960,002feaa9-4652-4c96-b86d-9582dd26c163,PM do Paraná vai investigar conduta do policia...,PM do Paraná vai investigar conduta do policia...,"A comerciante, de 28 anos, ainda se recupera d...",1


In [None]:
with open("news_label_0.pkl", "wb") as labels:
    pickle.dump(news_label_0, labels)

In [None]:
users = user_part_0['userId']
news = news_part_0['page']

user_features_dict = make_user_features(user_part_0)

news_features_dict = make_news_features(news_part_0)

interactions = make_interactions_matrix(user_part_0)

In [None]:
news_features_dict

{'55ab912a-2bac-46d9-9fcf-8e9be376f1b3': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '654e20df-fb61-4b7c-aa65-0be8c85ff8d8': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '253339a1-92b6-44d1-8fa2-59236c5251b1': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 'dfaad9a1-e0e8-475a-8cb8-7551441b8357': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '66e39ec6-62c6-428e-b8fe-a097204ff00d': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '16834dee-0e39-4152-b3e5-4d2ac9f586c7': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 'd063dcc0-a59f-42e3-980a-2200a064a52d': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '23517902-9df6-4d4a-ada6-a47db06529ec': ['0.0000',
  '0.0000',
  '0.0000',
  '...',
  '0.0000',
  '0.0000',
  '0.0000'],
 '692337ad-c1c0-42ad-9a3

In [None]:
from lightfm.data import Dataset

# Step 1: Extract unique users and items
users_from_interactions = {user for user, _, _ in interactions}
items_from_interactions = {news for _, news, _ in interactions}

all_users = set(users).union(users_from_interactions)
all_news = set(news).union(items_from_interactions)

# Criar dataset do LightFM
dataset = Dataset()

dataset.fit(
    users = all_users,
    items = all_news,
    user_features = list(set(f for sublist in user_features_dict.values() for f in sublist)),
    item_features = list(set(f for sublist in news_features_dict.values() for f in sublist))
)

In [None]:
print("Total de notícias cadastradas:", len(dataset.mapping()[2]))  # Verifica quantos IDs foram cadastrados
print("IDs de notícias cadastradas (10 primeiros):", list(dataset.mapping()[2]))


Total de notícias cadastradas: 64557
IDs de notícias cadastradas (10 primeiros): ['1d051eb0-f9ad-4bd8-bba5-5227de75a07f', ' c4322483-7a0f-4561-8a44-89aaeb14ff77', ' a674a807-757a-4408-98ad-15dc89ca5fa5', ' 6e76baf9-dd1d-4c83-8301-6d5b6022b936', ' f69599db-b095-436c-8d36-95648d3ba81d', '5c680603-ef47-4c11-9f58-ebb9e2b09f50', ' b5171fba-d3a3-498d-aeb9-da785acede1b', ' 61bb45a6-d459-4046-9d7d-1cbf8763e9a2', ' 3825d45a-98c6-4db7-bf9f-cbe9c9002b89', ' 67f5498a-8572-4d8d-80c9-fe326dfc1c41', ' cbe68b2d-7ade-425a-b38b-77f12c54d617', 'd83725d4-8340-45b5-8962-592cc5af089e', ' 6814c130-2673-4504-b043-88ac9227fde0', ' c3fdca0b-5e7e-4d08-b4ee-d77fb524a9aa', ' 046fe542-cc86-490c-929b-3f1a040a17a2', ' c698c075-d04d-4422-be40-fd73937f390e', ' b5fadce7-b816-48ee-8261-87a36b43aacd', ' a793909a-04bc-49f7-8272-cc3c2a03bbb5', ' 6ca23215-b849-47b3-931c-7d81b440ca35', ' fb28b59f-01cd-4563-bd18-bb2f1cdc966d', ' ec6dbf49-cd70-467d-994b-c7411ef64bcc', ' e1b73ccd-5fc0-4169-a316-873d7be4c48c', ' e6ec7b18-5901-453

In [None]:
# Construir interações e features
(interactions_matrix, weights_matrix) = dataset.build_interactions(
    ((user, news, weight) for user, news, weight in interactions ) #if user in users and news in news)
)

user_features_matrix = dataset.build_user_features(
    ((user, feats) for user, feats in user_features_dict.items())
)

news_features_matrix = dataset.build_item_features(
    ((news, feats) for news, feats in news_features_dict.items())
)

In [None]:
print(f"Total interactions before filtering: {len(interactions)}")
print(f"Unique users: {len(users)}, Unique items: {len(news)}")  # Check if users/items exist


Total interactions before filtering: 446469
Unique users: 30418, Unique items: 6310


In [None]:
test_interactions = make_interactions_matrix(user_part_0)
print(len(interactions))  # Should be > 0
print(interactions[:5])  # Show first 5 interactions for verification


446469
[('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377', '50028008-aa11-4519-9d75-452c84dd27fb', -1.525610752438483), ('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377', ' d031af1b-f939-47c1-a589-e6d691b66d91', 2.532179534696281), ('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377', ' bf257382-74fb-4392-ad6a-143240e39f81', -1.378899157794668), ('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377', ' 66a9efac-fd43-4fd1-9824-c404b08efa5d', -0.8783329927344541), ('e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377', ' 7a349b09-badc-40a9-a194-83d959aeb50c', 0.13898599790660623)]


In [None]:
from lightfm import LightFM

# Criar modelo LightFM
model = LightFM(loss='warp')
model.fit(
    interactions_matrix,
    user_features = user_features_matrix,
    item_features = news_features_matrix,
    epochs=10,
    num_threads=4
)

<lightfm.lightfm.LightFM at 0x7d65909fe010>

In [None]:
model.get_params()

{'loss': 'warp',
 'learning_schedule': 'adagrad',
 'no_components': 10,
 'learning_rate': 0.05,
 'k': 5,
 'n': 10,
 'rho': 0.95,
 'epsilon': 1e-06,
 'max_sampled': 10,
 'item_alpha': 0.0,
 'user_alpha': 0.0,
 'random_state': RandomState(MT19937) at 0x7D6582BDA040}

In [None]:
model.predict([0], [0])

array([-0.7193088], dtype=float32)

In [None]:
with open("lightfm_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
model.get_user_representations()

(array([-0.43305403, -0.23023662, -0.2661459 , ..., -1.0751314 ,
        -0.8574045 , -0.43504217], dtype=float32),
 array([[ 0.04979854,  0.11211776,  0.04123031, ..., -0.02924087,
          0.00466649, -0.03835207],
        [-0.0087991 ,  0.01707268,  0.0302439 , ..., -0.01997683,
          0.0842242 ,  0.02282044],
        [ 0.14506827,  0.09932407,  0.15683885, ..., -0.15532397,
          0.09259848, -0.04443002],
        ...,
        [-0.03155738, -0.13009678, -0.08159868, ...,  0.05623879,
         -0.02251203,  0.03540983],
        [-0.14032653,  0.44594735,  0.37379423, ..., -0.42703497,
         -0.47716442,  0.06011612],
        [-0.00828966, -0.12124013, -0.00926756, ...,  0.05776473,
         -0.04451102, -0.0491684 ]], dtype=float32))

### Avaliação do modelo

In [None]:
from lightfm.cross_validation import random_train_test_split

# Split data into 80% train and 20% test
train, test = random_train_test_split(interactions_matrix, test_percentage=0.2)


In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k

# Compute precision and recall at K=5
k = 5
precision = precision_at_k(model=model, test_interactions=test, train_interactions=train, k=k, item_features=news_features_matrix, user_features=user_features_matrix).mean()
recall = recall_at_k(model=model, test_interactions=test, train_interactions=train, k=k, item_features=news_features_matrix, user_features=user_features_matrix).mean()

print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")

Precision@5: 0.0206
Recall@5: 0.0144


In [None]:
dataset.mapping()

### Avaliando as recomendações

In [None]:
import numpy as np

user_test = user_part_0['userId'].iloc[0]

def recommend_news(user_id, model, dataset, news_list, top_n=5):

    news_ids = np.array([dataset.mapping()[2][news] for news in news_list])
    user_id_mapped = dataset.mapping()[0][user_id]

    scores = model.predict(user_id_mapped, news_ids)
    top_news_indices = np.argsort(-scores)[:top_n]

    return [news_list[i] for i in top_news_indices]

user_id_example = list(all_users)[0]
recommended_news = recommend_news(user_test, model, dataset, list(all_news), top_n=5)
print(f"Recommended news for user {user_test}: {recommended_news}")


Recommended news for user e5f68d5e7cdbe56d6984589b4baa6ebfc5e8a8a918e57d5092adc513f516b377: ['83ebda93-e1cf-422d-ae6b-18298c55ed8e', '3082a59f-4978-4d36-acf0-7d08804fb925', 'b7b90e18-7613-4ca0-a8fc-fd69addfcd85', '55ab912a-2bac-46d9-9fcf-8e9be376f1b3', '2b4f0f8f-ab7b-45e7-8a8a-353e0aa41249']


In [None]:
# Check user's history
user_history = user_part_0[user_part_0['userId'] == user_id_example]['history'].values[0]

print(f"User's past interactions: {user_history}")
print(f"Model recommended: {recommended_news}")

User's past interactions: ['a1c2a84e-0bc1-41c4-876c-58b7a25fa0a5', ' 33c4dffb-ddb1-466d-82ee-e30ee9762c0d']
Model recommended: ['83ebda93-e1cf-422d-ae6b-18298c55ed8e', '3082a59f-4978-4d36-acf0-7d08804fb925', 'b7b90e18-7613-4ca0-a8fc-fd69addfcd85', '55ab912a-2bac-46d9-9fcf-8e9be376f1b3', '2b4f0f8f-ab7b-45e7-8a8a-353e0aa41249']


# Planejamento

Estamos desenvolvendo um app no streamlit para hospedar um simples algoritmo de recomendação de notícias. Elas tem título, legenda e corpo. O modelo foi treinado no LightFM usando um conjunto de notícias da globo. Para suportar o modelo e os dados, faremos uma API no FastAPI para servir como o back-end. Como forma de monitoramento do modelo, do site e dos dados, nós usaremos o Mlflow para registrar e cadastrar os resultados de uso dos modelos e das execuções, como de sustentar a implementação do MLOps dentro do projeto. E no final, nós iremos utilizar o Docker para realizar o conteinerizamento de toda a aplicação.

O objetivo é gerar um ranking para a coluna history. Quando um usuário loga, o sistema deve retornar os próximos acesso á ele. Deve haver consideração para a diferença na recomendação dos usuários que fazem login, ou que precisam passar pelo cold-start. Também deve haver um critério de relevância para as notícias no momento em que são recomendadas, considerando o histórico dos usuário e também os dados que são associadas á ela.

In [None]:
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import auc_score
from sklearn.model_selection import ParameterSampler

# Definir espaço de busca para os hiperparâmetros
param_grid = {
    'loss': ['warp', 'bpr', 'logistic'],
    'learning_rate': np.logspace(-3, -1, 5),  # De 0.001 a 0.1
    'no_components': [10, 20, 50, 100, 200],
    'epochs': [10, 20, 30, 50]
}

# Gerar amostras aleatórias de hiperparâmetros
n_iter = 10  # Número de combinações a testar
param_list = list(ParameterSampler(param_grid, n_iter=n_iter, random_state=42))

best_auc = 0
best_params = None

# Loop sobre diferentes combinações de hiperparâmetros
for params in param_list:
    model = LightFM(loss=params['loss'], learning_rate=params['learning_rate'], no_components=params['no_components'])

    model.fit(interactions_matrix, user_features=user_features_matrix, item_features=news_features_matrix,
              epochs=params['epochs'], num_threads=4)

    # Avaliação do modelo usando AUC Score nos dados de teste
    auc = auc_score(model, interactions_matrix, user_features=user_features_matrix, item_features=news_features_matrix).mean()

    print(f"Params: {params} - AUC: {auc:.4f}")

    # Salvar a melhor configuração
    if auc > best_auc:
        best_auc = auc
        best_params = params

print("\nMelhores Hiperparâmetros:", best_params)
print("Melhor AUC:", best_auc)


Params: {'no_components': 100, 'loss': 'bpr', 'learning_rate': 0.03162277660168379, 'epochs': 30} - AUC: 0.8036
Params: {'no_components': 20, 'loss': 'logistic', 'learning_rate': 0.01, 'epochs': 50} - AUC: 0.6780
Params: {'no_components': 50, 'loss': 'warp', 'learning_rate': 0.001, 'epochs': 30} - AUC: 0.6189
Params: {'no_components': 200, 'loss': 'bpr', 'learning_rate': 0.001, 'epochs': 10} - AUC: 0.4849
Params: {'no_components': 100, 'loss': 'bpr', 'learning_rate': 0.001, 'epochs': 50} - AUC: 0.7640
Params: {'no_components': 20, 'loss': 'warp', 'learning_rate': 0.001, 'epochs': 50} - AUC: 0.6220
Params: {'no_components': 20, 'loss': 'warp', 'learning_rate': 0.03162277660168379, 'epochs': 30} - AUC: 0.9324
Params: {'no_components': 200, 'loss': 'warp', 'learning_rate': 0.01, 'epochs': 20} - AUC: 0.9030
Params: {'no_components': 10, 'loss': 'bpr', 'learning_rate': 0.001, 'epochs': 10} - AUC: 0.4886
Params: {'no_components': 10, 'loss': 'logistic', 'learning_rate': 0.0031622776601683794

In [None]:
model = LightFM(loss='warp', learning_rate=0.03162277660168379, no_components=20)

model.fit(interactions_matrix, user_features=user_features_matrix, item_features=news_features_matrix,
            epochs=30, num_threads=4)

<lightfm.lightfm.LightFM at 0x7d657ac17850>

In [None]:
from lightfm.cross_validation import random_train_test_split

# Split data into 80% train and 20% test
train, test = random_train_test_split(interactions_matrix, test_percentage=0.2)


In [None]:
from lightfm.evaluation import precision_at_k, recall_at_k

# Compute precision and recall at K=5
k = 10
precision = precision_at_k(model=model, test_interactions=test, train_interactions=train, k=k, item_features=news_features_matrix, user_features=user_features_matrix).mean()
recall = recall_at_k(model=model, test_interactions=test, train_interactions=train, k=k, item_features=news_features_matrix, user_features=user_features_matrix).mean()

print(f"Precision@{k}: {precision:.4f}")
print(f"Recall@{k}: {recall:.4f}")

Precision@10: 0.0182
Recall@10: 0.0268


In [None]:
with open("lightfm_model_tuned.pkl", "wb") as model_file:
    pickle.dump(model, model_file)