In [13]:
import torch
import torch.nn as nn
import numpy as np
import scipy.sparse as sp
from scipy.io import mmread
import pandas as pd

# ------------------------- 1. Normalisation de la matrice d'adjacence -------------------------
def normalize_adj(adj):
    """Return the symmetrically normalised matrix ``D^-1/2 @ adj @ D^-1/2``.

    ``D`` is the diagonal matrix built from the row sums of ``adj``. Rows
    whose sum is zero receive a scaling factor of 0.

    Args:
        adj: Square matrix supporting ``.sum(1)`` and the ``@`` operator with
            SciPy sparse matrices (the script passes a SciPy sparse matrix).

    Returns:
        The product ``diag(d) @ adj @ diag(d)`` where ``d[i]`` is
        ``rowsum[i] ** -0.5`` for non-zero row sums and 0 otherwise.
    """
    rowsum = np.asarray(adj.sum(1), dtype=np.float64).flatten()
    # Fill a zero-initialised array only where the row sum is non-zero.
    # Calling np.power with `where=` but without `out=` leaves the masked-out
    # entries uninitialised, which an `isinf` check cannot reliably clean up.
    d_inv_sqrt = np.zeros_like(rowsum)
    nonzero = rowsum != 0
    d_inv_sqrt[nonzero] = np.power(rowsum[nonzero], -0.5)
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt @ adj @ d_mat_inv_sqrt

# ------------------------- 2. Définition du modèle LightGCN -------------------------
class LightGCN(nn.Module):
    """Embedding table propagated through a fixed sparse graph operator.

    The only trainable parameters are the node embeddings; ``forward``
    repeatedly multiplies them by the supplied matrix and averages the
    results of all propagation depths.
    """

    def __init__(self, num_nodes, embedding_dim=64, num_layers=3):
        """Create the embedding table.

        Args:
            num_nodes: Number of rows of the embedding table.
            embedding_dim: Size of each embedding vector.
            num_layers: Number of propagation steps applied in ``forward``.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_nodes, embedding_dim)
        self.num_layers = num_layers
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, norm_adj):
        """Return the mean of the embeddings over 0..num_layers propagations.

        Args:
            norm_adj: Sparse tensor accepted by ``torch.sparse.mm`` whose
                product with the embedding matrix is defined.

        Returns:
            Tensor with the same shape as the embedding weight matrix.
        """
        x = self.embedding.weight
        all_embeddings = [x]
        for _ in range(self.num_layers):
            x = torch.sparse.mm(norm_adj, x)
            all_embeddings.append(x)
        # Average the raw embeddings and every propagated layer.
        return torch.stack(all_embeddings, dim=0).mean(dim=0)

    def recommend(self, embeddings, node_ids, top_k=20):
        """Return, for each queried node, the ``top_k`` most cosine-similar nodes.

        Args:
            embeddings: 2-D tensor with one row per node.
            node_ids: 1-D integer tensor of row indices to query.
            top_k: Number of neighbours returned per queried node.

        Returns:
            Integer tensor of shape ``(len(node_ids), top_k)`` holding row
            indices into ``embeddings``; the queried node itself is pushed to
            the bottom of its own ranking.
        """
        # Clamp the norm so an all-zero embedding row yields zero similarity
        # instead of NaN (NaN scores would corrupt the top-k ranking).
        norms = embeddings.norm(dim=1, keepdim=True).clamp_min(1e-12)
        norm_embeddings = embeddings / norms
        selected = norm_embeddings[node_ids]
        scores = selected @ norm_embeddings.T
        # Exclude each queried node from its own ranking; build the row index
        # on the same device as the scores.
        row_index = torch.arange(len(node_ids), device=scores.device)
        scores[row_index, node_ids] = -1e9
        return torch.topk(scores, top_k, dim=1).indices

# ------------------------- 3. Data loading -------------------------
threshold = 2  # Minimum co-citation weight kept (noise reduction)
alpha = 0.8  # Weight of the direct graph in the blend

adj_direct = mmread("data/TP4-matrice-adjacence.dgt").tocsr()

# Second-order graph (labelled "co-citation" by the author): A^T @ A without
# its diagonal.
co_citation = adj_direct.T @ adj_direct
co_citation.setdiag(0)
co_citation.eliminate_zeros()

# Drop entries below the threshold, then dampen the remaining weights with
# log(1 + x).
below_threshold = co_citation.data < threshold
co_citation.data[below_threshold] = 0
co_citation.eliminate_zeros()
co_citation.data = np.log1p(co_citation.data)

# Blend of the direct graph and the second-order graph.
adj = alpha * adj_direct + (1 - alpha) * co_citation

# Article identifiers (all rows) and the subset to generate recommendations for.
ids_df = pd.read_csv("data/TP4-ids.csv", dtype=str)
ids = ids_df["id"].values
ids_test_df = pd.read_csv("data/TP4-ids-test.csv", dtype=str).dropna()
ids_test = ids_test_df["id"].values

# Map each identifier to its position in `ids`, then resolve the test
# identifiers to those positions.
id_to_index = dict(zip(ids, range(len(ids))))
test_indices = [id_to_index[test_id] for test_id in ids_test]

# ------------------------- 4. Normalised adjacency as a torch sparse tensor -------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Symmetrise the graph, normalise it, and switch to COO to expose row/col arrays.
norm_adj = normalize_adj(adj + adj.T).tocoo()
norm_adj_indices = torch.tensor(
    np.vstack((norm_adj.row, norm_adj.col)), dtype=torch.long
)
norm_adj_values = torch.tensor(norm_adj.data, dtype=torch.float32)
norm_adj_tensor = torch.sparse_coo_tensor(
    norm_adj_indices,
    norm_adj_values,
    torch.Size(norm_adj.shape),
).to(device)

# ------------------------- 5. Training with BPR loss -------------------------
# One embedding per row of `adj`, optimised with Adam.
model = LightGCN(num_nodes=adj.shape[0]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def bpr_loss(u_emb, i_emb, j_emb):
    """Return the mean Bayesian Personalised Ranking loss of a batch.

    Args:
        u_emb: Anchor embeddings, one row per sample.
        i_emb: Positive embeddings, same shape as ``u_emb``.
        j_emb: Negative embeddings, same shape as ``u_emb``.

    Returns:
        Scalar tensor ``mean(-log(sigmoid(<u, i> - <u, j>)))``.
    """
    score_pos = (u_emb * i_emb).sum(dim=1)
    score_neg = (u_emb * j_emb).sum(dim=1)
    # logsigmoid is the numerically stable form of log(sigmoid(x)): the naive
    # composition underflows to log(0) = -inf for large negative margins.
    return -torch.nn.functional.logsigmoid(score_pos - score_neg).mean()

def evaluate_and_log(epoch, loss, best_loss, best_epoch, epochs_no_improve):
    """Print a one-line training status for the given epoch.

    Args:
        epoch: Index of the current epoch.
        loss: Object exposing ``.item()`` that returns the current loss value.
        best_loss: Best loss value recorded so far.
        best_epoch: Epoch at which ``best_loss`` was recorded.
        epochs_no_improve: Accepted for call-site compatibility; not used.
    """
    current_loss = loss.item()
    status_line = f"Epoch {epoch} - BPR Loss: {current_loss:.4f} | Best: {best_loss:.4f} (epoch {best_epoch})"
    print(status_line)

# Positive pairs (u, i): one row per non-zero entry of `adj`.
edge_index = np.vstack(adj.nonzero()).T
num_nodes = adj.shape[0]
batch_size = 8192
epochs = 800
k = 50  # Number of negative candidates drawn per positive pair

# Early-stopping parameters
best_loss = float('inf')
best_epoch = -1
epochs_no_improve = 0
patience = 30
min_delta = 1e-4

# CSR copy used for edge-membership lookups; it does not change between
# epochs, so it is built once instead of on every iteration.
adj_csr = adj.tocsr()

model.train()
for epoch in range(epochs):
    emb = model(norm_adj_tensor)

    # Sample positive pairs and, for each, k random negative candidates.
    sampled_idx = np.random.randint(0, len(edge_index), size=batch_size)
    u_np, i_np = edge_index[sampled_idx].T
    neg_candidates = np.random.randint(0, num_nodes, size=(batch_size, k))

    # A candidate j is invalid when it equals the positive item i or when
    # (u, j) is itself a stored edge of `adj`.
    invalid_j_eq_i = neg_candidates == i_np[:, np.newaxis]
    rows = np.repeat(u_np, k)
    cols = neg_candidates.flatten()
    is_edge = np.array(adj_csr[rows, cols] != 0).reshape(batch_size, k)
    mask_valid = ~(invalid_j_eq_i | is_edge)

    # Convert to tensors; force int64 so indexing does not depend on the
    # platform-specific integer dtype produced by NumPy/SciPy.
    u_tensor = torch.tensor(u_np, dtype=torch.long, device=device)
    i_tensor = torch.tensor(i_np, dtype=torch.long, device=device)
    j_tensor_all = torch.tensor(neg_candidates, dtype=torch.long, device=device)
    mask_valid_tensor = torch.tensor(mask_valid, device=device)

    # Hard-negative selection: keep, per sample, the valid candidate with the
    # highest current score. Only the chosen index is needed (the loss is
    # computed from u_emb, i_emb and j_emb below), so no autograd graph is
    # recorded for the candidate scores.
    with torch.no_grad():
        scores = (emb[u_tensor].unsqueeze(1) * emb[j_tensor_all]).sum(dim=2)
        scores[~mask_valid_tensor] = -1e9
        best_j_indices = scores.argmax(dim=1)
    j_tensor = j_tensor_all[torch.arange(batch_size, device=device), best_j_indices]

    u_emb = emb[u_tensor]
    i_emb = emb[i_tensor]
    j_emb = emb[j_tensor]

    # Optimisation step
    loss = bpr_loss(u_emb, i_emb, j_emb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        evaluate_and_log(epoch, loss, best_loss, best_epoch, epochs_no_improve)

    # Early stopping on the per-batch training loss: an epoch counts as an
    # improvement only if it beats the best loss by more than `min_delta`.
    loss_value = loss.item()
    if best_loss - loss_value > min_delta:
        best_loss = loss_value
        best_epoch = epoch
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch} (no improvement for {patience} epochs)")
            break

# ------------------------- 6. Génération des recommandations -------------------------
def generate_recommendations(model, norm_adj_tensor, test_indices, ids):
    """Compute the top-20 recommended identifiers for each test node.

    Args:
        model: Object providing ``eval()``, being callable on
            ``norm_adj_tensor`` to produce embeddings, and exposing
            ``recommend(embeddings, node_ids, top_k=...)``.
        norm_adj_tensor: Tensor passed to the model; its device is used for
            the index tensor built here.
        test_indices: Sequence of integer positions to query.
        ids: Sequence mapping a position to its identifier.

    Returns:
        Dict mapping ``str(ids[test_idx])`` to the list of recommended
        identifiers (as strings), in the order returned by ``recommend``.
    """
    model.eval()
    # Take the device from the input tensor instead of a module-level global
    # so the function works wherever it is called from.
    target_device = norm_adj_tensor.device
    with torch.no_grad():
        final_embeddings = model(norm_adj_tensor)
        test_tensor = torch.tensor(test_indices, dtype=torch.long, device=target_device)
        topk = model.recommend(final_embeddings, test_tensor, top_k=20)

    # Single host transfer instead of one per row.
    topk_rows = topk.cpu().numpy()
    results = {}
    for test_idx, recommended in zip(test_indices, topk_rows):
        results[str(ids[test_idx])] = [str(ids[x]) for x in recommended]
    return results

# Build the recommendation lists for every test article with the trained model.
reco_results = generate_recommendations(
    model=model,
    norm_adj_tensor=norm_adj_tensor.to(device),
    test_indices=test_indices,
    ids=ids,
)

def recall_at_k(reco_results, adj, test_indices, ids, k=20):
    """Return the fraction of stored neighbours found in the top-``k`` lists.

    Args:
        reco_results: Dict mapping an identifier to its ordered list of
            recommended identifiers.
        adj: Matrix whose row slice ``adj[idx]`` exposes ``.indices`` (e.g. a
            SciPy CSR matrix); those indices are treated as the true neighbours.
        test_indices: Positions of the nodes to evaluate.
        ids: Sequence mapping a position to its identifier.
        k: Number of leading recommendations considered per node.

    Returns:
        ``hits / total`` summed over all evaluated nodes, or 0.0 when no
        evaluated node has any neighbour.
    """
    # Build the identifier -> position mapping from the `ids` argument rather
    # than relying on a module-level dictionary.
    index_of = {id_: idx for idx, id_ in enumerate(ids)}
    hits = 0
    total = 0

    for idx in test_indices:
        true_neighbors = set(adj[idx].indices)
        # Recommendations whose identifier is unknown are ignored.
        recommended = {
            index_of[rec_id]
            for rec_id in reco_results[ids[idx]][:k]
            if rec_id in index_of
        }
        hits += len(true_neighbors & recommended)
        total += len(true_neighbors)

    return hits / total if total > 0 else 0.0

# Recall is measured against `adj`, the same graph the model was trained on,
# hence "approximate".
recall_score = recall_at_k(reco_results, adj, test_indices, ids, k=20)
print(f"Approximate Recall@20 on direct neighbors: {recall_score:.4f}")

# ------------------------- 7. Result preview -------------------------
# Show the recommendation lists of the first five test articles.
preview = list(reco_results.items())[:5]
for test_article, recs in preview:
    print(f"Article {test_article} => {recs}")

# ------------------------- 8. Saving the recommendations -------------------------
# Sanity-check the results before writing them out.
print("Vérification des recommandations...")

valid_ids = set(ids)
nb_valid = 0
nb_invalid = 0
nb_wrong_length = 0

for article_id, recs in reco_results.items():
    has_wrong_length = len(recs) != 20
    if has_wrong_length:
        print(f"WARNING: Article {article_id} : {len(recs)} recommandations (au lieu de 20)")
        nb_wrong_length += 1
    invalids = [rec for rec in recs if rec not in valid_ids]
    if invalids:
        print(f"WARNING: Article {article_id} : {len(invalids)} identifiants invalides : {invalids}")
        nb_invalid += 1
    # An article is valid only when it has exactly 20 recommendations AND all
    # of them are known identifiers; previously a wrong-length list with no
    # invalid identifier was still counted as valid.
    if not has_wrong_length and not invalids:
        nb_valid += 1

print(f"{nb_valid} articles avec 20 recommandations valides")
if nb_wrong_length or nb_invalid:
    print(f"WARNING: {nb_wrong_length} avec mauvais nombre de recommandations, {nb_invalid} avec ID invalides")
else:
    print("Tout est prêt pour l'export")


import csv

# Write the submission file: a header row, then one row per article holding
# its identifier and its recommendations joined by single spaces.
with open("soumission_lightgcn.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["id", "recommandations"])
    writer.writerows(
        [article_id, " ".join(recs)]
        for article_id, recs in reco_results.items()
    )

print("Fichier de soumission 'soumission_lightgcn.csv' généré avec succès.")



Epoch 0 - BPR Loss: 0.6932 | Best: inf (epoch -1)
Epoch 5 - BPR Loss: 0.6931 | Best: 0.6932 (epoch 0)
Epoch 10 - BPR Loss: 0.6931 | Best: 0.6932 (epoch 0)
Epoch 15 - BPR Loss: 0.6930 | Best: 0.6930 (epoch 13)
Epoch 20 - BPR Loss: 0.6928 | Best: 0.6929 (epoch 17)
Epoch 25 - BPR Loss: 0.6925 | Best: 0.6925 (epoch 24)
Epoch 30 - BPR Loss: 0.6920 | Best: 0.6921 (epoch 29)
Epoch 35 - BPR Loss: 0.6914 | Best: 0.6915 (epoch 34)
Epoch 40 - BPR Loss: 0.6904 | Best: 0.6906 (epoch 39)
Epoch 45 - BPR Loss: 0.6891 | Best: 0.6896 (epoch 44)
Epoch 50 - BPR Loss: 0.6879 | Best: 0.6880 (epoch 49)
Epoch 55 - BPR Loss: 0.6863 | Best: 0.6864 (epoch 54)
Epoch 60 - BPR Loss: 0.6843 | Best: 0.6844 (epoch 59)
Epoch 65 - BPR Loss: 0.6820 | Best: 0.6824 (epoch 64)
Epoch 70 - BPR Loss: 0.6793 | Best: 0.6800 (epoch 69)
Epoch 75 - BPR Loss: 0.6765 | Best: 0.6769 (epoch 74)
Epoch 80 - BPR Loss: 0.6735 | Best: 0.6738 (epoch 79)
Epoch 85 - BPR Loss: 0.6691 | Best: 0.6709 (epoch 84)
Epoch 90 - BPR Loss: 0.6671 | Best:

### Possible additions

- Normalize the embeddings
- Apply dropout to the embeddings
- Add a learning-rate scheduler
- Increase the embedding dimension to 128 or 256