In [28]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the PEEKC dataset v2 id -> Wikipedia mapping (this version carries
# title + description metadata for each page).
wiki_mapping = pd.read_csv(
    '../dataset/PEEKC-Dataset-main/datasets/v2/id_to_wiki_metadata_mapping.csv'
)

# Report the size and schema of what was loaded, then preview the first rows.
print(f"[INFO] Wiki mapping shape: {wiki_mapping.shape}")
print(f"[INFO] Columns: {list(wiki_mapping.columns)}")
wiki_mapping.head(n=10)

[INFO] Wiki mapping shape: (30413, 4)
[INFO] Columns: ['id', 'url', 'title', 'description']


Unnamed: 0,id,url,title,description
0,0,"https://en.wikipedia.org/wiki/""Hello,_World!""_...","""Hello, World!"" program",Traditional beginners' computer program
1,1,https://en.wikipedia.org/wiki/-elect,-elect,Person who has been elected to a position but ...
2,2,https://en.wikipedia.org/wiki/.45-70,.45-70,Rifle cartridge designed by the U.S. Army
3,3,https://en.wikipedia.org/wiki/.amazon,.amazon,Disputed top-level domain
4,4,https://en.wikipedia.org/wiki/.gov,.gov,Sponsored top-level Internet domain used by Un...
5,5,https://en.wikipedia.org/wiki/1_+_2_+_3_+_4_+_⋯,1 + 2 + 3 + 4 + ⋯,Divergent series
6,6,"https://en.wikipedia.org/wiki/1,2-Ethanedithiol","Ethane-1,2-dithiol",Chemical compound
7,7,"https://en.wikipedia.org/wiki/1,2-rearrangement","1,2-rearrangement",Organic chemical reaction
8,8,https://en.wikipedia.org/wiki/100-year_flood,100-year flood,Flood event that has a 1 in 100 chance of bein...
9,9,https://en.wikipedia.org/wiki/15_minutes_of_fame,15 minutes of fame,Short-lived media publicity or celebrity


In [29]:
# Build a single searchable text field per page (title + description) so the
# search can match on either, then discard rows that have no title.
# Missing values are replaced by '' before concatenation so the result is
# never NaN.
wiki_mapping = wiki_mapping.assign(
    combined_text=(
        wiki_mapping['title'].fillna('')
        + ' '
        + wiki_mapping['description'].fillna('')
    )
).dropna(subset=['title'])

wiki_mapping.head(n=10)

Unnamed: 0,id,url,title,description,combined_text
0,0,"https://en.wikipedia.org/wiki/""Hello,_World!""_...","""Hello, World!"" program",Traditional beginners' computer program,"""Hello, World!"" program Traditional beginners'..."
1,1,https://en.wikipedia.org/wiki/-elect,-elect,Person who has been elected to a position but ...,-elect Person who has been elected to a positi...
2,2,https://en.wikipedia.org/wiki/.45-70,.45-70,Rifle cartridge designed by the U.S. Army,.45-70 Rifle cartridge designed by the U.S. Army
3,3,https://en.wikipedia.org/wiki/.amazon,.amazon,Disputed top-level domain,.amazon Disputed top-level domain
4,4,https://en.wikipedia.org/wiki/.gov,.gov,Sponsored top-level Internet domain used by Un...,.gov Sponsored top-level Internet domain used ...
5,5,https://en.wikipedia.org/wiki/1_+_2_+_3_+_4_+_⋯,1 + 2 + 3 + 4 + ⋯,Divergent series,1 + 2 + 3 + 4 + ⋯ Divergent series
6,6,"https://en.wikipedia.org/wiki/1,2-Ethanedithiol","Ethane-1,2-dithiol",Chemical compound,"Ethane-1,2-dithiol Chemical compound"
7,7,"https://en.wikipedia.org/wiki/1,2-rearrangement","1,2-rearrangement",Organic chemical reaction,"1,2-rearrangement Organic chemical reaction"
8,8,https://en.wikipedia.org/wiki/100-year_flood,100-year flood,Flood event that has a 1 in 100 chance of bein...,100-year flood Flood event that has a 1 in 100...
9,9,https://en.wikipedia.org/wiki/15_minutes_of_fame,15 minutes of fame,Short-lived media publicity or celebrity,15 minutes of fame Short-lived media publicity...


In [30]:
# Fit a TF-IDF vectorizer on the combined text (title + description) and keep
# the resulting document-term matrix. Its rows follow the row order of
# `wiki_mapping`.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
corpus_texts = wiki_mapping['combined_text']
tfidf_matrix = vectorizer.fit_transform(corpus_texts)

print(f"[INFO] TF-IDF shape: {tfidf_matrix.shape}")

[INFO] TF-IDF shape: (30358, 5000)


In [31]:
def recommend_modules(query, top_n=10):
    """
    Find the Wikipedia modules most similar to the user's query.

    The query is vectorized with the module-level ``vectorizer`` and compared
    by cosine similarity against every row of ``tfidf_matrix``. Rows of
    ``tfidf_matrix`` are matched to rows of ``wiki_mapping`` by position, so
    both must have been built from the same frame in the same order.

    No minimum-score filter is applied: rows with a low (or zero) similarity
    can still be returned when nothing in the corpus matches better.

    Args:
        query: search text (a topic or a module name)
        top_n: maximum number of recommendations to return; values <= 0
            yield an empty result

    Returns:
        DataFrame with columns ``id``, ``title``, ``url`` and
        ``similarity_score``, ordered by decreasing similarity. It has at
        most ``top_n`` rows (fewer when the corpus is smaller).
    """
    # Vectorize the query with the already-fitted vectorizer
    query_vec = vectorizer.transform([query])

    # Cosine similarity between the query and every indexed document
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Best matches first. Reversing before slicing (instead of slicing with
    # [-top_n:]) matters for top_n == 0: [-0:] is [0:] and would select the
    # whole corpus. Clamping at 0 also makes negative values return nothing.
    limit = max(top_n, 0)
    top_indices = similarities.argsort()[::-1][:limit]

    # Assemble the result; iloc because top_indices are row positions
    results = wiki_mapping.iloc[top_indices].copy()
    results['similarity_score'] = similarities[top_indices]

    return results[['id', 'title', 'url', 'similarity_score']]

# Smoke test: look up a sample topic and display its five closest modules.
test_query = "machine learning"
print(f"\n[SEARCH] Recherche: '{test_query}'\n")
recommendations = recommend_modules(test_query, top_n=5)
recommendations


[SEARCH] Recherche: 'machine learning'



Unnamed: 0,id,title,url,similarity_score
2902,2902,Boosting (machine learning),https://en.wikipedia.org/wiki/Boosting_(machin...,0.956483
13203,13203,Incremental learning,https://en.wikipedia.org/wiki/Incremental_lear...,0.885379
303,303,Active learning (machine learning),https://en.wikipedia.org/wiki/Active_learning_...,0.841301
28855,28855,Unsupervised learning,https://en.wikipedia.org/wiki/Unsupervised_lea...,0.82772
12892,12892,Hyperparameter (machine learning),https://en.wikipedia.org/wiki/Hyperparameter_(...,0.817951
