In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Charger les articles de presse
news_df = pd.read_csv('MINDsmall_train/news.tsv', sep='\t')

# Charger les comportements des utilisateurs
behaviors_df = pd.read_csv('MINDsmall_train/behaviors.tsv', sep='\t')

# Explorer les premières lignes
print(news_df.head())
print(behaviors_df.head())


   N55528 lifestyle lifestyleroyals  \
0  N19639    health      weightloss   
1  N61837      news       newsworld   
2  N53526    health          voices   
3  N38324    health         medical   
4   N2073    sports    football_nfl   

  The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By  \
0                      50 Worst Habits For Belly Fat                       
1  The Cost of Trump's Aid Freeze in the Trenches...                       
2  I Was An NBA Wife. Here's How It Affected My M...                       
3  How to Get Rid of Skin Tags, According to a De...                       
4  Should NFL be able to fine players for critici...                       

  Shop the notebooks, jackets, and more that the royals can't live without.  \
0  These seemingly harmless habits are holding yo...                          
1  Lt. Ivan Molchanets peeked over a parapet of s...                          
2  I felt like I was a fraud, and being an NBA wi...                   

In [3]:
print(news_df.columns)

Index(['N55528', 'lifestyle', 'lifestyleroyals',
       'The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By',
       'Shop the notebooks, jackets, and more that the royals can't live without.',
       'https://assets.msn.com/labs/mind/AAGH0ET.html',
       '[{"Label": "Prince Philip, Duke of Edinburgh", "Type": "P", "WikidataId": "Q80976", "Confidence": 1.0, "OccurrenceOffsets": [48], "SurfaceForms": ["Prince Philip"]}, {"Label": "Charles, Prince of Wales", "Type": "P", "WikidataId": "Q43274", "Confidence": 1.0, "OccurrenceOffsets": [28], "SurfaceForms": ["Prince Charles"]}, {"Label": "Elizabeth II", "Type": "P", "WikidataId": "Q9682", "Confidence": 0.97, "OccurrenceOffsets": [11], "SurfaceForms": ["Queen Elizabeth"]}]',
       '[]'],
      dtype='object')


In [4]:
# Renommer les colonnes pour plus de clarté
news_df.columns = ['article_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'entities', 'extra_data']

# Afficher un exemple pour vérifier
print(news_df[['title', 'abstract']].head())


                                               title  \
0                      50 Worst Habits For Belly Fat   
1  The Cost of Trump's Aid Freeze in the Trenches...   
2  I Was An NBA Wife. Here's How It Affected My M...   
3  How to Get Rid of Skin Tags, According to a De...   
4  Should NFL be able to fine players for critici...   

                                            abstract  
0  These seemingly harmless habits are holding yo...  
1  Lt. Ivan Molchanets peeked over a parapet of s...  
2  I felt like I was a fraud, and being an NBA wi...  
3  They seem harmless, but there's a very good re...  
4  Several fines came down against NFL players fo...  


In [6]:
# Nettoyage simple sur les titres des articles
news_df['title'] = news_df['title'].str.lower().str.replace('[^\w\s]', '')

# Gestion des données manquantes (par exemple, remplir avec une valeur par défaut)
news_df.fillna('Unknown', inplace=True)


In [None]:
# Appliquer le TF-IDF sur les titres d'articles
vectorizer = TfidfVectorizer(max_features=50)
X = vectorizer.fit_transform(news_df['title'])

# Calculer la similarité cosinus entre articles
cosine_similarities = cosine_similarity(X)


In [8]:
cosine_similarities

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.51311051],
       [0.        , 1.        , 0.        , ..., 0.27885586, 0.28558196,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.62526126,
        0.        ],
       ...,
       [0.        , 0.27885586, 0.        , ..., 1.        , 0.24293006,
        0.        ],
       [0.        , 0.28558196, 0.62526126, ..., 0.24293006, 1.        ,
        0.        ],
       [0.51311051, 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])