Extract top keywords for each article using TF-IDF

In [1]:
import pandas as pd


# Read the reduced article dataset (path is relative to this notebook's folder)
df = pd.read_csv(filepath_or_buffer="../reduced_dataset.csv")



In [2]:
# Inspect the size of the loaded frame as (rows, columns)
df.shape

(50001, 16)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing article bodies with empty strings so they are treated as
# empty documents rather than NaN.
df['processed_text'] = df['processed_text'].fillna("")

# Build the text the TF-IDF vectorizer is fitted on: title followed by body.
# The title is NaN-filled too (without modifying the 'title' column itself):
# NaN + str yields NaN, so a single missing title would otherwise turn the
# whole combined entry into NaN and make fit_transform fail on that row.
df['combined'] = df['title'].fillna("") + " " + df['processed_text']

In [4]:
# TF-IDF model: English stop words are dropped and the vocabulary is capped
# at 1000 terms
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)

# Learn the vocabulary from the combined text and vectorize it in one pass
tfidf_matrix = vectorizer.fit_transform(df['combined'])

# Vocabulary terms, positioned to line up with the columns of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()


In [5]:
import numpy as np

# Function to get top n keywords for a single document
def extract_top_keywords(row_index, top_n=5, matrix=None, names=None):
    """Return the highest-weighted TF-IDF terms of one document.

    Parameters
    ----------
    row_index : int
        Row of the TF-IDF matrix (i.e. which document) to inspect.
    top_n : int, default 5
        Maximum number of keywords to return. Values <= 0 return an empty list.
    matrix : sparse matrix, optional
        TF-IDF matrix to read from; must support ``matrix[row].toarray()``.
        Defaults to the notebook-level ``tfidf_matrix``.
    names : sequence of str, optional
        Term for each matrix column. Defaults to the notebook-level
        ``feature_names``.

    Returns
    -------
    list
        Up to ``top_n`` terms, ordered from highest to lowest weight. Terms
        with a zero weight are never returned, so the list is shorter than
        ``top_n`` (possibly empty) for documents with few known terms.
    """
    # Fall back to the globals created by the fitting cell
    if matrix is None:
        matrix = tfidf_matrix
    if names is None:
        names = feature_names

    # Guard: with top_n == 0 the slice [-0:] would select *every* term
    if top_n <= 0:
        return []

    scores = matrix[row_index].toarray().ravel()
    ranked = scores.argsort()[::-1][:top_n]
    # Skip zero-weight columns: those terms do not occur in the document
    return [names[i] for i in ranked if scores[i] > 0]

# Demo: extract keywords for the first article (row 0) with the default top_n
first_article_keywords = extract_top_keywords(0)
print("Top 5 Keywords for Article 0:", first_article_keywords)


Top 5 Keywords for Article 0: ['truth', 'wasnt', 'disease', 'thats', 'thing']


In [6]:
import joblib

# Persist the fitted vectorizer to disk so it can be reloaded without refitting
joblib.dump(value=vectorizer, filename='Keyword_Finder_tfidf_vectorizer.pkl')


['Keyword_Finder_tfidf_vectorizer.pkl']

In [7]:
# Reload the vectorizer from the file written by joblib.dump above; the path
# must match the saved name exactly or the load fails on a fresh run.
# Note: joblib files are pickle-based, so only load files you trust.
loaded_vectorizer = joblib.load('Keyword_Finder_tfidf_vectorizer.pkl')