In [5]:
# 1. Imports
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import vstack


In [23]:
# Load Preprocessed Data and TF-IDF Matrix
df = pd.read_csv('../data/cleaned_news.csv')

with open('../data/tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

with open('../data/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)



In [25]:
# Show available categories
available_categories = df['Category'].dropna().unique().tolist()
print("Available Categories:")
print(", ".join(available_categories))

# Step 2: Get user input
user_input = input("Enter your preferred categories, separated by commas: ")
selected_categories = [cat.strip() for cat in user_input.split(',') if cat.strip() in available_categories]

if not selected_categories:
    print("No valid categories selected. Using default fallback (empty user profile).")
    matched_indices = []
else:
    # Step 3: Filter articles that match selected categories
    matched_indices = df[df['Category'].isin(selected_categories)].index.tolist()
    print(f"\nSelected categories: {selected_categories}")
    print(f"Number of matched articles: {len(matched_indices)}")
    display(df.iloc[matched_indices][['Title', 'Category']].head())


Available Categories:
lifestyle, health, news, sports, weather, entertainment, autos, travel, foodanddrink, tv, finance, movies, video, music, kids, middleeast, northamerica


Enter your preferred categories, separated by commas:  health,news,sports



Selected categories: ['health', 'news', 'sports']
Number of matched articles: 30268


Unnamed: 0,Title,Category
1,50 Worst Habits For Belly Fat,health
2,The Cost of Trump's Aid Freeze in the Trenches...,news
3,I Was An NBA Wife. Here's How It Affected My M...,health
4,"How to Get Rid of Skin Tags, According to a De...",health
5,Should NFL be able to fine players for critici...,sports


In [27]:
# Build User Profile Vector
# Average the TF-IDF vectors of the selected articles
from scipy.sparse import csr_matrix

if matched_indices:
    user_profile = tfidf_matrix[matched_indices].mean(axis=0)
else:
    user_profile = csr_matrix((1, tfidf_matrix.shape[1]))  


In [29]:
# 5. Save the User Profile
with open('../data/user_profile.pkl', 'wb') as f:
    pickle.dump(user_profile, f)


In [31]:
dff = pd.read_pickle('../data/user_profile.pkl')
dff

matrix([[0.0011177 , 0.00035044, 0.00049104, ..., 0.00044066, 0.00036674,
         0.00035199]])