In [3]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Score every article in the cleaned news table against a single user profile
# vector using cosine similarity over TF-IDF features, then write the table to
# CSV sorted from most to least similar.

# Load cleaned data
df = pd.read_csv('../data/cleaned_news.csv')

# Load TF-IDF matrix.
# NOTE(security): pickle.load can execute arbitrary code while deserializing.
# Only load pickle files produced by this project, never untrusted input.
with open('../data/tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

# Load user profile (same pickle caveat as above).
with open('../data/user_profile.pkl', 'rb') as f:
    user_profile = pickle.load(f)

# Debug: Check types and shapes
print("Type of user_profile:", type(user_profile))
print("Type of tfidf_matrix:", type(tfidf_matrix))
print("Shape of user_profile:", user_profile.shape if hasattr(user_profile, 'shape') else "No shape (not an array)")
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

# Convert user_profile to a plain NumPy array (e.g. from np.matrix) and make
# sure it is 2-D: a flat vector becomes a single row, so the shape checks and
# cosine_similarity below never see a 1-D input.
user_profile = np.atleast_2d(np.asarray(user_profile))

# Verify shapes
print("Shape of user_profile after conversion:", user_profile.shape)
print("Type of user_profile after conversion:", type(user_profile))

# Ensure tfidf_matrix is in CSR format, converting it when it is anything else.
if not isinstance(tfidf_matrix, csr_matrix):
    tfidf_matrix = csr_matrix(tfidf_matrix)

# Exactly one profile row is expected: several rows would be flattened into
# one long vector that no longer lines up with the rows of df.
if user_profile.shape[0] != 1:
    raise ValueError(f"Expected a single user profile row, got {user_profile.shape[0]} rows.")

# Verify feature compatibility
if user_profile.shape[1] != tfidf_matrix.shape[1]:
    raise ValueError(f"Feature dimension mismatch: user_profile has {user_profile.shape[1]} features, tfidf_matrix has {tfidf_matrix.shape[1]} features.")

# Each TF-IDF row must correspond to one DataFrame row; fail early with a
# clear message instead of at column assignment.
if tfidf_matrix.shape[0] != len(df):
    raise ValueError(f"Row count mismatch: tfidf_matrix has {tfidf_matrix.shape[0]} rows, DataFrame has {len(df)} rows.")

# Compute cosine similarity; the (1, n_articles) result is reduced to 1-D.
similarity_scores = cosine_similarity(user_profile, tfidf_matrix).ravel()

# Add similarity scores to DataFrame
df['similarity'] = similarity_scores

# Save results, creating the output directory first so a fresh checkout
# does not fail on a missing folder.
results_path = '../results/similarity_scores.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
df.sort_values(by='similarity', ascending=False).to_csv(results_path, index=False)


Type of user_profile: <class 'numpy.matrix'>
Type of tfidf_matrix: <class 'scipy.sparse._csr.csr_matrix'>
Shape of user_profile: (1, 5000)
Shape of tfidf_matrix: (48616, 5000)
Shape of user_profile after conversion: (1, 5000)
Type of user_profile after conversion: <class 'numpy.ndarray'>
