In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Read the CSV into `data` (the path is resolved relative to the kernel's
# working directory), then show the first rows as a quick sanity check.
data = pd.read_csv(filepath_or_buffer="combined_data_with_sentiment.csv")
data.head()

In [None]:
# Keep only the rows whose 'cleaned_reviews' value is present (non-NaN);
# the TF-IDF step below reads this column.
data = data[data['cleaned_reviews'].notna()]

In [None]:
# Content-based filtering compares items using their own features. Here the
# only feature used is the text in 'cleaned_reviews'; the other columns of
# `data` are not part of the similarity computation.

# Turn every review into a TF-IDF vector, ignoring English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_reviews'])

# Pairwise cosine similarity of every row against every other row. With a
# single argument, cosine_similarity compares the matrix with itself.
# NOTE(review): the result is a dense (n_rows x n_rows) array, so memory
# grows quadratically with the number of rows.
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get the most similar courses for a given course index
def get_similar_courses(course_index, cosine_sim, top_n=5, courses=None):
    """Return the rows most similar to the course at ``course_index``.

    Parameters
    ----------
    course_index : int
        Positional (``iloc``-style) row of the query course. Negative values
        count from the end, as with normal sequence indexing.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the scores of
        row ``i`` against every row.
    top_n : int, default 5
        Maximum number of neighbours to return. Values below 1 yield an
        empty selection.
    courses : pandas.DataFrame, optional
        Frame whose rows line up positionally with ``cosine_sim``. When
        omitted, the notebook-level ``data`` frame is used.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``courses`` ordered by descending similarity,
        never including the query row itself. Ties keep their original row
        order.
    """
    if courses is None:
        # Backward-compatible default: fall back to the notebook-level frame.
        courses = data

    scores = cosine_sim[course_index]
    n_items = len(scores)
    # Normalise a negative index so it can be compared with positions below.
    query_pos = course_index + n_items if course_index < 0 else course_index

    # Exclude the query row explicitly. Dropping "whatever sorts first" is
    # wrong when another row ties with the query (duplicate text) or when
    # every score in the row is 0 (all-zero TF-IDF vector).
    candidates = [pos for pos in range(n_items) if pos != query_pos]
    # sorted() is stable, so equal scores stay in original row order.
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    return courses.iloc[ranked[:max(top_n, 0)]]

# Look up the five nearest neighbours of the course at position 0 and print
# only the columns needed to identify them.
similar_courses = get_similar_courses(0, cosine_sim, top_n=5)
print(similar_courses.loc[:, ['name', 'institution', 'rating']])