In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from scipy.sparse import hstack

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows.
DATASET_CSV = "combined_data_with_sentiment.csv"
data = pd.read_csv(DATASET_CSV)
data.head()

In [None]:
# Drop rows with NaN values in 'cleaned_reviews' (rows without review text
# cannot be vectorized by the TF-IDF step that follows).
# .copy() makes the filtered result an independent frame, so adding the cluster
# columns to `data` later cannot trigger pandas' SettingWithCopyWarning.
data = data.dropna(subset=['cleaned_reviews']).copy()

In [None]:
# Content-based filtering recommends items from the features of the items
# themselves. This cell uses only the review text in `cleaned_reviews`; the
# numeric columns (sentiment_score, rating, ...) are not used here.

# TF-IDF representation of every row's cleaned review text (English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
review_text = data['cleaned_reviews']
tfidf_matrix = tfidf_vectorizer.fit_transform(review_text)

# Pairwise cosine similarity between all rows (the second argument defaults to the first)
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to get the most similar courses for a given course index
def get_similar_courses(course_index, cosine_sim, top_n=5, courses=None):
    """Return the ``top_n`` rows most similar to the row at ``course_index``.

    Parameters
    ----------
    course_index : int
        Positional (0-based) row of the query course; negative values count
        from the end.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; ``cosine_sim[i]`` holds the similarity of
        row ``i`` to every row.
    top_n : int, default 5
        Maximum number of rows to return.
    courses : pandas.DataFrame, optional
        Frame whose row order matches ``cosine_sim``. Defaults to the
        notebook-level ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows, most similar first. The query row is never
        included.
    """
    frame = data if courses is None else courses
    row = cosine_sim[course_index]
    self_position = course_index if course_index >= 0 else course_index + len(row)

    # Exclude the query row by position instead of assuming it sorts first:
    # a tied score (e.g. duplicate review text) or an all-zero TF-IDF vector
    # can place another row ahead of it.
    candidates = [(pos, score) for pos, score in enumerate(row) if pos != self_position]
    candidates.sort(key=lambda pair: pair[1], reverse=True)  # stable: ties keep row order

    return frame.iloc[[pos for pos, _ in candidates[:top_n]]]

# Show the most similar courses (default top_n=5) for the row at position 0
similar_courses = get_similar_courses(0, cosine_sim)
display_columns = ['name', 'institution', 'rating']
print(similar_courses[display_columns])

In [None]:
def get_top_rated_similar_courses(course_index, cosine_sim, top_n=5):
    """Return the ``top_n`` most similar courses ordered by rating, highest first.

    Parameters
    ----------
    course_index : int
        Positional row of the query course, passed to ``get_similar_courses``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix, passed to ``get_similar_courses``.
    top_n : int, default 5
        Number of similar courses to fetch before ordering by rating.

    Returns
    -------
    pandas.DataFrame
        The similar courses with ``rating`` cast to float, sorted descending
        by ``rating``.
    """
    similar_courses = get_similar_courses(course_index, cosine_sim, top_n)
    # assign() builds a new frame instead of writing a column into the slice
    # returned by get_similar_courses, which avoids SettingWithCopyWarning.
    ranked = similar_courses.assign(rating=similar_courses['rating'].astype(float))
    return ranked.sort_values(by='rating', ascending=False)

# Similar courses for the row at position 0, re-ordered by rating
top_rated_similar_courses = get_top_rated_similar_courses(0, cosine_sim)
rating_columns = ['name', 'rating']
print(top_rated_similar_courses[rating_columns])

In [None]:
# Text features: TF-IDF of the cleaned review text, limited to 1000 terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_reviews'])

# Numeric features (rating, review_count, sentiment_score, avg_rating), standardized.
# NOTE(review): only NaNs in 'cleaned_reviews' are dropped earlier; NaNs in these
# columns would presumably make the clustering below fail -- confirm the data is clean.
numeric_features = data[['rating', 'review_count', 'sentiment_score', 'avg_rating']]
scaler = StandardScaler()
numeric_features_scaled = scaler.fit_transform(numeric_features)

# Combine TF-IDF and numeric features into a single sparse feature matrix.
# CSR is requested explicitly (hstack's default result is COO) because CSR
# supports row indexing and avoids a format conversion in every estimator call.
combined_features = hstack([tfidf_matrix, numeric_features_scaled], format='csr')

# Clustering with KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(combined_features)
data['kmeans_cluster'] = kmeans.labels_

# Clustering with DBSCAN (label -1 marks points DBSCAN treats as noise)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(combined_features)
data['dbscan_cluster'] = dbscan_labels

# Clustering with Spectral Clustering on a precomputed affinity matrix
similarity_matrix = cosine_similarity(combined_features)
# The standardized numeric columns can be negative, so the cosine similarity lies
# in [-1, 1]. Spectral clustering needs non-negative affinities, so map the values
# linearly onto [0, 1]; clip() guards against floating-point overshoot.
affinity_matrix = ((similarity_matrix + 1.0) / 2.0).clip(0.0, 1.0)
# random_state makes the label assignment reproducible, matching the KMeans seed.
spectral_clustering = SpectralClustering(n_clusters=5, affinity='precomputed', random_state=42)
spectral_labels = spectral_clustering.fit_predict(affinity_matrix)
data['spectral_cluster'] = spectral_labels

# Function to recommend courses based on clustering
def recommend_courses(course_index, cluster_column='kmeans_cluster', num_recommendations=5, courses=None):
    """Recommend courses that share a cluster with the course at ``course_index``.

    Parameters
    ----------
    course_index : int
        Positional (0-based) row of the query course.
    cluster_column : str, default 'kmeans_cluster'
        Name of the column holding the cluster labels to use.
    num_recommendations : int, default 5
        Maximum number of rows to return.
    courses : pandas.DataFrame, optional
        Frame to recommend from. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``institution`` and ``rating`` of the first
        ``num_recommendations`` other rows in the same cluster, in frame order.
        The query course itself is never returned.
    """
    frame = data if courses is None else courses
    cluster_labels = frame[cluster_column]
    cluster_id = cluster_labels.iloc[course_index]

    # The query course always belongs to its own cluster, so mask it out by
    # position; otherwise it would be recommended to itself.
    in_cluster = (cluster_labels == cluster_id).to_numpy(copy=True)
    in_cluster[course_index] = False

    return frame.loc[in_cluster, ['name', 'institution', 'rating']].head(num_recommendations)

# Example: KMeans-cluster recommendations for the course at position 0
recommended_courses_kmeans = recommend_courses(0, cluster_column='kmeans_cluster')
print("KMeans Recommendations:", recommended_courses_kmeans, sep="\n")

# Visualize each clustering as a rating vs. review_count scatter plot,
# colored by cluster label: one figure per algorithm, in the order
# KMeans, DBSCAN, Spectral Clustering.
for cluster_column, plot_title in (
    ('kmeans_cluster', 'KMeans Clustering'),
    ('dbscan_cluster', 'DBSCAN Clustering'),
    ('spectral_cluster', 'Spectral Clustering'),
):
    plt.scatter(data['rating'], data['review_count'], c=data[cluster_column], cmap='viridis')
    plt.xlabel('Rating')
    plt.ylabel('Review Count')
    plt.title(plot_title)
    plt.show()

# Silhouette score of each clustering, computed on the combined feature matrix
silhouette_kmeans = silhouette_score(combined_features, kmeans.labels_)

# DBSCAN may produce a single label; the score is undefined then, so -1 is reported.
if len(set(dbscan_labels)) > 1:
    silhouette_dbscan = silhouette_score(combined_features, dbscan_labels)
else:
    silhouette_dbscan = -1

silhouette_spectral = silhouette_score(combined_features, spectral_labels)

print(f"Silhouette Score for KMeans: {silhouette_kmeans}")
print(f"Silhouette Score for DBSCAN: {silhouette_dbscan}")
print(f"Silhouette Score for Spectral Clustering: {silhouette_spectral}")


In [None]:
# NOTE(review): this repeats the feature-building and clustering steps of the
# previous cell with the same parameters; consider keeping only one copy.

# Text features: TF-IDF of the cleaned review text, limited to 1000 terms
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['cleaned_reviews'])

# Numeric features (rating, review_count, sentiment_score, avg_rating), standardized.
# NOTE(review): only NaNs in 'cleaned_reviews' are dropped earlier; NaNs in these
# columns would presumably make the clustering below fail -- confirm the data is clean.
numeric_features = data[['rating', 'review_count', 'sentiment_score', 'avg_rating']]
scaler = StandardScaler()
numeric_features_scaled = scaler.fit_transform(numeric_features)

# Combine TF-IDF and numeric features into a single sparse feature matrix.
# CSR is requested explicitly (hstack's default result is COO) because CSR
# supports row indexing and avoids a format conversion in every estimator call.
combined_features = hstack([tfidf_matrix, numeric_features_scaled], format='csr')

# Clustering with KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(combined_features)
data['kmeans_cluster'] = kmeans.labels_

# Clustering with DBSCAN (label -1 marks points DBSCAN treats as noise)
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(combined_features)
data['dbscan_cluster'] = dbscan_labels

# Clustering with Spectral Clustering on a precomputed affinity matrix
similarity_matrix = cosine_similarity(combined_features)
# The standardized numeric columns can be negative, so the cosine similarity lies
# in [-1, 1]. Spectral clustering needs non-negative affinities, so map the values
# linearly onto [0, 1]; clip() guards against floating-point overshoot.
affinity_matrix = ((similarity_matrix + 1.0) / 2.0).clip(0.0, 1.0)
# random_state makes the label assignment reproducible, matching the KMeans seed.
spectral_clustering = SpectralClustering(n_clusters=5, affinity='precomputed', random_state=42)
spectral_labels = spectral_clustering.fit_predict(affinity_matrix)
data['spectral_cluster'] = spectral_labels

# Content-Based Filtering
def content_based_recommendations(course_index, top_n=5):
    """Recommend the courses whose review text is most similar to a given course.

    Similarity is the cosine similarity between the TF-IDF row of the query
    course and every row of the notebook-level ``tfidf_matrix``.

    Parameters
    ----------
    course_index : int
        Positional (0-based) row of the query course; negative values count
        from the end.
    top_n : int, default 5
        Maximum number of recommendations.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``institution`` and ``rating`` of up to ``top_n``
        rows of ``data``, most similar first. The query course is never
        included.
    """
    # TF-IDF row of the query course, compared against every course
    course_vector = tfidf_matrix[course_index]
    similarity_scores = cosine_similarity(course_vector, tfidf_matrix).flatten()

    n_courses = similarity_scores.shape[0]
    self_position = course_index if course_index >= 0 else course_index + n_courses

    # Rank all courses by descending similarity, then drop the query course by
    # position. Slicing off "the last argsort entry" is not reliable: with tied
    # scores (duplicate text) or an all-zero TF-IDF row the query course is not
    # guaranteed to rank first.
    ranked = similarity_scores.argsort(kind='stable')[::-1]
    similar_indices = ranked[ranked != self_position][:top_n]

    return data.iloc[similar_indices][['name', 'institution', 'rating']]

# Example: content-based (TF-IDF similarity) recommendations for the course at position 0
recommended_courses_content = content_based_recommendations(0)
print("Content-Based Recommendations:", recommended_courses_content, sep="\n")

# Function to recommend courses based on clustering
# NOTE(review): recommend_courses is also defined in an earlier cell; this
# definition shadows it. Consider keeping a single definition.
def recommend_courses(course_index, cluster_column='kmeans_cluster', num_recommendations=5, courses=None):
    """Recommend courses that share a cluster with the course at ``course_index``.

    Parameters
    ----------
    course_index : int
        Positional (0-based) row of the query course.
    cluster_column : str, default 'kmeans_cluster'
        Name of the column holding the cluster labels to use.
    num_recommendations : int, default 5
        Maximum number of rows to return.
    courses : pandas.DataFrame, optional
        Frame to recommend from. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``institution`` and ``rating`` of the first
        ``num_recommendations`` other rows in the same cluster, in frame order.
        The query course itself is never returned.
    """
    frame = data if courses is None else courses
    cluster_labels = frame[cluster_column]
    cluster_id = cluster_labels.iloc[course_index]

    # The query course always belongs to its own cluster, so mask it out by
    # position; otherwise it would be recommended to itself.
    in_cluster = (cluster_labels == cluster_id).to_numpy(copy=True)
    in_cluster[course_index] = False

    return frame.loc[in_cluster, ['name', 'institution', 'rating']].head(num_recommendations)

# Example: KMeans-cluster recommendations for the course at position 0
recommended_courses_kmeans = recommend_courses(0, cluster_column='kmeans_cluster')
print("KMeans Recommendations:", recommended_courses_kmeans, sep="\n")

# Visualize each clustering as a rating vs. review_count scatter plot,
# colored by cluster label: one figure per algorithm, in the order
# KMeans, DBSCAN, Spectral Clustering.
for cluster_column, plot_title in (
    ('kmeans_cluster', 'KMeans Clustering'),
    ('dbscan_cluster', 'DBSCAN Clustering'),
    ('spectral_cluster', 'Spectral Clustering'),
):
    plt.scatter(data['rating'], data['review_count'], c=data[cluster_column], cmap='viridis')
    plt.xlabel('Rating')
    plt.ylabel('Review Count')
    plt.title(plot_title)
    plt.show()

# Silhouette score of each clustering, computed on the combined feature matrix
silhouette_kmeans = silhouette_score(combined_features, kmeans.labels_)

# DBSCAN may produce a single label; the score is undefined then, so -1 is reported.
if len(set(dbscan_labels)) > 1:
    silhouette_dbscan = silhouette_score(combined_features, dbscan_labels)
else:
    silhouette_dbscan = -1

silhouette_spectral = silhouette_score(combined_features, spectral_labels)

print(f"Silhouette Score for KMeans: {silhouette_kmeans}")
print(f"Silhouette Score for DBSCAN: {silhouette_dbscan}")
print(f"Silhouette Score for Spectral Clustering: {silhouette_spectral}")