In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.datasets import make_classification
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import kagglehub
import zipfile
import os
from transformers import pipeline
import torch

In [2]:
# Location of the preprocessed review dataset, relative to the notebook's working directory.
PREPROCESSED_CSV = "Data/combined_data_preprocessed.csv"

# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index that was
# written out by an earlier to_csv call -- confirm, and consider index_col=0 if it is unused.
combined_data = pd.read_csv(PREPROCESSED_CSV)

# Preview the first five rows as the cell's rich output.
combined_data.head(n=5)

Unnamed: 0,name,institution,course_url,course_id,reviews,reviewers,date_reviews,rating,cleaned_reviews
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This is an extremely basic course. Machine lea...,By Deleted A,2017-03-18,1,extremely basic course machine learning built ...
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,The course is ok but the certification procedu...,By Bruno C,2015-11-09,1,course ok certification procedure messno state...
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"I just started week 3 , I have to admit that I...",By Fadi,2019-04-15,1,started week admit good course explaining idea...
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This course is absolute garbage. You get no f...,By Mathew L,2015-09-25,1,course absolute garbage get feedback quiz assi...
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"However good the material and lectures may be,...",By Rui C,2015-12-12,1,however good material lecture may use outdated...


In [3]:
# Feature engineering: per-course average rating and review count.
course_stats = (
    combined_data
    .groupby('course_id')
    .agg(
        avg_rating=('rating', 'mean'),
        review_count=('rating', 'count'),
    )
    .reset_index()
)

# Columns this cell adds to combined_data. They are dropped first so the cell can be
# re-run safely: without this, a second run would merge avg_rating/review_count into a
# frame that already has them, pandas would suffix the duplicates (_x/_y), and the
# column lookups below would raise KeyError.
derived_columns = ['avg_rating', 'review_count', 'normalized_rating', 'normalized_review_count']

# Attach the per-course statistics to every review row of that course.
combined_data = (
    combined_data
    .drop(columns=derived_columns, errors='ignore')
    .merge(course_stats, on='course_id', how='left')
)

# Standardize the two course-level statistics.
# NOTE(review): the scaler is fitted on the review-level frame (one row per review), so
# courses with many reviews contribute more rows to the fitted mean/variance -- confirm
# this weighting is intended rather than fitting on course_stats.
scaler = StandardScaler()
combined_data[['normalized_rating', 'normalized_review_count']] = scaler.fit_transform(
    combined_data[['avg_rating', 'review_count']]
)

In [4]:
# Preview the first five rows, now including the per-course statistics and their scaled versions.
combined_data.head(n=5)

Unnamed: 0,name,institution,course_url,course_id,reviews,reviewers,date_reviews,rating,cleaned_reviews,avg_rating,review_count,normalized_rating,normalized_review_count
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This is an extremely basic course. Machine lea...,By Deleted A,2017-03-18,1,extremely basic course machine learning built ...,4.750522,35895,0.277835,2.014232
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,The course is ok but the certification procedu...,By Bruno C,2015-11-09,1,course ok certification procedure messno state...,4.750522,35895,0.277835,2.014232
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"I just started week 3 , I have to admit that I...",By Fadi,2019-04-15,1,started week admit good course explaining idea...,4.750522,35895,0.277835,2.014232
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This course is absolute garbage. You get no f...,By Mathew L,2015-09-25,1,course absolute garbage get feedback quiz assi...,4.750522,35895,0.277835,2.014232
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"However good the material and lectures may be,...",By Rui C,2015-12-12,1,however good material lecture may use outdated...,4.750522,35895,0.277835,2.014232


In [None]:
# Combine course titles and cleaned review text into one document per review.
# Missing values are filled BEFORE concatenating: NaN propagates through string
# concatenation, so filling afterwards would blank the whole document and throw away
# the course name whenever the review text is missing (and vice versa).
combined_data['combined_text'] = (
    combined_data['name'].fillna("")
    + " "
    + combined_data['cleaned_reviews'].fillna("")
).str.strip()  # strip the separator left over when one (or both) sides were empty

In [None]:
# --- 1. Sampling ---------------------------------------------------------------
# Work on a seeded 1% random sample of the rows to keep the run cheap.
sampled_df = combined_data.sample(random_state=42, frac=0.01)

# --- 2-3. Text features ----------------------------------------------------------
# Unigram TF-IDF with English stop words removed, vocabulary capped at 500 terms;
# fitting on the sampled documents yields a sparse document-term matrix.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    max_features=500,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(sampled_df['combined_text'])

# --- 4. Dimensionality reduction -------------------------------------------------
# Two components only, so the result can be drawn directly on a 2-D scatter plot.
n_components = 2
svd = TruncatedSVD(random_state=42, n_components=n_components)
reduced_features = svd.fit_transform(tfidf_matrix)

# --- 5-6. Clustering -------------------------------------------------------------
# KMeans runs on the SVD-reduced features (not on the raw TF-IDF matrix); the
# resulting label of each sampled row is stored alongside it.
optimal_clusters = 5  # Based on previous analysis or assumptions
kmeans = KMeans(random_state=42, n_clusters=optimal_clusters)
sampled_df['cluster'] = kmeans.fit_predict(reduced_features)

# --- 7. Cluster sizes ------------------------------------------------------------
print("Cluster Distribution:", sampled_df['cluster'].value_counts(), sep="\n")

# 8. Print top terms for each cluster based on the cluster centers.
# KMeans was fitted on the SVD-reduced features, so kmeans.cluster_centers_ has one
# column per SVD component (n_components), not one per vocabulary term. Ranking those
# columns directly would only ever yield component indices 0..n_components-1 and print
# the first few vocabulary terms for every cluster. Project the centroids back into
# TF-IDF term space first, then rank terms by their weight in each centroid.
original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

print("\nTop terms in each cluster:")
for i in range(optimal_clusters):
    print(f"\nCluster {i}:")
    for ind in order_centroids[i, :10]:  # Ten highest-weighted terms of this centroid
        print(f"  {terms[ind]}")

# 9. One colour per cluster, evenly spaced along the plasma colormap.
colors = plt.cm.plasma(np.linspace(0, 1, optimal_clusters))

# 10. Scatter the 2-D SVD coordinates, one series per cluster label.
fig, ax = plt.subplots(figsize=(10, 8))
for i, cluster_color in enumerate(colors):
    # Boolean mask selecting the sampled rows assigned to cluster i; reduced_features
    # rows are in the same order as sampled_df, so the mask lines up positionally.
    in_cluster = (sampled_df['cluster'] == i).to_numpy()
    cluster_points = reduced_features[in_cluster]
    ax.scatter(
        cluster_points[:, 0],
        cluster_points[:, 1],
        color=cluster_color,
        label=f'Cluster {i}',
    )

ax.set_title('K-Means Clusters of Courses (with Truncated SVD)')
ax.set_xlabel('SVD Component 1')
ax.set_ylabel('SVD Component 2')
ax.legend(title="Clusters")
ax.grid(True)
plt.show()