In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.datasets import make_classification
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
df = pd.read_csv('Data/combined_data_preprocessed.csv')

In [None]:
# Feature Engineering: Calculate Average Rating, Review Count per Course
course_stats = df.groupby('course_id').agg(
    avg_rating=('rating', 'mean'),
    review_count=('rating', 'count')
).reset_index()

# Merge back into original dataframe
df = df.merge(course_stats, on='course_id', how='left')

# 5. Normalize Rating and Review Count
scaler = StandardScaler()
df[['normalized_rating', 'normalized_review_count']] = scaler.fit_transform(
    df[['avg_rating', 'review_count']]
)

In [None]:
df.head()

In [None]:
combined_data = df

# Replace NaN values in the 'cleaned_reviews' column with an empty string
combined_data['cleaned_reviews'] = combined_data['cleaned_reviews'].fillna('')
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Adjust max_features based on your data size
X = vectorizer.fit_transform(combined_data['cleaned_reviews'])

In [None]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(combined_data['cleaned_reviews'])
X = tokenizer.texts_to_sequences(combined_data['cleaned_reviews'])
X = pad_sequences(X, maxlen=200)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, combined_data['sentiment'], test_size=0.2, random_state=42)

# Building the LSTM model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

In [None]:
X_new = tokenizer.texts_to_sequences(combined_data['cleaned_reviews'])
X_new = pad_sequences(X_new, maxlen=200)
combined_data['predicted_sentiment'] = (model.predict(X_new) > 0.5).astype(int)

In [21]:
# Using Pre-trained Sentiment Models (Unsupervised Approach)

data = df

# Initialize VADER Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

# Apply sentiment analysis to your reviews
data['sentiment_score'] = data['cleaned_reviews'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Classify sentiment as positive (1), neutral (0), or negative (-1)
data['predicted_sentiment'] = data['sentiment_score'].apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))

In [None]:
data.head() 

Unnamed: 0,name,institution,course_url,course_id,reviews,reviewers,date_reviews,rating,cleaned_reviews,avg_rating,review_count,normalized_rating,normalized_review_count,sentiment_score,predicted_sentiment
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This is an extremely basic course. Machine lea...,By Deleted A,2017-03-18,1,extremely basic course machine learning built ...,4.750522,35895,0.277835,2.014232,-0.0262,-1
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,The course is ok but the certification procedu...,By Bruno C,2015-11-09,1,course ok certification procedure messno state...,4.750522,35895,0.277835,2.014232,0.3612,1
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"I just started week 3 , I have to admit that I...",By Fadi,2019-04-15,1,started week admit good course explaining idea...,4.750522,35895,0.277835,2.014232,0.836,1
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This course is absolute garbage. You get no f...,By Mathew L,2015-09-25,1,course absolute garbage get feedback quiz assi...,4.750522,35895,0.277835,2.014232,-0.4936,-1
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"However good the material and lectures may be,...",By Rui C,2015-12-12,1,however good material lecture may use outdated...,4.750522,35895,0.277835,2.014232,0.5859,1


In [22]:
from sklearn.cluster import KMeans

combined_data = pd.read_csv('Data/combined_data_preprocessed.csv')

# 1. Vectorize the reviews using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(combined_data['reviews'])

# 2. Apply KMeans clustering
n_clusters = 3  # We'll start with 3 clusters (positive, neutral, negative)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
combined_data['cluster'] = kmeans.fit_predict(X)

# 3. Inspect the reviews in each cluster to assign sentiment labels
# You can manually look at some of the reviews in each cluster and assign labels
# Let's view a sample of reviews for each cluster

for cluster in range(n_clusters):
    print(f"\nCluster {cluster} reviews:")
    print(combined_data[combined_data['cluster'] == cluster]['reviews'].head(5))  # Show top 5 reviews in the cluster

# 4. Manually label the clusters as 'positive', 'negative', or 'neutral'
# After inspecting, you can assign a sentiment label to each cluster.
# For example:
cluster_sentiment_mapping = {
    0: 'negative',  # Assign label based on your manual inspection
    1: 'neutral',
    2: 'positive'
}

# Assign sentiment labels to each review based on its cluster
combined_data['predicted_sentiment'] = combined_data['cluster'].map(cluster_sentiment_mapping)

# Now let's inspect the final results
print(combined_data[['reviews', 'predicted_sentiment']].head())

# Optionally, save the resulting data to a new CSV file
combined_data.to_csv('sentiment_analysis_results.csv', index=False)




Cluster 0 reviews:
317                   good so far
330                   good so far
343                   good so far
900    It's very good for a start
903       Good course though.... 
Name: reviews, dtype: object

Cluster 1 reviews:
0    This is an extremely basic course. Machine lea...
1    The course is ok but the certification procedu...
2    I just started week 3 , I have to admit that I...
3    This course is absolute garbage.  You get no f...
4    However good the material and lectures may be,...
Name: reviews, dtype: object

Cluster 2 reviews:
77     Python should have been great language for thi...
102    Python should have been great language for thi...
127    Python should have been great language for thi...
971                                                Great
979                                                Great
Name: reviews, dtype: object
                                             reviews predicted_sentiment
0  This is an extremely basic course. Machine lea.

In [23]:
from sklearn.cluster import SpectralClustering

# 1. Vectorize the reviews using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(combined_data['reviews'])

# 2. Apply Spectral Clustering
# n_clusters specifies the number of clusters we want to form
spectral = SpectralClustering(n_clusters=3, affinity='cosine', random_state=42)
combined_data['cluster'] = spectral.fit_predict(X)

# 3. Inspect the reviews in each cluster to assign sentiment labels
for cluster in np.unique(combined_data['cluster']):
    print(f"\nCluster {cluster} reviews:")
    print(combined_data[combined_data['cluster'] == cluster]['reviews'].head(5))  # Show top 5 reviews in the cluster

# 4. Manually label the clusters based on their content
# Assign sentiment labels like this after inspecting the clusters
cluster_sentiment_mapping = {
    0: 'negative',  # Assign label based on inspection
    1: 'positive',
    2: 'neutral'
}

# Assign sentiment labels based on the cluster
combined_data['predicted_sentiment'] = combined_data['cluster'].map(cluster_sentiment_mapping)

# Inspect the final results
print(combined_data[['reviews', 'predicted_sentiment']].head())

KeyboardInterrupt: 