In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.datasets import make_classification
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import kagglehub
import zipfile
import os
from transformers import pipeline
import torch

In [14]:
combined_data = pd.read_csv("Data/combined_data_preprocessed.csv")
combined_data.head()

Unnamed: 0,name,institution,course_url,course_id,reviews,reviewers,date_reviews,rating,cleaned_reviews
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This is an extremely basic course. Machine lea...,By Deleted A,2017-03-18,1,extremely basic course machine learning built ...
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,The course is ok but the certification procedu...,By Bruno C,2015-11-09,1,course ok certification procedure messno state...
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"I just started week 3 , I have to admit that I...",By Fadi,2019-04-15,1,started week admit good course explaining idea...
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This course is absolute garbage. You get no f...,By Mathew L,2015-09-25,1,course absolute garbage get feedback quiz assi...
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"However good the material and lectures may be,...",By Rui C,2015-12-12,1,however good material lecture may use outdated...


In [15]:
# Feature Engineering: Calculate Average Rating, Review Count per Course
course_stats = combined_data.groupby('course_id').agg(
    avg_rating=('rating', 'mean'),
    review_count=('rating', 'count')
).reset_index()

# Merge back into original dataframe
combined_data = combined_data.merge(course_stats, on='course_id', how='left')

# 5. Normalize Rating and Review Count
scaler = StandardScaler()
combined_data[['normalized_rating', 'normalized_review_count']] = scaler.fit_transform(
    combined_data[['avg_rating', 'review_count']]
)

In [16]:
combined_data.head()

Unnamed: 0,name,institution,course_url,course_id,reviews,reviewers,date_reviews,rating,cleaned_reviews,avg_rating,review_count,normalized_rating,normalized_review_count
0,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This is an extremely basic course. Machine lea...,By Deleted A,2017-03-18,1,extremely basic course machine learning built ...,4.750522,35895,0.277835,2.014232
1,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,The course is ok but the certification procedu...,By Bruno C,2015-11-09,1,course ok certification procedure messno state...,4.750522,35895,0.277835,2.014232
2,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"I just started week 3 , I have to admit that I...",By Fadi,2019-04-15,1,started week admit good course explaining idea...,4.750522,35895,0.277835,2.014232
3,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,This course is absolute garbage. You get no f...,By Mathew L,2015-09-25,1,course absolute garbage get feedback quiz assi...,4.750522,35895,0.277835,2.014232
4,Machine Learning,Stanford University,https://www.coursera.org/learn/machine-learning,machine-learning,"However good the material and lectures may be,...",By Rui C,2015-12-12,1,however good material lecture may use outdated...,4.750522,35895,0.277835,2.014232


In [20]:
# Check for MPS (Metal Performance Shaders) on M1 Pro
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

Using device: mps


In [21]:
# Load the pre-trained sentiment analysis model
classifier = pipeline(
    "text-classification", 
    model="cardiffnlp/twitter-roberta-base-sentiment", 
    framework="pt",  # Force PyTorch
    device=0 if torch.backends.mps.is_available() else -1
)


# Map the model's labels to sentiment categories
label_mapping = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive'
}

# Function to classify sentiment for a batch of reviews
def classify_batch(batch):
    reviews = batch['reviews'].tolist()
    results = classifier(reviews)
    return [(label_mapping[result['label']], result['score']) for result in results]

# Process the dataset in batches for efficiency
def process_in_batches(df, batch_size=64):
    sentiment_results = []
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        sentiments = classify_batch(batch)
        sentiment_results.extend(sentiments)
        print(f"Processed {min(start + batch_size, len(df))}/{len(df)} rows")
    return sentiment_results

# Apply sentiment analysis to the entire dataset
batch_size = 32  # Lower batch size to avoid memory bottlenecks on M1
sentiments_with_scores = process_in_batches(combined_data, batch_size)

# Add sentiment and score columns
combined_data['sentiment'] = [sentiment for sentiment, score in sentiments_with_scores]
combined_data['sentiment_score'] = [score for sentiment, score in sentiments_with_scores]

# Save the updated dataset
output_file = "combined_data_with_sentiment.csv"
combined_data.to_csv(output_file, index=False)
print(f"Sentiment analysis completed. Results saved to {output_file}")

# Preview the updated DataFrame
print(combined_data[['reviews', 'sentiment', 'sentiment_score']].head())

AssertionError: Torch not compiled with CUDA enabled

In [28]:
import torch
print("MPS Available:", torch.backends.mps.is_available())  # Should return True
print("MPS Built:", torch.backends.mps.is_built())  # Should return True

MPS Available: True
MPS Built: True


In [30]:
classifier = pipeline(
    "text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    framework="pt",
    device=device
)

In [34]:
# Map the model's labels to sentiment categories
label_mapping = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive'
}

# Function to classify sentiment for a batch of reviews
def classify_batch(batch):
    reviews = batch['reviews'].tolist()
    results = classifier(reviews, truncation=True)  # Ensure truncation to avoid token limit issues
    return [(label_mapping[result['label']], result['score']) for result in results]

# Process the dataset in batches for efficiency
def process_in_batches(df, batch_size=64):
    sentiment_results = []
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        sentiments = classify_batch(batch)
        sentiment_results.extend(sentiments)
        print(f"Processed {min(start + batch_size, len(df))}/{len(df)} rows")
    return sentiment_results

# Apply sentiment analysis to the entire dataset
batch_size = 32  # Lower batch size to avoid memory bottlenecks on M1
sentiments_with_scores = process_in_batches(combined_data, batch_size)

# Add sentiment and score columns
combined_data['sentiment'] = [sentiment for sentiment, score in sentiments_with_scores]
combined_data['sentiment_score'] = [score for sentiment, score in sentiments_with_scores]

# Save the updated dataset
output_file = "combined_data_with_sentiment.csv"
combined_data.to_csv(output_file, index=False)
print(f"Sentiment analysis completed. Results saved to {output_file}")

# Preview the updated DataFrame
print(combined_data[['reviews', 'sentiment', 'sentiment_score']].head())

RuntimeError: The expanded size of the tensor (555) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 555].  Tensor sizes: [1, 514]

In [35]:
from transformers import AutoTokenizer, pipeline

# Initialize the tokenizer for the sentiment model
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment")

# Map the model's labels to sentiment categories
label_mapping = {
    'LABEL_0': 'negative',
    'LABEL_1': 'neutral',
    'LABEL_2': 'positive'
}

# Maximum token length for the model
max_token_length = tokenizer.model_max_length  # 514 tokens including special tokens

# Function to truncate and classify a batch of reviews
def classify_batch(batch):
    truncated_reviews = [
        tokenizer.decode(
            tokenizer.encode(
                review, max_length=max_token_length, truncation=True
            ),
            skip_special_tokens=True
        )
        for review in batch['reviews'].tolist()
    ]
    results = classifier(truncated_reviews)
    return [(label_mapping[result['label']], result['score']) for result in results]

# Process the dataset in batches for efficiency
def process_in_batches(df, batch_size=64):
    sentiment_results = []
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start + batch_size]
        sentiments = classify_batch(batch)
        sentiment_results.extend(sentiments)
        print(f"Processed {min(start + batch_size, len(df))}/{len(df)} rows")
    return sentiment_results

# Apply sentiment analysis to the entire dataset
batch_size = 32  # Lower batch size to avoid memory bottlenecks on M1
sentiments_with_scores = process_in_batches(combined_data, batch_size)

# Add sentiment and score columns
combined_data['sentiment'] = [sentiment for sentiment, score in sentiments_with_scores]
combined_data['sentiment_score'] = [score for sentiment, score in sentiments_with_scores]

# Save the updated dataset
output_file = "combined_data_with_sentiment.csv"
combined_data.to_csv(output_file, index=False)
print(f"Sentiment analysis completed. Results saved to {output_file}")

# Preview the updated DataFrame
print(combined_data[['reviews', 'sentiment', 'sentiment_score']].head())

OverflowError: int too big to convert