In [7]:
import pandas as pd

# New cell at index 0

path = r"../data/books_1250_above_reviews.csv"
df = pd.read_csv(path)

# Try common column names first, then try to auto-detect a 1-5 numeric column.
common = {"rating", "review_rating", "stars", "review_stars", "score"}
col = next((c for c in df.columns if c.lower() in common), None)

if col is None:
    for c in df.columns:
        s = pd.to_numeric(df[c], errors="coerce").dropna()
        if not s.empty and s.isin([1, 2, 3, 4, 5]).all():
            col = c
            break

if col is None:
    raise ValueError(f"Couldn't find a 1-5 rating column. Available columns: {list(df.columns)}")

counts = pd.to_numeric(df[col], errors="coerce").dropna().astype(int).value_counts().reindex(range(1, 6), fill_value=0).sort_index()
print(f"Rating counts for column '{col}':\n{counts}")

Rating counts for column 'rating':
rating
1     493
2     908
3    2478
4    4663
5    6073
Name: count, dtype: int64


In [8]:
# Remap labels to 3-class sentiment: Negative (1,2) -> 0, Mixed (3) -> 1, Positive (4,5) -> 2
print("=" * 120)
print("REMAPPING LABELS TO 3-CLASS SENTIMENT")
print("=" * 120)

def remap_labels(y):
    """
    Remap 5-class ratings (1-5) to 3-class sentiment:
    1, 2 -> 0 (Negative)
    3 -> 1 (Mixed)
    4, 5 -> 2 (Positive)
    """
    return np.array([0 if rating in [1, 2] else (1 if rating == 3 else 2) for rating in y])

# Add remapped labels to dataframe
df['sentiment'] = df['rating'].apply(lambda x: 0 if x in [1, 2] else (1 if x == 3 else 2))

# Show mapping
print("\nLabel Remapping:")
print("  1, 2 stars -> 0 (Negative)")
print("  3 stars    -> 1 (Mixed)")
print("  4, 5 stars -> 2 (Positive)")

print("\nOverall Sentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts().sort_index()
print(f"  Negative (0): {sentiment_counts.get(0, 0):,} reviews")
print(f"  Mixed (1):    {sentiment_counts.get(1, 0):,} reviews")
print(f"  Positive (2): {sentiment_counts.get(2, 0):,} reviews")

print("=" * 120)

REMAPPING LABELS TO 3-CLASS SENTIMENT

Label Remapping:
  1, 2 stars -> 0 (Negative)
  3 stars    -> 1 (Mixed)
  4, 5 stars -> 2 (Positive)

Overall Sentiment Distribution:
  Negative (0): 1,401 reviews
  Mixed (1):    2,478 reviews
  Positive (2): 11,095 reviews


In [9]:
# Required imports for 3-class sentiment classification
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

In [10]:
# 3-Class Sentiment Classification (All Books Combined - 80/20 Train/Test)
print("=" * 120)
print("3-CLASS SENTIMENT CLASSIFICATION - COMBINED ALL BOOKS (80/20 Train/Test Split)")
print("=" * 120)

# Prepare data with sentiment labels (Negative=0, Mixed=1, Positive=2)
X_sentiment = df['review_text'].values
y_sentiment = df['sentiment'].astype(int).values

print(f"\n1. Data Summary:")
print(f"   Total reviews: {len(X_sentiment):,}")
print(f"   Sentiment distribution:")
print(f"     Negative (1-2★): {(y_sentiment == 0).sum():,} reviews")
print(f"     Mixed (3★):      {(y_sentiment == 1).sum():,} reviews")
print(f"     Positive (4-5★): {(y_sentiment == 2).sum():,} reviews")

# Split into 80/20 train/test with stratification
print(f"\n2. Splitting data (80/20 with stratification)...")
X_train_sent, X_test_sent, y_train_sent, y_test_sent = train_test_split(
    X_sentiment, y_sentiment, test_size=0.2, random_state=42, stratify=y_sentiment
)

print(f"   Training set: {len(X_train_sent):,} reviews")
print(f"   Test set: {len(X_test_sent):,} reviews")
print(f"   Training distribution:")
print(f"     Negative: {(y_train_sent == 0).sum():,} | Mixed: {(y_train_sent == 1).sum():,} | Positive: {(y_train_sent == 2).sum():,}")
print(f"   Test distribution:")
print(f"     Negative: {(y_test_sent == 0).sum():,} | Mixed: {(y_test_sent == 1).sum():,} | Positive: {(y_test_sent == 2).sum():,}")

# Load SBERT model
print(f"\n3. Loading Sentence-BERT model...")
model_sent = SentenceTransformer('all-MiniLM-L6-v2')
print(f"   ✓ Model loaded successfully (384-dimensional embeddings)")

# Generate embeddings for training set
print(f"\n4. Generating embeddings for training set...")
print(f"   Processing {len(X_train_sent):,} reviews...")
embeddings_train_sent = model_sent.encode(X_train_sent, show_progress_bar=True, batch_size=32)
print(f"   ✓ Shape: {embeddings_train_sent.shape}")

# Generate embeddings for test set
print(f"\n5. Generating embeddings for test set...")
print(f"   Processing {len(X_test_sent):,} reviews...")
embeddings_test_sent = model_sent.encode(X_test_sent, show_progress_bar=True, batch_size=32)
print(f"   ✓ Shape: {embeddings_test_sent.shape}")

# Train Random Forest classifier
print(f"\n6. Training Random Forest classifier on SBERT embeddings...")
classifier_sent = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
classifier_sent.fit(embeddings_train_sent, y_train_sent)
print(f"   ✓ Model trained successfully")

# Make predictions
print(f"\n7. Making predictions on test set...")
y_pred_sent = classifier_sent.predict(embeddings_test_sent)
accuracy_sent = accuracy_score(y_test_sent, y_pred_sent)
print(f"   ✓ Predictions completed")

# Display results
print(f"\n" + "=" * 120)
print(f"RESULTS - 3-CLASS SENTIMENT CLASSIFICATION (COMBINED)")
print(f"=" * 120)
print(f"\nAccuracy: {accuracy_sent:.4f} ({accuracy_sent*100:.2f}%)\n")

print(f"Classification Report:")
print(classification_report(y_test_sent, y_pred_sent, labels=[0, 1, 2], 
                          target_names=['Negative', 'Mixed', 'Positive'], digits=4))

# Confusion Matrix
cm_sent = confusion_matrix(y_test_sent, y_pred_sent, labels=[0, 1, 2])
print(f"Confusion Matrix:")
print(f"                Predicted Negative  Predicted Mixed  Predicted Positive")
print(f"Actual Negative     {cm_sent[0, 0]:>6}              {cm_sent[0, 1]:>6}             {cm_sent[0, 2]:>6}")
print(f"Actual Mixed        {cm_sent[1, 0]:>6}              {cm_sent[1, 1]:>6}             {cm_sent[1, 2]:>6}")
print(f"Actual Positive     {cm_sent[2, 0]:>6}              {cm_sent[2, 1]:>6}             {cm_sent[2, 2]:>6}")

print("\n" + "=" * 120)

3-CLASS SENTIMENT CLASSIFICATION - COMBINED ALL BOOKS (80/20 Train/Test Split)

1. Data Summary:
   Total reviews: 14,974
   Sentiment distribution:
     Negative (1-2★): 1,401 reviews
     Mixed (3★):      2,478 reviews
     Positive (4-5★): 11,095 reviews

2. Splitting data (80/20 with stratification)...
   Training set: 11,979 reviews
   Test set: 2,995 reviews
   Training distribution:
     Negative: 1,121 | Mixed: 1,982 | Positive: 8,876
   Test distribution:
     Negative: 280 | Mixed: 496 | Positive: 2,219

3. Loading Sentence-BERT model...
   ✓ Model loaded successfully (384-dimensional embeddings)

4. Generating embeddings for training set...
   Processing 11,979 reviews...


Batches: 100%|██████████| 375/375 [01:39<00:00,  3.76it/s]


   ✓ Shape: (11979, 384)

5. Generating embeddings for test set...
   Processing 2,995 reviews...


Batches: 100%|██████████| 94/94 [00:25<00:00,  3.75it/s]


   ✓ Shape: (2995, 384)

6. Training Random Forest classifier on SBERT embeddings...
   ✓ Model trained successfully

7. Making predictions on test set...
   ✓ Predictions completed

RESULTS - 3-CLASS SENTIMENT CLASSIFICATION (COMBINED)

Accuracy: 0.7476 (74.76%)

Classification Report:
              precision    recall  f1-score   support

    Negative     0.5000    0.0286    0.0541       280
       Mixed     0.5556    0.0504    0.0924       496
    Positive     0.7519    0.9941    0.8562      2219

    accuracy                         0.7476      2995
   macro avg     0.6025    0.3577    0.3342      2995
weighted avg     0.6958    0.7476    0.6547      2995

Confusion Matrix:
                Predicted Negative  Predicted Mixed  Predicted Positive
Actual Negative          8                  11                261
Actual Mixed             4                  25                467
Actual Positive          4                   9               2206



In [None]:
# 3-Class Sentiment Classification by Individual Book (80/20 Train/Test)
print("=" * 120)
print("3-CLASS SENTIMENT CLASSIFICATION - BY INDIVIDUAL BOOK (80/20 Train/Test)")
print("=" * 120)

book_results_3class = {}

for idx, (book_id, book_df) in enumerate(df.groupby('book_id'), 1):
    print(f"\n[Book {idx}/8] Processing book_id: {book_id}")
    print(f"   Total reviews: {len(book_df):,}")
    
    # Get reviews and sentiment labels
    X_book = book_df['review_text'].values
    y_book = book_df['sentiment'].astype(int).values
    
    # Show sentiment distribution
    neg_count = (y_book == 0).sum()
    mix_count = (y_book == 1).sum()
    pos_count = (y_book == 2).sum()
    print(f"   Sentiment: Negative={neg_count:,} | Mixed={mix_count:,} | Positive={pos_count:,}")
    
    # Split data (80/20 with stratification)
    try:
        X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(
            X_book, y_book, test_size=0.2, random_state=42, stratify=y_book
        )
        print(f"   Train: {len(X_train_b):,} | Test: {len(X_test_b):,}")
    except Exception as e:
        print(f"   ⚠️ Skipping (split error): {e}")
        continue
    
    # Generate embeddings
    try:
        emb_train_b = model_sent.encode(X_train_b, show_progress_bar=False, batch_size=32)
        emb_test_b = model_sent.encode(X_test_b, show_progress_bar=False, batch_size=32)
    except Exception as e:
        print(f"   ⚠️ Skipping (embedding error): {e}")
        continue
    
    # Train classifier
    try:
        clf_b = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
        clf_b.fit(emb_train_b, y_train_b)
        y_pred_b = clf_b.predict(emb_test_b)
        accuracy_b = accuracy_score(y_test_b, y_pred_b)
        
        print(f"   ✓ Accuracy: {accuracy_b:.4f} ({accuracy_b*100:.2f}%)")
        
        # Store results
        book_results_3class[book_id] = {
            'n_reviews': len(X_book),
            'n_train': len(X_train_b),
            'n_test': len(X_test_b),
            'accuracy': accuracy_b,
            'classifier': clf_b,
            'y_test': y_test_b,
            'y_pred': y_pred_b,
            'embeddings_train': emb_train_b,
            'embeddings_test': emb_test_b
        }
    except Exception as e:
        print(f"   ⚠️ Skipping (training error): {e}")
        continue

print("\n" + "=" * 120)
print("SUMMARY - 3-CLASS SENTIMENT BY INDIVIDUAL BOOK")
print("=" * 120)

if book_results_3class:
    summary_book_df = pd.DataFrame([
        {
            'Book ID': book_id,
            'Total Reviews': result['n_reviews'],
            'Train Set': result['n_train'],
            'Test Set': result['n_test'],
            'Accuracy': f"{result['accuracy']:.4f} ({result['accuracy']*100:.2f}%)"
        }
        for book_id, result in book_results_3class.items()
    ])
    print(summary_book_df.to_string(index=False))
    
    avg_accuracy_3c = np.mean([r['accuracy'] for r in book_results_3class.values()])
    print(f"\nAverage Accuracy: {avg_accuracy_3c:.4f} ({avg_accuracy_3c*100:.2f}%)")
else:
    print("No results available.")

print("=" * 120)

3-CLASS SENTIMENT CLASSIFICATION - BY INDIVIDUAL BOOK (80/20 Train/Test)

[Book 1/8] Processing book_id: 7905092
   Total reviews: 1,299
   Sentiment: Negative=252 | Mixed=285 | Positive=762
   Train: 1,039 | Test: 260
   ✓ Accuracy: 0.6115 (61.15%)

[Book 2/8] Processing book_id: 10429045
   Total reviews: 3,158
   Sentiment: Negative=464 | Mixed=585 | Positive=2,109
   Train: 2,526 | Test: 632
   ✓ Accuracy: 0.6725 (67.25%)

[Book 3/8] Processing book_id: 12609433
   Total reviews: 1,173
   Sentiment: Negative=60 | Mixed=213 | Positive=900
   Train: 938 | Test: 235
   ✓ Accuracy: 0.7617 (76.17%)

[Book 4/8] Processing book_id: 13227454
   Total reviews: 1,589
   Sentiment: Negative=100 | Mixed=266 | Positive=1,223
   Train: 1,271 | Test: 318
   ✓ Accuracy: 0.7736 (77.36%)

[Book 5/8] Processing book_id: 13455782
   Total reviews: 943
   Sentiment: Negative=126 | Mixed=200 | Positive=617
   Train: 754 | Test: 189
   ✓ Accuracy: 0.6667 (66.67%)

[Book 6/8] Processing book_id: 18774964
