In [6]:
import pandas as pd

# New cell at index 0

path = r"../data/books_1250_above_reviews.csv"
df = pd.read_csv(path)

# Try common column names first, then try to auto-detect a 1-5 numeric column.
common = {"rating", "review_rating", "stars", "review_stars", "score"}
col = next((c for c in df.columns if c.lower() in common), None)

if col is None:
    for c in df.columns:
        s = pd.to_numeric(df[c], errors="coerce").dropna()
        if not s.empty and s.isin([1, 2, 3, 4, 5]).all():
            col = c
            break

if col is None:
    raise ValueError(f"Couldn't find a 1-5 rating column. Available columns: {list(df.columns)}")

counts = pd.to_numeric(df[col], errors="coerce").dropna().astype(int).value_counts().reindex(range(1, 6), fill_value=0).sort_index()
print(f"Rating counts for column '{col}':\n{counts}")

Rating counts for column 'rating':
rating
1     493
2     908
3    2478
4    4663
5    6073
Name: count, dtype: int64


In [7]:
!pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence_transformers
  Using cached sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Using cached sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-5.1.2



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\kenpo\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## mixed books

In [9]:
# Sentiment Classification using Sentence-BERT (SBERT)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

print("=" * 120)
print("SENTIMENT CLASSIFICATION WITH SENTENCE-BERT")
print("=" * 120)

# Prepare data
print("\n1. Loading and preparing data...")
print(f"   Total reviews: {len(df):,}")
print(f"   Rating distribution:\n{counts}\n")

# Use review_text and rating
X = df['review_text'].values
y = pd.to_numeric(df['rating'], errors='coerce').dropna().astype(int).values

# Handle any mismatched lengths
if len(X) != len(y):
    valid_idx = pd.to_numeric(df['rating'], errors='coerce').notna()
    X = df[valid_idx]['review_text'].values
    y = pd.to_numeric(df[valid_idx]['rating'], errors='coerce').astype(int).values

print(f"   Reviews: {len(X):,}")
print(f"   Labels: {len(y):,}")

# Split into 80/20 train/test
print("\n2. Splitting data into 80/20 train/test...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"   Training set: {len(X_train):,} reviews")
print(f"   Test set: {len(X_test):,} reviews")
print(f"   Training label distribution:\n{pd.Series(y_train).value_counts().sort_index()}")
print(f"   Test label distribution:\n{pd.Series(y_test).value_counts().sort_index()}\n")

# Load SBERT model
print("3. Loading Sentence-BERT model...")
print("   Model: all-MiniLM-L6-v2 (lightweight, fast, good performance)")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("   ✓ Model loaded successfully\n")

# Generate embeddings for training set
print("4. Generating embeddings for training set...")
print(f"   Processing {len(X_train):,} reviews...")
embeddings_train = model.encode(X_train, show_progress_bar=True, batch_size=32)
print(f"   ✓ Embedding shape: {embeddings_train.shape}")
print(f"   ✓ Embedding dimension: {embeddings_train.shape[1]}\n")

# Generate embeddings for test set
print("5. Generating embeddings for test set...")
print(f"   Processing {len(X_test):,} reviews...")
embeddings_test = model.encode(X_test, show_progress_bar=True, batch_size=32)
print(f"   ✓ Test embeddings shape: {embeddings_test.shape}\n")

# Train classifier on embeddings
print("6. Training Random Forest classifier on SBERT embeddings...")
classifier = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
classifier.fit(embeddings_train, y_train)
print("   ✓ Model trained successfully\n")

# Make predictions
print("7. Making predictions on test set...")
y_pred = classifier.predict(embeddings_test)
print("   ✓ Predictions completed\n")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)

print("=" * 120)
print("RESULTS")
print("=" * 120)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, labels=[1, 2, 3, 4, 5], target_names=['1★', '2★', '3★', '4★', '5★']))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4, 5])

print("\nConfusion Matrix:")
print(cm)
print()

# Store results for visualization
results = {
    'model': classifier,
    'embeddings_train': embeddings_train,
    'embeddings_test': embeddings_test,
    'y_train': y_train,
    'y_test': y_test,
    'y_pred': y_pred,
    'accuracy': accuracy,
    'cm': cm
}

print("=" * 120)

SENTIMENT CLASSIFICATION WITH SENTENCE-BERT

1. Loading and preparing data...
   Total reviews: 14,974
   Rating distribution:
rating
1     493
2     908
3    2478
4    4663
5    6073
Name: count, dtype: int64

   Reviews: 14,974
   Labels: 14,974

2. Splitting data into 80/20 train/test...
   Training set: 11,979 reviews
   Test set: 2,995 reviews
   Training label distribution:
0     287
1     395
2     727
3    1982
4    3730
5    4858
Name: count, dtype: int64
   Test label distribution:
0      72
1      98
2     181
3     496
4     933
5    1215
Name: count, dtype: int64

3. Loading Sentence-BERT model...
   Model: all-MiniLM-L6-v2 (lightweight, fast, good performance)
   ✓ Model loaded successfully

4. Generating embeddings for training set...
   Processing 11,979 reviews...
   ✓ Model loaded successfully

4. Generating embeddings for training set...
   Processing 11,979 reviews...


Batches: 100%|██████████| 375/375 [01:43<00:00,  3.62it/s]



   ✓ Embedding shape: (11979, 384)
   ✓ Embedding dimension: 384

5. Generating embeddings for test set...
   Processing 2,995 reviews...


Batches: 100%|██████████| 94/94 [00:26<00:00,  3.61it/s]



   ✓ Test embeddings shape: (2995, 384)

6. Training Random Forest classifier on SBERT embeddings...
   ✓ Model trained successfully

7. Making predictions on test set...
   ✓ Predictions completed

RESULTS

Accuracy: 0.4898 (48.98%)

Classification Report:
              precision    recall  f1-score   support

          1★       0.00      0.00      0.00        98
          2★       0.50      0.01      0.01       181
          3★       0.45      0.17      0.25       496
          4★       0.38      0.40      0.39       933
          5★       0.55      0.83      0.67      1215

   micro avg       0.49      0.50      0.50      2923
   macro avg       0.38      0.28      0.26      2923
weighted avg       0.46      0.50      0.44      2923


Confusion Matrix:
[[   0    0   21   37   40]
 [   0    1   39   90   50]
 [   0    1   85  262  148]
 [   0    0   38  369  526]
 [   0    0    4  198 1012]]

   ✓ Model trained successfully

7. Making predictions on test set...
   ✓ Predictions compl

## separated books

In [7]:
# Sentiment Classification for Individual Books
print("=" * 120)
print("SENTIMENT CLASSIFICATION BY INDIVIDUAL BOOK (80/20 Train/Test Split)")
print("=" * 120)

# Group reviews by book_id
book_groups = df.groupby('book_id')

book_results = {}

for idx, (book_id, book_df) in enumerate(book_groups, 1):
    print(f"\n[Book {idx}/8] Processing book_id: {book_id}")
    print(f"   Total reviews: {len(book_df):,}")
    
    # Get reviews and ratings for this book
    X_book = book_df['review_text'].values
    y_book = pd.to_numeric(book_df['rating'], errors='coerce').dropna().astype(int).values
    
    # Handle mismatched lengths
    if len(X_book) != len(y_book):
        valid_idx = pd.to_numeric(book_df['rating'], errors='coerce').notna()
        X_book = book_df[valid_idx]['review_text'].values
        y_book = pd.to_numeric(book_df[valid_idx]['rating'], errors='coerce').astype(int).values
    
    print(f"   Valid reviews: {len(X_book):,}")
    print(f"   Rating distribution: {dict(pd.Series(y_book).value_counts().sort_index())}")
    
    # Split data (80/20 with stratification)
    try:
        X_train_book, X_test_book, y_train_book, y_test_book = train_test_split(
            X_book, y_book, test_size=0.2, random_state=42, stratify=y_book
        )
        print(f"   Train: {len(X_train_book):,} | Test: {len(X_test_book):,}")
    except Exception as e:
        print(f"   ⚠️  Skipping book (split error): {e}")
        continue
    
    # Generate embeddings
    try:
        embeddings_train_book = model.encode(X_train_book, show_progress_bar=False, batch_size=32)
        embeddings_test_book = model.encode(X_test_book, show_progress_bar=False, batch_size=32)
    except Exception as e:
        print(f"   ⚠️  Skipping book (embedding error): {e}")
        continue
    
    # Train classifier
    try:
        classifier_book = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42, n_jobs=-1)
        classifier_book.fit(embeddings_train_book, y_train_book)
        y_pred_book = classifier_book.predict(embeddings_test_book)
        accuracy_book = accuracy_score(y_test_book, y_pred_book)
        
        print(f"   ✓ Accuracy: {accuracy_book:.4f} ({accuracy_book*100:.2f}%)")
        
        # Store results
        book_results[book_id] = {
            'n_reviews': len(X_book),
            'n_train': len(X_train_book),
            'n_test': len(X_test_book),
            'accuracy': accuracy_book,
            'classifier': classifier_book,
            'y_test': y_test_book,
            'y_pred': y_pred_book,
            'embeddings_train': embeddings_train_book,
            'embeddings_test': embeddings_test_book
        }
    except Exception as e:
        print(f"   ⚠️  Skipping book (training error): {e}")
        continue

print("\n" + "=" * 120)
print("SUMMARY BY BOOK")
print("=" * 120)

if book_results:
    summary_df = pd.DataFrame([
        {
            'Book ID': book_id,
            'Total Reviews': result['n_reviews'],
            'Train Set': result['n_train'],
            'Test Set': result['n_test'],
            'Accuracy': f"{result['accuracy']:.4f} ({result['accuracy']*100:.2f}%)"
        }
        for book_id, result in book_results.items()
    ])
    print(summary_df.to_string(index=False))
    
    avg_accuracy = np.mean([r['accuracy'] for r in book_results.values()])
    print(f"\nAverage Accuracy: {avg_accuracy:.4f} ({avg_accuracy*100:.2f}%)")
else:
    print("No results available.")

print("=" * 120)

SENTIMENT CLASSIFICATION BY INDIVIDUAL BOOK (80/20 Train/Test Split)

[Book 1/8] Processing book_id: 7905092
   Total reviews: 1,299
   Valid reviews: 1,299
   Rating distribution: {0: np.int64(54), 1: np.int64(107), 2: np.int64(145), 3: np.int64(285), 4: np.int64(392), 5: np.int64(316)}
   Train: 1,039 | Test: 260
   ✓ Accuracy: 0.3500 (35.00%)

[Book 2/8] Processing book_id: 10429045
   Total reviews: 3,158
   Valid reviews: 3,158
   Rating distribution: {0: np.int64(82), 1: np.int64(208), 2: np.int64(256), 3: np.int64(585), 4: np.int64(970), 5: np.int64(1057)}
   Train: 2,526 | Test: 632
   ✓ Accuracy: 0.3500 (35.00%)

[Book 2/8] Processing book_id: 10429045
   Total reviews: 3,158
   Valid reviews: 3,158
   Rating distribution: {0: np.int64(82), 1: np.int64(208), 2: np.int64(256), 3: np.int64(585), 4: np.int64(970), 5: np.int64(1057)}
   Train: 2,526 | Test: 632
   ✓ Accuracy: 0.4146 (41.46%)

[Book 3/8] Processing book_id: 12609433
   Total reviews: 1,173
   Valid reviews: 1,173
 