In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow_datasets as tfds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# --- STEP 1: LOAD AND PREPROCESS ---

print("Loading IMDB dataset...")
# Load the 'imdb_reviews' dataset through tensorflow_datasets. No subset is
# taken here; restrict the data at this point if a faster run is needed.
# with_info=True makes tfds.load return a (dataset, info) pair, and
# as_supervised=True yields each example as a (text, label) tuple, which is
# the form the loop in convert_to_numpy unpacks.
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
# Pull the two splits used below out of the returned mapping.
train_data, test_data = dataset['train'], dataset['test']

# Turn an iterable of (text, label) tensor pairs into two NumPy arrays
def convert_to_numpy(data):
    """Materialize a supervised text dataset as NumPy arrays.

    Args:
        data: Iterable yielding ``(text, label)`` pairs. Both items must
            expose ``.numpy()``; the text value it returns must be bytes
            that decode as UTF-8.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays holding the decoded
        strings and the label values, in iteration order.
    """
    # Consume the input exactly once so one-shot iterables work as well.
    decoded_pairs = [
        (review.numpy().decode('utf-8'), sentiment.numpy())
        for review, sentiment in data
    ]
    texts = np.array([review_text for review_text, _ in decoded_pairs])
    labels = np.array([label_value for _, label_value in decoded_pairs])
    return texts, labels

# Convert both splits to NumPy arrays for scikit-learn: X_*_raw hold the
# decoded review strings and y_* the matching labels.
X_train_raw, y_train = convert_to_numpy(train_data)
X_test_raw, y_test = convert_to_numpy(test_data)

# Report the split sizes as a quick check that loading and conversion ran.
print(f"Training samples: {len(X_train_raw)}")
print(f"Test samples: {len(X_test_raw)}")

# --- TEMPORARY STEP 2 (prerequisite for the PCA step) ---
# max_features is capped at 2000 to prevent a RAM crash when the features
# are standardized for PCA. stop_words='english' applies the vectorizer's
# built-in English stop-word list.
print("Generating TF-IDF features (Prerequisite for PCA)...")
tfidf = TfidfVectorizer(max_features=2000, stop_words='english')
# Fit on the training reviews only, then reuse the fitted vectorizer on the
# test reviews so both splits share one feature space and nothing is
# learned from the test split.
X_train_tfidf = tfidf.fit_transform(X_train_raw)
X_test_tfidf = tfidf.transform(X_test_raw)