In [1]:
# ! pip install tensorflow_datasets
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load the IMDB dataset
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

# Retrieve the train and test data
train_data, test_data = imdb['train'], imdb['test']

# Prepare the datasets
train_sentences = []
train_labels = []

test_sentences = []
test_labels = []

# TensorFlow datasets are stored in a tf.data.Dataset object, which is a generator. 
# Hence, we need to loop through it to retrieve data.
for sentence, label in tfds.as_numpy(train_data):
    train_sentences.append(sentence.decode('utf-8'))
    train_labels.append(label)

for sentence, label in tfds.as_numpy(test_data):
    test_sentences.append(sentence.decode('utf-8'))
    test_labels.append(label)

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# Split the training data to create a validation set
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_sentences, train_labels, test_size=0.2, random_state=42)

# Initialize a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer to the training data and transform the sentences
X_train = vectorizer.fit_transform(train_sentences)

# Transform the validation and test data with the same vectorizer
X_val = vectorizer.transform(val_sentences)
X_test = vectorizer.transform(test_sentences)

# Define the Logistic Regression model
model = LogisticRegression(random_state=42)

# Train the model
model.fit(X_train, train_labels)

# Predict on the validation set
val_pred = model.predict(X_val)

# Evaluate the model
print("Validation accuracy: ", accuracy_score(val_labels, val_pred))
print("\nValidation Classification Report:\n", classification_report(val_labels, val_pred))

# Predict on the test set
test_pred = model.predict(X_test)

# Evaluate the model
print("Test accuracy: ", accuracy_score(test_labels, test_pred))
print("\nTest Classification Report:\n", classification_report(test_labels, test_pred))


2023-11-15 11:41:50.951938: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Validation accuracy:  0.8882

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.87      0.89      2576
           1       0.87      0.91      0.89      2424

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000

Test accuracy:  0.87632

Test Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.87      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [3]:
train_labels

array([1, 1, 1, ..., 0, 1, 0])