# **Import necessary libraries**

In [12]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# **Load and Explore the Dataset**

In [13]:
# Load the IMDB Movie Reviews dataset
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

# Split the dataset into train and test sets
train_data, test_data = imdb['train'], imdb['test']

# **Preprocess the Data**

In [14]:
# Initialize lists to store text and labels
train_sentences = []
test_sentences = []
train_labels = []
test_labels = []

# Extract sentences and labels from the dataset
for s, l in train_data:
    train_sentences.append(str(s.numpy()))
    train_labels.append(l.numpy())

for s, l in test_data:
    test_sentences.append(str(s.numpy()))
    test_labels.append(l.numpy())

# Convert lists to NumPy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# **Create a Bag of Words Representation**

In [15]:
# Initialize a CountVectorizer for Bag of Words representation
vectorizer = CountVectorizer(binary=True, max_features=5000)

# Fit and transform the training data
X_train = vectorizer.fit_transform(train_sentences)
X_test = vectorizer.transform(test_sentences)

# **Create and Train the Classification Model**

In [16]:
# Initialize the Random Forest Classifier model
model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
model.fit(X_train, train_labels)

# **Make Predictions and Evaluate the Model**

In [17]:
# Predict on the test data
y_pred = model.predict(X_test)


# Evaluate the model
accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy:", accuracy)

print("\nConfusion Matrix:")
print(confusion_matrix(test_labels, y_pred))

print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

Accuracy: 0.84028

Confusion Matrix:
[[10546  1954]
 [ 2039 10461]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.84      0.84     12500
           1       0.84      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000



# **Plot**

In [18]:
# Create a DataFrame with selected features (replace with actual values)
data = pd.DataFrame({
    'Review Length': [len(review) for review in test_sentences],
    'Number of Words': [len(review.split()) for review in test_sentences],
    'Sentiment Score': [1 if label == 1 else -1 for label in test_labels]
})

fig = px.scatter_3d(data, x='Review Length', y='Number of Words', z='Sentiment Score', color='Sentiment Score', opacity=0.7,
                    labels={'Review Length': 'Review Length', 'Number of Words': 'Number of Words', 'Sentiment Score': 'Sentiment Score'})

fig.update_layout(scene=dict(xaxis_title='Review Length', yaxis_title='Number of Words', zaxis_title='Sentiment Score'))

fig.show()