In [2]:
import pandas as pd

data = pd.read_csv("IMDB Dataset.csv")
print(data.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
import re
def preprocess_text_simple(text):
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()  # Split into words
    return ' '.join(tokens)

data['cleaned_review'] = data['review'].apply(preprocess_text_simple)

In [4]:
# Converting text data into numerical representations using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_review']).toarray()
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['sentiment'])

In [5]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.linear_model import LogisticRegression

# Train a Logistic Regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Predict probabilities
logistic_probs = logistic_model.predict_proba(X_test)[:, 1]

In [8]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Build a basic neural network
nn_model = Sequential()
nn_model.add(Dense(512, input_dim=X_train.shape[1], activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(256, activation='relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1, activation='sigmoid'))

# Compile the model
nn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Train the model
nn_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Predict probabilities
nn_probs = nn_model.predict(X_test).flatten()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 22ms/step - accuracy: 0.8060 - loss: 0.4037 - val_accuracy: 0.8906 - val_loss: 0.2574
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 21ms/step - accuracy: 0.9157 - loss: 0.2149 - val_accuracy: 0.8845 - val_loss: 0.2672
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 19ms/step - accuracy: 0.9409 - loss: 0.1604 - val_accuracy: 0.8887 - val_loss: 0.2822
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 20ms/step - accuracy: 0.9719 - loss: 0.0877 - val_accuracy: 0.8852 - val_loss: 0.3624
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [9]:
# Averaging ensemble
ensemble_probs = (logistic_probs + nn_probs) / 2
ensemble_predictions = (ensemble_probs > 0.5).astype(int)

# Evaluate the ensemble model
from sklearn.metrics import classification_report, accuracy_score

accuracy = accuracy_score(y_test, ensemble_predictions)
report = classification_report(y_test, ensemble_predictions, target_names=['negative', 'positive'])

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

Accuracy: 0.8881
Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.86      0.88      4961
    positive       0.87      0.92      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

