In [None]:
What You're Aiming For

Objective:

Train a neural network to classify movie reviews from the IMDb dataset as positive or negative.

Dataset:

The dataset contains movie reviews with corresponding sentiments (positive or negative).
It's stored in a file named 'IMDB Dataset.csv'.

Instructions

Tools and Libraries Required:

Python
Pandas for data handling
Matplotlib and Seaborn for data visualization
NLTK for text preprocessing
Scikit-learn for machine learning utilities
TensorFlow/Keras for building and training neural network models
Steps:

Data Loading and Exploration:
 
Load the data using Pandas.
Explore the dataset to understand the distribution of sentiments, the length of reviews, and other characteristics.
 
Data Preprocessing:
 
Convert all reviews to lower case.
Remove HTML tags and URLs from reviews.
Tokenize the text and remove stop words.
Use TF-IDF Vectorization to convert text data into a format suitable for input into the neural network.
 
Model Building:
 
Construct a Sequential model with Dense layers:
First layer: Dense, ReLU activation (input dimension should match the number of features from TF-IDF).
Hidden layers: experiment with different sizes and activations.
Output layer: Dense, Sigmoid activation (binary classification).
Compile the model with binary crossentropy loss and accuracy metrics.
 
Model Training:
 
Train the model using the training set.
Use a validation split to monitor performance on unseen data during training.
Adjust parameters like the number of epochs and batch size as needed.
 
Evaluation:
 
Evaluate the model on a separate test set to assess its performance.
Use metrics such as accuracy and loss.
 
Visualization:
 
Plot training and validation loss over epochs.
Plot training and validation accuracy over epochs.
 
Report:
 
Provide insights gained from the project.
Discuss any challenges faced and how they were overcome.
Suggest potential improvements for the model or preprocessing steps.
 


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


    

2025-02-28 11:02:39.274870: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Download the NLTK resources used during preprocessing.
# Recent NLTK releases load the 'punkt_tab' tables for word_tokenize, while
# older releases use the pickled 'punkt' models, so both are fetched to keep
# the notebook runnable on either version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load dataset
df = pd.read_csv("Desktop/IMDB Dataset.csv")

# Explore data: first rows, column summary and class balance
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
print(df['sentiment'].value_counts())


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbookair/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/macbookair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:

# Convert labels to binary (positive -> 1, negative -> 0).
# Series.map turns every value that is missing from the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe the
# labels. The dtype guard makes the step safe to re-run, and the explicit check
# reports unknown labels instead of silently producing NaN targets.
label_to_int = {'positive': 1, 'negative': 0}
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    unexpected_labels = set(df['sentiment'].unique()) - set(label_to_int)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected sentiment labels: {sorted(unexpected_labels, key=str)}"
        )
    df['sentiment'] = df['sentiment'].map(label_to_int)

# Text preprocessing function
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+')
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return NLTK's stop-word list for `language` as a frozenset, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Normalise one review into a space-separated string of content words.

    Steps: lower-case the text, strip HTML tags and URLs, tokenize with NLTK,
    then keep only purely alphabetic tokens that are not English stop words.

    Args:
        text: The raw review as a string.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    text = text.lower()
    # Replace tags with a space rather than nothing: reviews often contain
    # "word<br />word", which would otherwise be glued into a single token.
    text = _HTML_TAG_RE.sub(' ', text)
    text = _URL_RE.sub('', text)  # Remove URLs
    # Fetch the stop words once per call from the cache instead of calling
    # stopwords.words('english') for every token; set membership is also
    # constant-time, whereas the original searched a list.
    stop_words = _get_stop_words()
    words = [
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(words)

# Clean every review in place with the preprocessing function
df['review'] = df['review'].apply(preprocess_text)

# Hold out 20% of the reviews as a test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Turn the text into TF-IDF features, capped at 5000 terms; the vectorizer is
# fitted on the training reviews only and then reused for the test reviews
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable between runs (requires TensorFlow >= 2.7).
keras.utils.set_random_seed(42)

# Build the neural network model.
# The input width is declared with an explicit Input object: passing
# `input_shape` to the first Dense layer is deprecated in Keras 3 and emits a
# UserWarning there.
n_features = X_train_tfidf.shape[1]
model = Sequential([
    keras.Input(shape=(n_features,)),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# The sparse TF-IDF matrix is cast to float32 before it is densified: Keras
# trains in float32 by default, so a float64 dense array only doubles the memory
# needed for the dense training matrix.
history = model.fit(
    X_train_tfidf.astype(np.float32).toarray(),
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,  # last 20% of the training rows are used for validation
)

# Evaluate the model on the held-out test set and report both metrics
loss, accuracy = model.evaluate(X_test_tfidf.astype(np.float32).toarray(), y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

# Visualise the learning curves: loss on the left, accuracy on the right.
# Each panel is described as (title, y-axis label, [(history key, legend label), ...]).
panels = [
    ('Loss Over Epochs', 'Loss',
     [('loss', 'Train Loss'), ('val_loss', 'Validation Loss')]),
    ('Accuracy Over Epochs', 'Accuracy',
     [('accuracy', 'Train Accuracy'), ('val_accuracy', 'Validation Accuracy')]),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (panel_title, y_label, curves) in zip(axes, panels):
    for history_key, curve_label in curves:
        ax.plot(history.history[history_key], label=curve_label)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.set_title(panel_title)
plt.show()