In [None]:
# Data handling and plotting libraries.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from matplotlib import style
# Apply the ggplot style to the matplotlib figures drawn after this point.
plt.style.use('ggplot')
# Text-processing tools: regular expressions plus the NLTK tokenizer, stemmer and stopword corpus.
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
# Download the stopword corpus that stopwords.words('english') reads below.
nltk.download('stopwords')
# NOTE(review): this repeats the stopwords import a few lines above.
from nltk.corpus import stopwords
# English stopwords stored as a set; data_processing() tests tokens against it.
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
# Feature extraction and train/test splitting utilities.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# Load the IMDB dataset into a pandas DataFrame.
# The bare filename means the CSV is looked up relative to the current working directory.
df = pd.read_csv('IMDB Dataset.csv')
# Display the first few rows of the DataFrame as a quick sanity check of the load
df.head()

In [None]:
# Display the shape of the DataFrame as a (rows, columns) tuple
df.shape


In [None]:
# Display a summary of the DataFrame: column names, dtypes and non-null counts
# (a non-null count below the row count would indicate missing values).
df.info()

In [None]:
# Bar chart with one bar per sentiment label, showing how many reviews carry each label
sns.countplot(data=df, x='sentiment')
plt.title("Positive - Negative Ratio Graph")
plt.show()


In [None]:
# Show the first five reviews together with their sentiment labels
for i in range(5):
    print("Review: ", [i])
    review_text = df['review'].iloc[i]
    print(review_text, "\n")
    sentiment_label = df['sentiment'].iloc[i]
    print("Sentiment: ", sentiment_label, "\n\n")

In [None]:
# Helper used to measure review length in words
def no_of_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

In [None]:
# Store each review's word count in a new 'word count' column
df['word count'] = df['review'].map(no_of_words)

In [None]:
# Display the first few rows again to confirm the new 'word count' column was added
df.head()

In [None]:
# Side-by-side histograms of review length: positive reviews on the left, negative on the right
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
panels = (
    (ax[0], 'positive', 'Positive', 'blue'),
    (ax[1], 'negative', 'Negative', 'red'),
)
for axis, sentiment_value, legend_label, bar_color in panels:
    # Select the word counts of the reviews carrying this sentiment label
    word_counts = df[df['sentiment'] == sentiment_value]['word count']
    axis.hist(word_counts, label=legend_label, color=bar_color, rwidth=0.9)
    axis.legend(loc='upper right')
fig.suptitle("Number of words in review")
plt.show()

In [None]:
# Replace sentiment labels with numerical values (1 for positive, 0 for negative).
# The mapping is applied element-wise and the result is cast explicitly to int64 instead of
# relying on Series.replace silently downcasting an object column, which pandas 2.2 deprecates.
# Values that are already 0/1 pass through unchanged, so re-running this cell is harmless;
# any other label makes the cast raise rather than slipping through as a string.
sentiment_codes = {"positive": 1, "negative": 0}
df['sentiment'] = (
    df['sentiment']
    .map(lambda label: sentiment_codes.get(label, label))
    .astype('int64')
)


In [None]:
# Display the first few rows to confirm the 'sentiment' column now holds the numeric codes
df.head()

In [None]:
# Define a function to preprocess text data
def data_processing(text):
    """Normalise one raw review into a space-joined string of tokens.

    Steps, in order: lowercase the text; replace HTML ``<br />`` tags with a
    space; strip URLs; strip ``@mentions`` and ``#`` signs; strip every
    remaining character that is neither a word character nor whitespace;
    tokenize with NLTK ``word_tokenize``; drop tokens present in the
    module-level ``stop_words`` set.

    Args:
        text: Raw review text (``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    # Substitute a space rather than '' so the words on either side of a line
    # break are not fused into a single token.
    text = re.sub('<br />', ' ', text)
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # '@\w+' removes whole mentions; the pattern '\@w+' would only match '@'
    # followed by literal 'w' characters.
    text = re.sub(r'@\w+|#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if w not in stop_words]
    return " ".join(filtered_text)

In [None]:
# Download the 'punkt' and 'punkt_tab' NLTK tokenizer resources before tokenizing
import nltk
for tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_resource)

# Clean every review by running it through data_processing
df['review'] = df['review'].map(data_processing)


In [None]:
# Count rows that repeat an earlier row in every column, then report the total
duplicated_count = df.duplicated(subset=None, keep='first').sum()
print("Number of duplicate entries: ", duplicated_count)

In [None]:
# Keep only the first occurrence of each distinct review text
df = df.drop_duplicates(subset='review', keep='first')

In [None]:
# Porter stemmer instance shared by every call to stemming()
stemmer = PorterStemmer()
def stemming(data):
    """Return ``data`` with each whitespace-separated word replaced by its Porter stem."""
    return " ".join(stemmer.stem(token) for token in data.split())


In [None]:
# Stem every review; the function is passed directly, no wrapper lambda is needed
df['review'] = df['review'].apply(stemming)


In [None]:
# Recompute 'word count' on the cleaned, stemmed reviews and preview the result
df['word count'] = df['review'].map(no_of_words)
df.head()

In [None]:
 # Select the rows labelled positive (sentiment == 1) and preview them
pos_reviews = df.loc[df['sentiment'] == 1]
pos_reviews.head()

In [None]:
# Word cloud (at most 100 words) built from all positive reviews joined into one string
text = ' '.join(pos_reviews['review'])
plt.figure(figsize=(20, 15), facecolor='None')
wordcloud = WordCloud(
    max_words=100,
    width=1600,
    height=800,
    collocations=False,
).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in positive reviews', fontsize=19)
plt.show()


In [None]:
# Tally word frequencies over all positive reviews and show the 15 most frequent
from collections import Counter
count = Counter()
for text in pos_reviews['review'].values:
    # Counter.update adds one per occurrence of each word in the iterable
    count.update(text.split())
count.most_common(15)

In [None]:
# Tabulate the 15 most common positive-review words as a two-column DataFrame
pos_words = pd.DataFrame(count.most_common(15)).set_axis(['word', 'count'], axis=1)
pos_words.head()

In [None]:
# Bar chart of the most common positive-review words, one colour per word
px.bar(
    data_frame=pos_words,
    x='count',
    y='word',
    color='word',
    title='Common words in positive reviews',
)

In [None]:
# Select the rows labelled negative (sentiment == 0) and preview them
neg_reviews = df.loc[df['sentiment'] == 0]
neg_reviews.head()

In [None]:
# Word cloud (at most 100 words) built from all negative reviews joined into one string
text = ' '.join(neg_reviews['review'])
plt.figure(figsize=(20, 15), facecolor='None')
wordcloud = WordCloud(
    max_words=100,
    width=1600,
    height=800,
    collocations=False,
).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title('Most frequent words in negative reviews', fontsize=19)
plt.show()


In [None]:
# Tally word frequencies over all negative reviews and show the 15 most frequent
count = Counter()
for text in neg_reviews['review'].values:
    # Counter.update adds one per occurrence of each word in the iterable
    count.update(text.split())
count.most_common(15)

In [None]:
# Tabulate the 15 most common negative-review words as a two-column DataFrame
neg_words = pd.DataFrame(count.most_common(15)).set_axis(['word', 'count'], axis=1)
neg_words.head()

In [None]:
# Bar chart of the most common negative-review words, one colour per word
px.bar(
    data_frame=neg_words,
    x='count',
    y='word',
    color='word',
    title='Common words in negative reviews',
)

In [None]:
# Raw review text (features) and 0/1 sentiment labels (target).
# NOTE: X stays the raw text Series here; the TF-IDF matrices are x_train / x_test below.
X = df['review']
Y = df['sentiment']

# Splitting data into training and testing sets (70% training, 30% testing).
# The split is done BEFORE vectorising so that test reviews never influence
# the vocabulary or the IDF weights learned by the vectorizer (no data leakage).
reviews_train, reviews_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Transforming review text into TF-IDF features: the vectorizer is fitted on the
# training reviews only, and the test reviews are projected with that same fitted vectorizer.
vect = TfidfVectorizer()
x_train = vect.fit_transform(reviews_train)
x_test = vect.transform(reviews_test)

In [None]:
# Report the shape of every split produced above
for split_name, split_data in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"Size of {split_name}: ", split_data.shape)

In [None]:
# Keep only the first 2000 training rows and the first 500 test rows so the model
# trains quickly on modest hardware; raise these limits to use more data.
x_train, y_train = x_train[:2000], y_train[:2000]
x_test, y_test = x_test[:500], y_test[:500]

In [None]:
# Report the shape of every split again, now that they have been truncated
for split_name, split_data in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"Size of {split_name}: ", split_data.shape)

In [None]:
# Turn both sparse feature matrices into dense arrays via .toarray()
x_train, x_test = x_train.toarray(), x_test.toarray()

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.utils import set_random_seed

# Seed Python, NumPy and the Keras backend so weight initialisation and batch
# shuffling repeat from run to run.
set_random_seed(42)

# Setting input dimension for the neural network model (one input per feature column)
input_dim = x_train.shape[1]

# Defining a simple neural network model with one hidden layer for sentiment analysis
model = Sequential([
    Input(shape=(input_dim,)),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

# Alternative architecture: more layers, a dropout layer and a different activation function.
# Swap it in for the definition above, train both, and keep whichever performs better.
# It is kept as real comments because a bare triple-quoted string at the end of a cell
# is echoed by Jupyter as the cell's output.
#
# model = Sequential([
#     Input(shape=(input_dim,)),
#     Dense(units=128, activation='relu'),
#     Dropout(0.2),
#     Dense(units=64, activation='relu'),
#     Dense(units=32, activation='relu'),
#     Dense(units=16, activation='swish'),
#     Dense(units=1, activation='sigmoid')
# ])


In [None]:
# Specifying which optimizer to use, together with the loss and the metric reported during training
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Alternative: compile the same model with a different optimizer, train both variants,
# and keep whichever gives the better accuracy.
# It is kept as real comments because a bare triple-quoted string at the end of a cell
# is echoed by Jupyter as the cell's output.
#
# from tensorflow.keras.optimizers import Adam
#
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the network: 15 passes over the training data in mini-batches of 10 samples.
# The returned History object is kept in `history`.
history = model.fit(x_train, y_train, batch_size=10, epochs=15)

# Alternative: try other batch sizes and epoch counts and keep whichever yields the
# better accuracy.
# It is kept as a real comment because a bare triple-quoted string at the end of a cell
# is echoed by Jupyter as the cell's output.
#
# history = model.fit(x_train, y_train, batch_size=32, epochs=20)

In [None]:
# Print the model's layer-by-layer summary
model.summary()

In [None]:
# Measure loss and accuracy on the held-out test subset and print both
test_loss, test_acc = model.evaluate(x_test, y_test)
for metric_label, metric_value in (('Test loss:', test_loss), ('Test accuracy:', test_acc)):
    print(metric_label, metric_value)

In [None]:
# === FINAL RESULTS SUMMARY ===
import numpy as np

# Get predictions on test set and threshold the sigmoid outputs at 0.5 to obtain 0/1 labels
predictions = model.predict(x_test)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# True labels as a plain NumPy array (works for a pandas Series or an ndarray alike),
# compared against the predictions once and reused below.
true_labels = np.asarray(y_test)
correct_mask = predicted_labels == true_labels

print('=' * 60)
print('SENTIMENT ANALYSIS MODEL - FINAL RESULTS')
print('=' * 60)
print(f'\nTest Loss:     {test_loss:.4f}')
print(f'Test Accuracy: {test_acc:.4f} ({test_acc*100:.1f}%)')
print(f'\nTotal test samples: {len(true_labels)}')
print(f'Correct predictions: {correct_mask.sum()}')
print(f'Wrong predictions:   {(~correct_mask).sum()}')
print('\n' + '=' * 60)
print('SAMPLE PREDICTIONS')
print('=' * 60)
# Show up to 10 samples; the cap avoids an IndexError when the test set is smaller than 10
for i in range(min(10, len(true_labels))):
    actual = 'Positive' if true_labels[i] == 1 else 'Negative'
    predicted = 'Positive' if predicted_labels[i] == 1 else 'Negative'
    status = '✓' if correct_mask[i] else '✗'
    print(f'  [{status}] Sample {i+1}: Actual={actual:>8s}, Predicted={predicted:>8s}')
print('=' * 60)


In [None]:
# === FINAL RESULTS SUMMARY ===
import numpy as np

# Get predictions on test set
predictions = model.predict(x_test)
predicted_labels = (predictions > 0.5).astype(int).flatten()

print('=' * 60)
print('SENTIMENT ANALYSIS MODEL - FINAL RESULTS')
print('=' * 60)
print(f'\nTest Loss:     {test_loss:.4f}')
print(f'Test Accuracy: {test_acc:.4f} ({test_acc*100:.1f}%)')
print(f'\nTotal test samples: {len(y_test)}')
print(f'Correct predictions: {(predicted_labels == y_test.values).sum()}')
print(f'Wrong predictions:   {(predicted_labels != y_test.values).sum()}')
print('\n' + '=' * 60)
print('SAMPLE PREDICTIONS')
print('=' * 60)
for i in range(10):
    actual = 'Positive' if y_test.values[i] == 1 else 'Negative'
    predicted = 'Positive' if predicted_labels[i] == 1 else 'Negative'
    status = 'OK' if y_test.values[i] == predicted_labels[i] else 'WRONG'
    print(f'  [{status}] Sample {i+1}: Actual={actual:>8s}, Predicted={predicted:>8s}')
print('=' * 60)
