In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

doc1 = "At the crack of dawn, the sun emerges from the east, painting the surroundings with its radiant beams. The harmonious chorus of chirping birds resonates in the atmosphere, while a serene gust of wind gracefully moves through the foliage, creating a mesmerizing dance of leaves."
doc2 = "With the first light of day, the sun ascends into the sky from the eastern horizon, suffusing the landscape with its gentle warmth. The air fills with the delightful melody of birdsong, accompanied by the soft rustle of leaves swaying in the breeze, creating a serene morning tableau."


def calculate_cosine_similarity(doc1, doc2):
    vectorizer = CountVectorizer().fit_transform([doc1, doc2])
    vectors = vectorizer.toarray()
    return cosine_similarity(vectors)[0, 1]


def calculate_jaccard_similarity(doc1, doc2):
    words_doc1 = set(doc1.split())
    words_doc2 = set(doc2.split())
    intersection = len(words_doc1.intersection(words_doc2))
    union = len(words_doc1.union(words_doc2))
    return intersection / union


cosine_sim = calculate_cosine_similarity(doc1, doc2)
print("Cosine Similarity:", cosine_sim)

# Calculate Jaccard Similarity
jaccard_sim = calculate_jaccard_similarity(doc1, doc2)
print("Jaccard Similarity:", jaccard_sim)


Cosine Similarity: 0.7540452538836004
Jaccard Similarity: 0.1746031746031746


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [26]:
df=pd.read_csv("/content/IMDB Dataset.csv")

In [27]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [30]:
df.shape

(50000, 2)

In [31]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [32]:
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Apply preprocessing to 'review' column
df['review'] = df['review'].apply(preprocess_text)

# Split into features and target
X = df['review'].values
y = df['sentiment'].values

# Split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [19]:
# Tokenization and padding
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Check the shape of the data
print("X_train shape:", X_train_pad.shape)
print("X_val shape:", X_val_pad.shape)
print("X_test shape:", X_test_pad.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)
print("y_test shape:", y_test.shape)

X_train shape: (35000, 100)
X_val shape: (7500, 100)
X_test shape: (7500, 100)
y_train shape: (35000,)
y_val shape: (7500,)
y_test shape: (7500,)


Sentiment Analysis using Bayesian Classification:

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

# Train the Multinomial Naive Bayes classifier
bayes_classifier = MultinomialNB()
bayes_classifier.fit(X_train_tfidf, y_train)

# Predictions on validation set
y_pred_val = bayes_classifier.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy (Bayesian):", val_accuracy)

# Predictions on test set
y_pred_test = bayes_classifier.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Test Accuracy (Bayesian):", test_accuracy)


Validation Accuracy (Bayesian): 0.8494666666666667
Test Accuracy (Bayesian): 0.8569333333333333


Sentiment Analysis using RNN:

In [21]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform labels
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Convert target variable
y_train = y_train.astype(np.float32)
y_val = y_val.astype(np.float32)
y_test = y_test.astype(np.float32)


In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Build the RNN model
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_rnn.add(SimpleRNN(units=128, dropout=0.2))  # Adding dropout for regularization
model_rnn.add(Dense(units=1, activation='sigmoid'))

model_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the RNN model
history_rnn = model_rnn.fit(X_train_pad, y_train, validation_data=(X_val_pad, y_val), epochs=5, batch_size=128)

# Evaluate the RNN model
val_loss_rnn, val_accuracy_rnn = model_rnn.evaluate(X_val_pad, y_val)
print("Validation Accuracy (RNN):", val_accuracy_rnn)

test_loss_rnn, test_accuracy_rnn = model_rnn.evaluate(X_test_pad, y_test)
print("Test Accuracy (RNN):", test_accuracy_rnn)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy (RNN): 0.76173335313797
Test Accuracy (RNN): 0.7576000094413757


Sentiment Analysis using LSTM:

In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Build the LSTM model
model_lstm = Sequential()
model_lstm.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_lstm.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))  # Adding dropout for regularization
model_lstm.add(Dense(units=1, activation='sigmoid'))

model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the LSTM model
history_lstm = model_lstm.fit(X_train_pad, y_train, validation_data=(X_val_pad, y_val), epochs=5, batch_size=128)

# Evaluate the LSTM model
val_loss_lstm, val_accuracy_lstm = model_lstm.evaluate(X_val_pad, y_val)
print("Validation Accuracy (LSTM):", val_accuracy_lstm)

test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test_pad, y_test)
print("Test Accuracy (LSTM):", test_accuracy_lstm)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy (LSTM): 0.8374666571617126
Test Accuracy (LSTM): 0.8421333432197571
