# Import the Libraries

In [322]:
!pip install nltk
!pip install gensim



In [323]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import tensorflow
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Load the Dataset

In [336]:
# Read the raw IMDB reviews CSV (expected next to the notebook) into `df`.
df = pd.read_csv(filepath_or_buffer="IMDB Dataset.csv")

In [338]:
# Hand-written (review, sentiment) pairs appended to the loaded reviews in `df`.
# NOTE(review): two rows below are labelled "mixed" and "neutral" while every
# other row is "positive"/"negative" - confirm how these should be encoded
# before they reach a binary classifier.

# Plain sentiment examples.
synthetic_data_1 = pd.DataFrame(
    [
        ("This movie is fantastic! A must-watch.", "positive"),
        ("I hated the storyline. It was boring.", "negative"),
        ("Excellent direction and great performances.", "positive"),
        ("Terrible plot and worse acting.", "negative"),
        ("One of the best movies I've ever seen!", "positive"),
        ("Absolutely dreadful. I walked out halfway.", "negative"),
        ("A delightful surprise, I really enjoyed it.", "positive"),
        ("I wouldn't recommend this movie to anyone.", "negative"),
        ("Loved the cinematography, disliked the pacing.", "mixed"),
        ("Not bad at all. Quite entertaining.", "positive"),
    ],
    columns=["review", "sentiment"],
)

# Examples whose polarity hinges on a negation word.
synthetic_data_2 = pd.DataFrame(
    [
        ("I don't like this movie at all. It was a total disappointment.", "negative"),
        ("I didn't enjoy the film. The acting wasn't convincing.", "negative"),
        ("This wasn't what I expected. It turned out much better!", "positive"),
        ("I can't say enough good things about this movie. Truly amazing!", "positive"),
        ("I never thought I would enjoy a romance movie, but this one was great.", "positive"),
        ("I do not recommend this film. It was a waste of time.", "negative"),
        ("Not a bad movie. I actually enjoyed parts of it.", "positive"),
        ("It is not a good film. I got bored halfway through.", "negative"),
        ("I wouldn’t say it was great, but it wasn’t bad either.", "neutral"),
        ("I didn't think it would be this good. Pleasantly surprised!", "positive"),
    ],
    columns=["review", "sentiment"],
)

# Stack the loaded reviews and both synthetic frames under a fresh 0..n-1 index.
combined_df = pd.concat(
    objs=[df, synthetic_data_1, synthetic_data_2],
    ignore_index=True,
)

# Persist the combined data without the index column.
# NOTE(review): "(uodate)" in the file name looks like a typo for "(update)";
# left unchanged because other tooling may read this exact path.
combined_df.to_csv("(uodate)reviews.csv", index=False)

print(f"Combined dataset shape: {combined_df.shape}")

Combined dataset shape: (50020, 2)


# Clean the Dataset

In [466]:
# Negation cues that handle_negation() collapses into the single marker "NOT".
_NEGATION_WORDS = (
    "don't", "isn't", "aren't", "didn't", "can't", "won't", "never",
    "no", "nothing", "none", "nobody", "neither", "nowhere",
    "without", "hardly", "scarcely", "barely", "not", "doesn't", "wasn't",
    "weren't", "shouldn't", "wouldn't", "couldn't", "hasn't", "haven't",
)

# One whole-word alternation over all cues, compiled once.
# - Each apostrophe also accepts the typographic form (U+2019), so text such as
#   "wouldn’t" is recognised as well as "wouldn't".
# - The trailing \b makes the engine backtrack from "no" to "nothing"/"nobody",
#   so the order of the alternatives does not matter.
_NEGATION_RE = re.compile(
    r"\b(?:"
    + "|".join(
        re.escape(word).replace("\\'", "'").replace("'", "['\u2019]")
        for word in _NEGATION_WORDS
    )
    + r")\b",
    re.IGNORECASE,
)


def handle_negation(text):
    """Replace every negation cue in ``text`` with the marker token ``NOT``.

    Only whole words are replaced ("no" inside "known" or "note" is left
    alone). Matching ignores case and accepts both straight and typographic
    apostrophes in contractions. All other characters are returned unchanged.

    Args:
        text: Input string.

    Returns:
        The string with each negation cue replaced by ``'NOT'``.
    """
    return _NEGATION_RE.sub('NOT', text)

# Shared NLP resources used by clean_text(). They need the NLTK 'stopwords'
# and 'wordnet' corpora to be downloaded. A set gives constant-time membership
# tests while filtering tokens.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order:
      1. lower-case the text;
      2. replace negation cues with the marker ``NOT`` via handle_negation();
         this happens before stop-word filtering, presumably so the negation
         survives it (the upper-case marker is not in the stop-word list as
         long as that list is lower-case);
      3. replace HTML tags with a space so the neighbouring words stay apart;
      4. delete apostrophes, so a contraction remains a single token;
      5. replace every other non-alphanumeric character with a space, so that
         "movie.It" becomes two words instead of "movieit";
      6. tokenise, drop stop words, and lemmatise what is left.

    Args:
        text: Raw review text.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = text.lower()
    text = handle_negation(text)  # must run before stop words are dropped
    text = re.sub(r'<.*?>', ' ', text)  # HTML tags -> space
    text = re.sub("['\u2019]", "", text)  # drop straight and typographic apostrophes
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)  # remaining punctuation -> space
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [468]:
# Later cells read df['clean_review'] and df['label'], so build them here.
# Each column is derived only when missing, so re-running the cell (or loading
# a CSV that already carries the columns) does not redo the work.
if 'clean_review' not in df.columns:
    df['clean_review'] = df['review'].apply(clean_text)
if 'label' not in df.columns:
    # Binary target: 1 = positive, 0 = negative; any other sentiment maps to NaN.
    df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Stop early if any row has no binary label.
assert df['label'].notna().all(), "every row needs a 'positive' or 'negative' sentiment"

df.head()

Unnamed: 0,review,sentiment,clean_review,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1


# Word2Vec

In [471]:
from gensim.models import Word2Vec

# One token list per review: lower-case the cleaned text, then tokenise it.
sentences = df['clean_review'].map(lambda review: word_tokenize(review.lower()))

# Train 30-dimensional word vectors on the tokenised reviews. min_count=1
# keeps every word that occurs at least once.
model_emb = Word2Vec(
    sentences=sentences,
    vector_size=30,
    window=5,
    min_count=1,
    workers=4,
)

# Write the trained embedding model to disk.
model_emb.save('word2vec.model')

In [472]:
# Read the saved embedding model back from disk.
load_model = Word2Vec.load('word2vec.model')

# Sanity check: print the learned vector for the word "like".
print(load_model.wv.get_vector('like'))

[ 1.5483004  -2.4560332   3.1030588   1.9431099   0.52062637  0.07774864
 -0.1281014   1.6092889   1.6676165  -2.5409896  -0.64096344  0.7478205
 -0.97975343 -1.0739657   2.2897947  -0.00571437  0.3133881  -0.76322925
 -0.5285003   2.8328419  -0.4208395  -3.0923672   3.6359365   1.7251447
  0.9023777  -2.5018787  -3.0402715   2.1696787   2.092648   -1.4597156 ]


In [473]:
# Word-vector table (`wv`) of the reloaded embedding model; sent_to_vec() reads
# this module-level name for both lookups and the vector size.
word_vec = load_model.wv

In [474]:
def sent_to_vec(sentences):
    """Turn the tokens of one sentence into a list of embedding vectors.

    Args:
        sentences: Iterable of word tokens (a single tokenised sentence,
            despite the plural name).

    Returns:
        A list with one vector per token, in order: the ``word_vec`` entry when
        the token is known, otherwise a zero vector of ``word_vec.vector_size``
        elements.
    """
    vectors = []
    for token in sentences:
        if token in word_vec:
            vectors.append(word_vec[token])
        else:
            # Unknown token: use an all-zero placeholder.
            vectors.append(np.zeros(word_vec.vector_size))
    return vectors

In [487]:
# Embed every tokenised review. The result gets its own name instead of
# overwriting `sentences`, so this cell can be re-run without first re-running
# the tokenisation cell.
sentence_vectors = [sent_to_vec(sent) for sent in sentences]

# Bring every review to exactly 50 time steps: shorter ones are padded at the
# end ('post'); longer ones are truncated by pad_sequences' default setting.
X = pad_sequences(sentence_vectors, maxlen=50, padding='post', dtype='float32')

In [476]:
# Target vector: the `label` column copied into a standalone NumPy array.
Y = df['label'].to_numpy(copy=True)

# Split the Dataset

In [491]:
# Hold out 20% of the samples for testing. Stratifying on Y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# Model

In [493]:
model = Sequential()

# Declare the per-sample input shape (time steps, embedding size) up front,
# taken from the training data, so the model is built immediately and
# model.summary() can report real output shapes and parameter counts.
model.add(tensorflow.keras.Input(shape=x_train.shape[1:]))

# Regularisation on the embedded sequence, then a single LSTM layer.
model.add(SpatialDropout1D(0.1))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))

# One sigmoid unit for the binary target.
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [495]:
# Train for 20 epochs in mini-batches of 20 samples; `history` keeps the
# object returned by fit().
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=20,
    epochs=20,
)

Epoch 1/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 15ms/step - accuracy: 0.7265 - loss: 0.5348
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 18ms/step - accuracy: 0.8006 - loss: 0.4270
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 17ms/step - accuracy: 0.8137 - loss: 0.4023
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 17ms/step - accuracy: 0.8189 - loss: 0.3942
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 18ms/step - accuracy: 0.8253 - loss: 0.3813
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 18ms/step - accuracy: 0.8259 - loss: 0.3771
Epoch 7/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 16ms/step - accuracy: 0.8343 - loss: 0.3671
Epoch 8/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 17ms/step - accuracy: 0.8346 - loss: 0.3622
Epoch 9/

# Evaluate

In [497]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
score, accuracy = model.evaluate(x=x_test, y=y_test)

# Report the held-out accuracy as a percentage with two decimals.
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.8577 - loss: 0.3238
Test Accuracy: 85.74%


# Prediction

In [515]:
new_sentence = "I do not love this movie.It is waste of time"

# Preprocess exactly as for training: clean, then lower-case before
# tokenising. Without the lower-casing, the upper-case "NOT" marker produced by
# clean_text() is looked up as-is, is not found among the lower-cased training
# tokens, and silently becomes a zero vector.
new_sentence_clean = clean_text(new_sentence)
new_sentence_tokens = word_tokenize(new_sentence_clean.lower())
new_sentence_vector = sent_to_vec(new_sentence_tokens)
new_sentence_padded = pad_sequences([new_sentence_vector], maxlen=50, padding='post', dtype='float32')

prediction = model.predict(new_sentence_padded)

# predict() returns one row per input, each holding the single sigmoid output;
# pull out the scalar rather than relying on the truthiness of a (1, 1) array.
positive_probability = float(prediction[0][0])
print("Positive" if positive_probability >= 0.5 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Negative


# Save the Model

In [517]:
# Persist both artefacts needed at inference time: the word embeddings
# (gensim format) and the trained classifier (native Keras format).
model_emb.save('word2vec.model')
model.save('lstm_model.keras')