# Import the Libraries

In [151]:
!pip install nltk
!pip install gensim



In [153]:
import re

import gensim
import nltk
import numpy as np
import pandas as pd
import tensorflow
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\visha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Load the Dataset

In [155]:
# Read the reviews CSV from the current working directory and preview the first rows.
df = pd.read_csv("IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Clean the Dataset

In [157]:
# Shared resources for clean_text: built once so every call reuses them.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw review into a space-separated string of tokens.

    Steps, in order:
      1. replace anything matching ``<.*?>`` (HTML tags) with a space,
      2. replace every non-ASCII-letter character with a space and lowercase,
      3. tokenise with ``nltk.word_tokenize``,
      4. drop tokens found in the notebook-level ``stop_words`` set,
      5. lemmatise the remaining tokens with the notebook-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving lemmatised tokens joined by single spaces.
    """
    # Substitute a space (not ''): otherwise "good<br />movie" collapses into
    # the single bogus token "goodmovie".
    text = re.sub(r'<.*?>', ' ', text)  # remove HTML
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    tokens = nltk.word_tokenize(text)
    # Stop-word filtering happens on the surface form, before lemmatisation.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Add the model input text: every review passed through clean_text.
cleaned_reviews = df['review'].apply(clean_text)
df['clean_review'] = cleaned_reviews

# Add the binary target: 1 for a positive review, 0 for a negative one.
sentiment_to_label = {'positive': 1, 'negative': 0}
df['label'] = df['sentiment'].map(sentiment_to_label)

In [158]:
# Preview the frame again to check the newly added clean_review and label columns.
df.head()

Unnamed: 0,review,sentiment,clean_review,label
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...,1
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,1
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,1
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...,1


# Word2Vec

In [163]:
from gensim.models import Word2Vec

# One token list per cleaned review; this Series is the Word2Vec training corpus.
sentences = df['clean_review'].str.lower().map(word_tokenize)

# 30-dimensional embeddings, context window of 5; min_count=1 keeps every
# token in the vocabulary, however rare.
model_emb = Word2Vec(
    sentences=sentences,
    vector_size=30,
    window=5,
    min_count=1,
    workers=4,
)

# Write the trained embedding model to disk so later cells can reload it.
model_emb.save('word2vec.model')

In [179]:
# Reload the embeddings from disk and print one vector as a sanity check.
load_model = Word2Vec.load('word2vec.model')

like_vector = load_model.wv['like']
print(like_vector)

[ 1.3657792   2.1520717  -0.03343861 -0.80673236  3.0488727   1.3465574
 -0.5801902   0.12344187 -0.8569307   1.4789679  -2.1644387   1.4545419
  1.2706122  -0.01150827 -0.15645576 -2.5496585   0.68027467  2.1695824
  0.2917156  -1.2400804  -0.42600793 -3.6664736  -1.3809342   0.70287395
 -0.603281    2.2535245   0.842128    1.4191126   1.823295   -0.646687  ]


In [181]:
# Keep a handle on the reloaded model's word-vector lookup; sent_to_vec reads it.
word_vec = load_model.wv

In [183]:
def sent_to_vec(sentences, vectors=None):
    """Map each token of one tokenised sentence to its embedding vector.

    Parameters
    ----------
    sentences : iterable of str
        The tokens of a single sentence (despite the plural name).
    vectors : optional
        Lookup object supporting ``token in vectors``, ``vectors[token]`` and
        a ``vector_size`` attribute. When omitted, the notebook-level
        ``word_vec`` is used, so existing one-argument calls keep working.

    Returns
    -------
    list
        One entry per token, in order. Tokens missing from the lookup are
        represented by a float32 zero vector of length ``vector_size``.
    """
    if vectors is None:
        # Resolved at call time, so the function can be defined before word_vec exists.
        vectors = word_vec

    return [
        vectors[word] if word in vectors
        # float32 zeros, consistent with the float32 arrays built by the padding step
        else np.zeros(vectors.vector_size, dtype=np.float32)
        for word in sentences
    ]

In [185]:
# Number of time steps fed to the network per review.
MAX_LEN = 50

# Write the vectors to a NEW name: overwriting `sentences` (the token lists)
# made this cell fail or silently misbehave when run a second time.
# pad_sequences truncates from the front (truncating='pre'), i.e. keeps only
# the last MAX_LEN steps, so only those tokens need to be vectorised.
sentence_vectors = [sent_to_vec(sent[-MAX_LEN:]) for sent in sentences]

# Shorter reviews are zero-padded at the end (padding='post').
X = pad_sequences(
    sentence_vectors,
    maxlen=MAX_LEN,
    padding='post',
    truncating='pre',
    dtype='float32',
)

In [186]:
# Target vector: the 0/1 label column as a NumPy array, row-aligned with X.
Y = np.array(df['label'])

# Split the Dataset

In [189]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the
# partition identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Model

In [195]:
# Binary sentiment classifier over sequences of pre-computed word vectors.
model = Sequential()

# Declare the input shape up front so the model is built immediately and
# model.summary() reports real output shapes and parameter counts.
# (50, 30) = (padded sequence length, Word2Vec vector_size) used elsewhere in
# this notebook; keep them in sync if either value changes.
model.add(tensorflow.keras.Input(shape=(50, 30)))

model.add(SpatialDropout1D(0.1))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))

# Single sigmoid unit: output is the predicted probability of label 1 (positive).
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [203]:
# Train on the training split; `history` keeps the per-epoch loss/accuracy.
# NOTE(review): batch_size=2 means 20000 optimisation steps per epoch (see the
# training log below) -- confirm this is intentional; a larger batch would
# train far faster.
history = model.fit(
    x_train,
    y_train,
    batch_size=2,
    epochs=5,
)

Epoch 1/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m499s[0m 25ms/step - accuracy: 0.7915 - loss: 0.4468
Epoch 2/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m515s[0m 26ms/step - accuracy: 0.8120 - loss: 0.4099
Epoch 3/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m547s[0m 27ms/step - accuracy: 0.8165 - loss: 0.4015
Epoch 4/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m423s[0m 21ms/step - accuracy: 0.8247 - loss: 0.3881
Epoch 5/5
[1m20000/20000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m497s[0m 25ms/step - accuracy: 0.8245 - loss: 0.3841


# Evaluate

In [210]:
# Evaluate on the held-out split. The model was compiled with
# metrics=['accuracy'], so evaluate() yields the loss first (`score`) and the
# accuracy second.
score, accuracy = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8436 - loss: 0.3413
Test Accuracy: 84.59%


# Prediction

In [254]:
new_sentence = "i dislike this movie"

# Apply exactly the same preprocessing as the training data:
# clean -> tokenise -> embed -> pad. Each stage gets its own name so the raw
# text stays available and the cell can be re-run safely.
new_sentence_tokens = word_tokenize(clean_text(new_sentence))
new_sentence_vector = sent_to_vec(new_sentence_tokens)

# If cleaning removed every token (e.g. the input was only stop words),
# pad_sequences would return a (1, 50) array with no feature axis and
# model.predict would fail. Fall back to a single zero vector instead.
if not new_sentence_vector:
    new_sentence_vector = [np.zeros(word_vec.vector_size, dtype='float32')]

# maxlen must match the sequence length the model was trained on.
new_sentence_padded = pad_sequences([new_sentence_vector], maxlen=50, padding='post', dtype='float32')

prediction = model.predict(new_sentence_padded)
# predict() returns an array of shape (1, 1); pull out the scalar explicitly
# rather than relying on the truth value of a one-element array.
positive_probability = float(prediction[0][0])
print("Positive" if positive_probability > 0.5 else "Negative")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Negative


# Save the Model

In [246]:
# Persist the trained LSTM classifier to a .keras file.
model.save('lstm_model.keras')
# NOTE(review): 'word2vec.model' was already written right after Word2Vec
# training; this second save rewrites the same file with the same model.
model_emb.save('word2vec.model')