# this task using Imdb data

In [1]:
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from nltk.corpus import stopwords
import nltk

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ramzy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df = pd.read_csv('IMDB Dataset.csv')

In [4]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
texts = df['review'].values
labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

In [6]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)


In [7]:
cleaned_texts = [preprocess_text(text) for text in texts]

In [8]:
max_features = 10000  
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(cleaned_texts)
sequences = tokenizer.texts_to_sequences(cleaned_texts)

In [9]:
maxlen = 100  
X = pad_sequences(sequences, maxlen=maxlen)
y = labels

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
model = Sequential()
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=maxlen))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))

In [18]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, maxlen))

In [20]:
model.summary()

In [21]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

Epoch 1/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 74ms/step - accuracy: 0.7980 - loss: 0.4209 - val_accuracy: 0.8817 - val_loss: 0.2907
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 67ms/step - accuracy: 0.9261 - loss: 0.1965 - val_accuracy: 0.8801 - val_loss: 0.3033
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 76ms/step - accuracy: 0.9572 - loss: 0.1244 - val_accuracy: 0.8755 - val_loss: 0.3572
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 78ms/step - accuracy: 0.9741 - loss: 0.0787 - val_accuracy: 0.8730 - val_loss: 0.4357
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 80ms/step - accuracy: 0.9839 - loss: 0.0537 - val_accuracy: 0.8694 - val_loss: 0.5286


<keras.src.callbacks.history.History at 0x27d987f0950>

In [22]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')
print(f'Test Accuracy: {accuracy}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.8639 - loss: 0.5350
Test Loss: 0.5220032930374146
Test Accuracy: 0.8691999912261963


In [23]:
model.save('imdb_model.h5')



In [29]:
word_index = tokenizer.word_index
def predict_sentiment(text, word_index, model):
    from tensorflow.keras.preprocessing.text import text_to_word_sequence

    words = text_to_word_sequence(text)  
    sequence = [[word_index.get(word, 2) for word in words]]  
    sequence = pad_sequences(sequence, maxlen=maxlen)  
    prediction = model.predict(sequence)[0][0]
    
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    return sentiment

In [36]:
print(predict_sentiment("This movie was bad!", word_index, model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
Negative


In [43]:
print(predict_sentiment("good movie ", word_index, model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
Positive
