
Q1. Download a sentiment-labeled dataset (e.g., a Twitter sentiment dataset).

* Preprocess the text: Perform tokenization, remove stop words, and apply padding to ensure equal sequence length.
* Build an LSTM model with the following layers:
  * Embedding layer.
  * LSTM layer.
  * Dense layer for classification.
* Train the model to classify text into positive, negative, or neutral sentiment.
* Evaluate the model using metrics like accuracy, precision, recall, and F1-score, and interpret the results.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the IMDB review dataset from the Colab working directory.
DATA_PATH = '/content/IMDB Dataset.csv - IMDB Dataset (1).csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# List the working directory to confirm the dataset CSV is present
# (bare `ls` is run by IPython as a shell-style listing, not as Python).
ls

'IMDB Dataset.csv - IMDB Dataset (1).csv'   [0m[01;34msample_data[0m/


In [None]:
# Dataset dimensions as (rows, columns).
data.shape

(50000, 2)

In [None]:
# Preview the first rows to check the review text and sentiment columns.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Class balance of the raw string labels.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# Series.map is used instead of DataFrame.replace(..., inplace=True): in recent
# pandas, replace emits a FutureWarning about silent downcasting when it swaps
# strings for ints. The identity entries (1 -> 1, 0 -> 0) keep this cell
# idempotent when it is re-run on already-encoded labels; any unexpected label
# becomes NaN and makes astype(int) fail loudly instead of passing through.
LABEL_MAP = {'positive': 1, 'negative': 0, 1: 1, 0: 0}
data['sentiment'] = data['sentiment'].map(LABEL_MAP).astype(int)

  data.replace({'sentiment': {'positive': 1, 'negative': 0}}, inplace=True)


In [None]:
# Preview again to confirm the sentiment column now holds the integer labels.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Class balance after encoding; the counts per class should be unchanged.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


In [None]:
# Hold out 20% of the reviews for testing; the fixed seed makes the split reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the size of each split as (rows, columns): train first, then test.
for split in (train_data, test_data):
  print(split.shape)

(40000, 2)
(10000, 2)


Data Preprocessing

In [None]:
# Fit the tokenizer on the training reviews only, then encode BOTH splits with
# it and pad every review to a fixed length of 200 tokens.
# NOTE(review): the task statement also asks for stop-word removal, which this
# cell does not perform.

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])

train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])

X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [None]:
# Show the padded integer sequences for both splits: train first, then test.
for padded in (X_train, X_test):
  print(padded)

[[1935    1 1200 ...  205  351 3857]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]
[[   0    0    0 ...  995  719  155]
 [  12  162   58 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2304]
 [   0    0    0 ...    1  332   27]]


In [None]:
# Targets: the sentiment column of each split.
Y_train, Y_test = train_data['sentiment'], test_data['sentiment']

In [None]:
# Display the training labels (indexed by the rows' original positions in `data`).
Y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


LSTM - Long Short-Term Memory

In [None]:
# Build the model: embedding -> LSTM -> single sigmoid unit for binary sentiment.

model = Sequential([
    # 5000 matches Tokenizer(num_words=5000); each token id maps to a 128-d vector.
    Embedding(5000, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    # One sigmoid output: the probability that the review is positive (label 1).
    Dense(1, activation='sigmoid'),
])
# `input_length` is deprecated in Keras 3 (passing it only raises a warning), so
# the input shape is declared by building the model explicitly instead. 200 is
# the padded sequence length; building here also lets model.summary() report
# real output shapes and parameter counts before training.
model.build(input_shape=(None, 200))



In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Compile the model: binary cross-entropy loss for the single sigmoid output,
# the Adam optimizer, and accuracy tracked as the reported metric.

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Training the Model

In [None]:
# Train for 5 epochs, which is the run the recorded training log and the
# evaluation results in this notebook come from (the cell previously said
# epochs=1, which would not reproduce them). 20% of the training data is held
# out for validation. Keeping the returned History object makes the per-epoch
# metrics available afterwards and avoids echoing its repr as cell output.
history = model.fit(X_train, Y_train, batch_size=32, epochs=5, validation_split=0.2)

Epoch 1/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m525s[0m 525ms/step - accuracy: 0.7622 - loss: 0.5053 - val_accuracy: 0.8456 - val_loss: 0.3604
Epoch 2/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 546ms/step - accuracy: 0.8259 - loss: 0.3968 - val_accuracy: 0.8465 - val_loss: 0.3493
Epoch 3/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m555s[0m 540ms/step - accuracy: 0.8645 - loss: 0.3265 - val_accuracy: 0.8752 - val_loss: 0.3071
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m539s[0m 517ms/step - accuracy: 0.8989 - loss: 0.2522 - val_accuracy: 0.8690 - val_loss: 0.3121
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m585s[0m 539ms/step - accuracy: 0.9118 - loss: 0.2243 - val_accuracy: 0.8851 - val_loss: 0.2970


<keras.src.callbacks.history.History at 0x7beb23687b50>

Model Evaluation

In [None]:
# Score the held-out test split; with metrics=['accuracy'] compiled in,
# evaluate returns the loss followed by the accuracy.
loss, accuracy = model.evaluate(X_test, Y_test)
for label, value in (('Test Loss:', loss), ('Test Accuracy:', accuracy)):
  print(label, value)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 122ms/step - accuracy: 0.8853 - loss: 0.2788
Test Loss: 0.27828001976013184
Test Accuracy: 0.8888999819755554


In [None]:
 # Predict on the test set and evaluate using a confusion matrix.
# The statements below all start at column 0: the original cell indented the
# first statement by one space but not the rest, which is an IndentationError
# when the code is run outside IPython's lenient cell dedenting.
# A review is labelled positive (1) when its predicted probability exceeds 0.5.
Y_pred = (model.predict(X_test) > 0.5).astype(int)
# Rows are the true labels and columns the predicted labels.
cm = confusion_matrix(Y_test, Y_pred)
print('Confusion Matrix:')
print(cm)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 93ms/step
Confusion Matrix:
[[4410  551]
 [ 560 4479]]


In [None]:
#Performance metrics
report = classification_report(Y_test, Y_pred)
print('Classification Report:')
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89      4961
           1       0.89      0.89      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



Building a Predictive System

In [None]:
def predict_sentiment(review, threshold=0.5):
  """Classify a single raw review string as 'positive' or 'negative'.

  The review is encoded with the notebook-level ``tokenizer`` and padded to
  200 tokens (the same ``maxlen`` used for the training data), then scored by
  the notebook-level ``model``.

  Args:
    review: Raw review text.
    threshold: Probability cut-off for the positive class. The default of 0.5
      matches the cut-off used when evaluating on the test set.

  Returns:
    'positive' if the predicted probability is strictly greater than
    ``threshold``, otherwise 'negative'.
  """
  # texts_to_sequences expects a list of texts, so wrap the single review.
  review_sequence = tokenizer.texts_to_sequences([review])
  review_padded = pad_sequences(review_sequence, maxlen=200)
  # One input row with one output unit: the score lives at [0][0].
  prediction = model.predict(review_padded)
  probability = prediction[0][0]
  return 'positive' if probability > threshold else 'negative'

In [None]:
# example usage
new_review = "This movie was fantastic! I loved every minute of it."
predicted_sentiment = predict_sentiment(new_review)
print("Predicted sentiment:", predicted_sentiment)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
Predicted sentiment: positive
