<a href="https://colab.research.google.com/github/Seenuprime/Deep-Learning/blob/main/IMDB_WordEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import pandas as pd
import zipfile

In [2]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
100% 25.7M/25.7M [00:01<00:00, 33.7MB/s]
100% 25.7M/25.7M [00:01<00:00, 21.7MB/s]


In [3]:
zip_dir = zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip')
zip_dir.extractall()
zip_dir.close()

In [68]:
df = pd.read_csv(r'/content/IMDB Dataset.csv')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [69]:
import re
df['review'] = df.review.map(lambda x: re.sub("[^a-zA-Z']", ' ', x))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,Petter Mattei's Love in the Time of Money is...,positive


In [70]:
df['review'] = df.review.map(lambda x: x.replace('br', ' '))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The f...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,Petter Mattei's Love in the Time of Money is...,positive


In [71]:
len(df['review'][0])

1754

In [72]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [73]:
df['review'] = df['review'].apply(lambda x: ' '.join(word for word in word_tokenize(x) if x not in stopwords))
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there 's a family where a little boy...,negative
4,Petter Mattei 's Love in the Time of Money is ...,positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I 'm going to have to disagree with the previo...,negative


In [74]:
len(df['review'][0])

1680

In [75]:
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [76]:
X = df['review']
y = df['sentiment']

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [78]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [79]:
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [80]:
len(X_train), len(X_test)

(40000, 10000)

In [81]:
vocab_size = 10000
sequence_len = 100

In [82]:
vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=vocab_size,
        output_mode = 'int',
        output_sequence_length=sequence_len)

vectorizer.adapt(df['review'])

In [83]:
vectorizer.vocabulary_size()

10000

In [84]:
Embedding = tf.keras.layers.Embedding(input_dim=vocab_size,
                                      output_dim=128,
                                      embeddings_initializer='uniform')

In [85]:
sample_embed = Embedding(vectorizer(['One of the other reviewers has mentioned']))
sample_embed

<tf.Tensor: shape=(1, 100, 128), dtype=float32, numpy=
array([[[ 0.04410741,  0.0309742 ,  0.02332336, ..., -0.04401497,
         -0.04824281,  0.04803634],
        [ 0.02284516, -0.0494113 ,  0.03385242, ..., -0.01371858,
         -0.00631809,  0.04938685],
        [ 0.03235071,  0.04881194,  0.03519031, ...,  0.00993924,
          0.0187796 , -0.0053378 ],
        ...,
        [-0.0480024 ,  0.04931192, -0.02571647, ...,  0.01014579,
         -0.01406384, -0.03754685],
        [-0.0480024 ,  0.04931192, -0.02571647, ...,  0.01014579,
         -0.01406384, -0.03754685],
        [-0.0480024 ,  0.04931192, -0.02571647, ...,  0.01014579,
         -0.01406384, -0.03754685]]], dtype=float32)>

In [86]:
from tensorflow.keras import layers

In [87]:
inputs = layers.Input(shape=(1, ), dtype='string')

x = vectorizer(inputs)
x = Embedding(x)

x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(32, activation='relu'))(x)
x = layers.Dense(32, activation='relu')(x)

outputs = layers.Dense(1, activation='relu')(x)

model = tf.keras.Model(inputs, outputs)

In [88]:
model.summary()

In [89]:
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [90]:
model.fit(X_train, y_train,
          batch_size=32,
          epochs=5)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 282ms/step - accuracy: 0.6385 - loss: 1.5160
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 283ms/step - accuracy: 0.7628 - loss: 1.6787
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 297ms/step - accuracy: 0.7405 - loss: 1.0514
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 307ms/step - accuracy: 0.7839 - loss: 1.1853
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m438s[0m 304ms/step - accuracy: 0.8035 - loss: 1.5801


<keras.src.callbacks.history.History at 0x786338f6c340>

In [91]:
model.evaluate(X_test, y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 85ms/step - accuracy: 0.4978 - loss: 8.0947


[8.121909141540527, 0.4961000084877014]

In [92]:
preds = model.predict(X_test)
preds[:5]

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step


array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

In [93]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, preds))

0.4961
