In [33]:
PATH = r'./dataset/IMDB Dataset.csv'

In [34]:
import pandas as pd

df = pd.read_csv(PATH, on_bad_lines='skip')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [35]:
df.shape

(50000, 2)

In [36]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [37]:
print(f"Null Sentiments Values {len(df[pd.isnull(df['sentiment'])])}")
print(f"Null Review Values {len(df[pd.isnull(df['review'])])}")


Null Sentiments Values 0
Null Review Values 0


In [38]:
df.dropna(inplace=True)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [39]:
df.fillna({'sentiment' : '', 'review' : ''}, inplace=True)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [40]:
import re
def regex(data : str):
    data = re.sub(r'<.*?>', '', data)  
    data = re.sub(r'[^a-zA-Z\s]', '', data)   
    data = data.lower()
    data = data.strip()

    return data

In [41]:
df['sentiment'] = df['sentiment'].apply(regex)
df['review'] = df['review'].apply(regex)

df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [42]:
df['feedback'] = df['sentiment'].apply(lambda x : 1 if x == 'positive' else 0)
df['feedback']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: feedback, Length: 50000, dtype: int64

In [43]:
df.head()

Unnamed: 0,review,sentiment,feedback
0,one of the other reviewers has mentioned that ...,positive,1
1,a wonderful little production the filming tech...,positive,1
2,i thought this was a wonderful way to spend ti...,positive,1
3,basically theres a family where a little boy j...,negative,0
4,petter matteis love in the time of money is a ...,positive,1


In [44]:
from tensorflow.keras.layers import TextVectorization
MAX_SEQUENCE_LENGTH = 500


vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=MAX_SEQUENCE_LENGTH)
unique_words = vectorizer.adapt(df['review'])
print(f"Length : {len(vectorizer.get_vocabulary())}")
print(f"Type : {type(vectorizer.get_vocabulary())}")

# vectorizer.get_vocabulary()
seq = vectorizer(df['review'])

import numpy as np
x = np.array(seq)
y = df['feedback']

Length : 20000
Type : <class 'list'>


In [45]:
import numpy as np


from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(x, y, 
                                                    test_size=0.2, 
                                                    train_size=0.8, 
                                                    random_state=42, 
                                                    shuffle=True)



In [46]:
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
model = Sequential()
model.add(Embedding(input_dim=len(vectorizer.get_vocabulary()) + 1, output_dim=128, input_shape=(MAX_SEQUENCE_LENGTH, )))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


In [47]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

model.fit(
            X_train, Y_train,
            epochs=50,
            batch_size=128,
            validation_data=(X_test, Y_test),
            verbose=1
        )

Epoch 1/50
[1m202/313[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m3:47[0m 2s/step - accuracy: 0.5007 - loss: 0.6934

KeyboardInterrupt: 