In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense,LSTM,Embedding,Dropout,Bidirectional,GlobalMaxPool1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [22]:
import pandas as pd
import numpy as np
import os ,re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [3]:
# Initialize the shared NLP resources used by the text-cleaning step.
# Stop words are held in a set so membership tests stay fast.
stop_words = set(stopwords.words("english"))
# NOTE: despite its name, `stemmer` is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

In [4]:
# Load the raw IMDB reviews (columns used later: 'review', 'sentiment').
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a saved index -- consider index_col=0 if it is not needed.
data = pd.read_csv(filepath_or_buffer="datasets/imdb.csv")

In [5]:
# Preview the first five rows to confirm the expected columns loaded.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
# Binary label: 1 for "positive" reviews, 0 for every other sentiment value.
is_positive = data['sentiment'] == "positive"
data['target'] = is_positive.astype("int64")

In [7]:
def clean_text(text, lemmatizer=None, stopword_set=None):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps: replace HTML ``<br />`` tags with a space, drop every character
    that is neither a word character nor whitespace, lowercase, split on
    whitespace, discard stop words, and lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw review text.
    lemmatizer : object, optional
        Anything exposing ``lemmatize(word) -> str``. Defaults to the
        notebook-level ``stemmer``.
    stopword_set : collection of str, optional
        Words to discard. Defaults to the notebook-level ``stop_words``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ("" when nothing survives).
    """
    if lemmatizer is None:
        lemmatizer = stemmer
    if stopword_set is None:
        stopword_set = stop_words

    # Fix: the punctuation regex used to run on `text` again, which discarded
    # the <br /> removal and left a stray "br" token for every line break.
    # Substituting a space (not "") keeps the words around the tag apart.
    words = re.sub(r"<br />", " ", text)
    words = re.sub(r"[^\w\s]", "", words)
    words = words.lower()

    cleaned = []
    # split() without an argument collapses whitespace runs, so no empty tokens.
    for word in words.split():
        # Test the raw token first: lemmatizing may alter a stop word into a
        # form that is no longer present in the stop list.
        if word in stopword_set:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Keep the original post-lemmatization check as well.
        if lemma not in stopword_set:
            cleaned.append(lemma)
    return " ".join(cleaned)

In [8]:
# Clean every review; the result lands in a separate 'new' column so the raw
# 'review' text stays available.
data['new'] = data['review'].map(clean_text)

In [9]:
# Vocabulary cap handed to the tokenizer (and reused as the embedding's
# input dimension further down).
max_features = 6000
token = Tokenizer(num_words=max_features)

In [10]:
# Learn the vocabulary from the cleaned reviews, then encode each review as a
# list of integer word indices.
# NOTE(review): the vocabulary is fitted on every review, including those that
# later land in the test split; `train` therefore holds ALL encoded reviews.
cleaned_reviews = data['new']
token.fit_on_texts(cleaned_reviews)
train = token.texts_to_sequences(cleaned_reviews)

In [11]:
# Fixed length every encoded review is padded/truncated to.
max_len = 130

In [12]:
# Turn the ragged list of index sequences into a 2-D array whose rows all
# have length max_len.
x = pad_sequences(sequences=train, maxlen=max_len)

In [13]:
# Labels aligned row-for-row with x (1 = positive, 0 = otherwise).
y = data['target']

In [14]:
# Dimensionality of the learned word embeddings.
embed_size = 128

In [18]:
# Sentiment classifier: embedding -> bidirectional LSTM -> max-pool over the
# sequence axis -> small dense head -> single sigmoid output.
model = Sequential([
    Embedding(max_features, embed_size),
    Bidirectional(LSTM(32, return_sequences=True)),  # keep per-step outputs for pooling
    GlobalMaxPool1D(),
    Dense(20, activation='relu'),
    Dropout(0.05),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [23]:
# Hold out 30% of the reviews for the final evaluation.
# Fix: the split was unseeded, so every run produced a different partition and
# the reported accuracy could not be reproduced. A fixed seed makes it
# repeatable; stratifying on y keeps the class ratio equal in both parts.
SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=SEED,
    stratify=y,
)

In [24]:
# Sanity check: (number of training reviews, padded sequence length).
x_train.shape

(35000, 130)

In [26]:
# Train for 3 epochs in mini-batches of 100, holding out 20% of the training
# rows for per-epoch validation.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=100,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x4f97cd08>

In [27]:
# Score the held-out test split; evaluate returns the loss followed by the
# metrics passed to compile (here: accuracy).
loss, acc = model.evaluate(x=x_test, y=y_test)




In [28]:
# Display the test-set accuracy returned by model.evaluate.
acc

0.9176666736602783

In [29]:
# Persist the trained model in HDF5 format.
# Create the target directory first: on a fresh checkout "models/" may not
# exist, and not every Keras version creates missing parent directories.
# NOTE(review): the fitted tokenizer is not saved here; inference on new text
# will need the same word index -- consider persisting `token` as well.
model_path = "models/imdb.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)