In [1]:
import pandas as pd 
import numpy as np

In [2]:
df=pd.read_csv('IMDB Dataset.csv')

In [3]:
df.sample(5)

Unnamed: 0,review,sentiment
21508,"One night, barkeeper Randy (Matt Dillon) rescu...",positive
33750,"Reading the other user comments, the review by...",negative
44874,Foolish hikers go camping in the Utah mountain...,negative
18508,Acolytes presents an interesting mix of origin...,negative
36727,"...this is, above all else, the typical Crown ...",positive


In [4]:
df['sentiment']=df['sentiment'].map({'negative':0,'positive':1})

In [5]:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))
punctuations="./><''!@#$%^&*()_=+,"

def remove_stop_words(x):
    word_tokens = word_tokenize(x)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence1 = [w for w in filtered_sentence  if not w.lower() in punctuations]
    
    return " ".join(filtered_sentence1).lower()

    
    

In [6]:
df['review']=df['review'].apply(remove_stop_words)

In [7]:

from nltk.stem import WordNetLemmatizer
  
lemmatizer = WordNetLemmatizer()

  
def lemmatization_corpus(x):
    word_tokens = word_tokenize(x)
    res=[]
    for i in word_tokens:
        res.append(lemmatizer.lemmatize(i))
    return " ".join(res)
        
    

In [8]:
df['review']=df['review'].apply(lemmatization_corpus)

In [9]:
df.sample(5)

Unnamed: 0,review,sentiment
49368,`` mask moving film work many level simplest h...,1
6649,saw movie first time surprised little shocked ...,1
43876,... disney film garbage . br br anyway saw `` ...,0
18026,saw film chance small box fantastic chilling s...,1
14666,think problem show getting respect truly deser...,1


In [10]:
data_0 = df[df['sentiment'] == 0]
data_1 = df[df['sentiment'] == 1]

train_size = int(0.7*25000)
val_size = int(0.2*25000)

data_train = pd.concat((data_0[:train_size], data_1[:train_size]), axis = 0)
data_val = pd.concat((data_0[train_size: (train_size + val_size)], data_1[train_size:(train_size + val_size)]), axis = 0)
data_test = pd.concat((data_0[(train_size + val_size):], data_1[(train_size + val_size):]), axis = 0)

X_train, y_train = list(data_train['review']), np.array(data_train['sentiment'])
X_val, y_val = list(data_val['review']), np.array(data_val['sentiment'])
X_test, y_test = list(data_test['review']), np.array(data_test['sentiment'])

print('Train size:', len(X_train))
print('Validation size: ', len(X_val))
print('Test size: ', len(X_test))

Train size: 35000
Validation size:  10000
Test size:  5000


In [11]:
vocab_size = 10000
max_length = 500
trunc_type = 'post'
oov_tok = 'OOV'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
token = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
token.fit_on_texts(X_train)
index_word = token.index_word

# Convert texts to sequences
train_seq = token.texts_to_sequences(X_train)
val_seq = token.texts_to_sequences(X_val)
test_seq = token.texts_to_sequences(X_test)

train_pad = pad_sequences(train_seq, maxlen = max_length, padding = 'post', truncating = trunc_type)
val_pad = pad_sequences(val_seq, maxlen = max_length, padding = 'post', truncating = trunc_type)
test_pad = pad_sequences(test_seq, maxlen = max_length, padding = 'post', truncating = trunc_type)

In [12]:
p = np.random.permutation(len(train_pad))
train_pad = train_pad[p]
y_train = y_train[p]

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, AveragePooling1D, Bidirectional, LSTM, Dense,Flatten,LeakyReLU
from tensorflow.keras.utils import plot_model

In [None]:
embedding_dim = 64

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(Conv1D(512, kernel_size=5))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(1, 'sigmoid'))

model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

H = model.fit(train_pad, y_train, epochs = 10, batch_size = 128,
             validation_data = (val_pad, y_val) )

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 64)           640000    
                                                                 
 conv1d (Conv1D)             (None, 496, 512)          164352    
                                                                 
 flatten (Flatten)           (None, 253952)            0         
                                                                 
 dense (Dense)               (None, 128)               32505984  
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 33,318,657
Trainable params: 33,318,657
No