<a href="https://colab.research.google.com/github/Siddhant-Anand/All-In-One-DSA/blob/main/DeepLearningProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import re
import pandas as pd
import numpy as np
import csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
import keras
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import math
import nltk

In [127]:
data = pd.read_csv('IMDB Dataset.csv', engine="python",  encoding='utf-8',on_bad_lines='skip')
data.columns=['review','sentiment','xx','yy']
data.drop(data.columns[[2, 3]], axis=1, inplace=True)
dt=data.head()
dt

Unnamed: 0,review,sentiment
0,"Okay, as a long time Disney fan, I really -hat...",negative
1,What was an exciting and fairly original serie...,negative
2,Cast to die for in a movie that is considerabl...,negative
3,"An ""independant"" film that, from the back of t...",negative
4,I just finished watching this movie. It wasn't...,negative
...,...,...
46014,I thought this movie did a down right good job...,positive
46015,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
46016,I am a Catholic taught in parochial elementary...,negative
46017,I'm going to have to disagree with the previou...,negative


In [112]:
data.shape[0]

46019

In [128]:
def remove_tags(string):
    removelist = ""
    result = re.sub(r'<.*?>', '', string)  # Remove HTML tags
    result = re.sub(r'https://.*', '', result)  # Remove URLs
    result = re.sub(r'[\W'+removelist+']', ' ', result)  # Remove non-alphanumeric characters
    result = result.lower()
    return result

data['review'] = data['review'].apply(lambda cw: remove_tags(cw))



In [130]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['review'] = data['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
nltk.download('all')

In [132]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()
def lemmatize_text(text):
   st = ""
   for i in w_tokenizer.tokenize(text):
       st = st + lemmatizer.lemmatize(i) + " "
   return st
data['review'] = data.review.apply(lemmatize_text)


In [134]:
s = 0.0
for i in data['review']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each review : ",s/data.shape[0])
pos = 0
for i in range(data.shape[0]):
    if data.iloc[i]['sentiment'] == 'positive':
        pos = pos + 1
neg = data.shape[0]-pos
print("Percentage of reviews with positive sentiment is "+str(pos/data.shape[0]*100)+"%")
print("Percentage of reviews with negative sentiment is "+str(neg/data.shape[0]*100)+"%")

Average length of each review :  119.6216997327191
Percentage of reviews with positive sentiment is 50.04889284860601%
Percentage of reviews with negative sentiment is 49.95110715139399%


In [135]:
reviews = data['review'].values
labels = data['sentiment'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)


In [136]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(reviews, encoded_labels, stratify = encoded_labels)

In [137]:
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [138]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# model summary
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 200, 100)          300000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              84480     
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 24)                3096      
                                                                 
 dense_5 (Dense)             (None, 1)                 25        
                                                                 
Total params: 387,601
Trainable params: 387,601
Non-trainable params: 0
_________________________________________________________________


In [139]:
num_epochs = 5
history = model.fit(train_padded, train_labels,
                    epochs=num_epochs, verbose=1,
                    validation_split=0.1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [147]:
# reviews on which we need to predict
sentence = ["The movie was very touching and heart whelming",
            "terrible",
            "movie was average"]
# convert to a sequence
sequences = tokenizer.texts_to_sequences(sentence)
# pad the sequence
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
# Get labels based on probability 1 if p>= 0.5 else 0
prediction = model.predict(padded)
pred_labels = []
for i in prediction:
    if i>=0.3 and i<0.7:
        pred_labels.append(2)
    elif i >= 0.7:
        pred_labels.append(1)
    else:
      pred_labels.append(3)
for i in range(len(sentence)):
    print(sentence[i])
    if pred_labels[i] == 1:
        s = 'Positive'
    elif pred_labels[i] == 2:
        s='Average'
    else:
        s = 'Negative'
    print("Predicted sentiment : ",s)

[0.8748184] The movie was very touching and heart whelming
[0.04156353] terrible
[0.5603836] movie was average
The movie was very touching and heart whelming
Predicted sentiment :  Positive
terrible
Predicted sentiment :  Negative
movie was average
Predicted sentiment :  Average
