In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D, MaxPooling1D, Conv1D, Bidirectional
from tensorflow.keras.layers import Embedding
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('/kaggle/input/zomato-reviews-ratings/zomato_reviews.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,rating,review
0,0,5,nice
1,1,5,"best biryani , so supportive staff of outlet ,..."
2,2,4,delivery boy was very decent and supportive.👌👍
3,3,1,"worst biryani i have tasted in my life, half o..."
4,4,5,all food is good and tasty . will order again ...


In [3]:
print("Number of rows in data:", df.shape[0])
print("Number of columns in data", df.shape[1])

Number of rows in data: 5479
Number of columns in data 3


In [4]:
df.rating.value_counts()

rating
5    2288
1    1891
3     474
4     458
2     368
Name: count, dtype: int64

In [5]:
df.isnull().sum()

Unnamed: 0    0
rating        0
review        1
dtype: int64

In [6]:
df = df.dropna()

In [8]:
df.isnull().sum()

Unnamed: 0    0
rating        0
review        0
dtype: int64

In [9]:
df = df.drop(columns=['Unnamed: 0'])

In [10]:
df.rating = df.rating.replace([1,2,3],0)
df.rating = df.rating.replace([4,5],1)

In [11]:
df.head()

Unnamed: 0,rating,review
0,1,nice
1,1,"best biryani , so supportive staff of outlet ,..."
2,1,delivery boy was very decent and supportive.👌👍
3,0,"worst biryani i have tasted in my life, half o..."
4,1,all food is good and tasty . will order again ...


In [12]:
df.rating.value_counts()

rating
1    2745
0    2733
Name: count, dtype: int64

In [13]:
X = df["review"]
y = df["rating"]

In [14]:
X.head()

0                                                 nice
1    best biryani , so supportive staff of outlet ,...
2       delivery boy was very decent and supportive.👌👍
3    worst biryani i have tasted in my life, half o...
4    all food is good and tasty . will order again ...
Name: review, dtype: object

In [15]:
def stringprocess(text):
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    
    return text

In [29]:
!pip install nltk



In [30]:
import nltk

# Download the WordNet corpus
nltk.download('wordnet')

# You may also want to download the punkt tokenizer and omw-1.4 for word tokenization and lemmatization support
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [31]:
from string import digits
import string
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def textpreprocess(text):
    text = map(lambda x: x.lower(), text) # Lower case
    text = map(lambda x: re.sub(r"https?://\S+|www\.\S+", "", x), text) # Remove Links
    text = map(lambda x: re.sub(re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});"),"", x), text) # Remove html tags
    text = map(lambda x: re.sub(r'[^\x00-\x7f]',r' ', x), text) # Remove non-ASCII characters 
    # Remove special special characters, including symbols, emojis, and other graphic characters
    emoji_pattern = re.compile(
            '['
            u'\U0001F600-\U0001F64F'  # emoticons
            u'\U0001F300-\U0001F5FF'  # symbols & pictographs
            u'\U0001F680-\U0001F6FF'  # transport & map symbols
            u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
            u'\U00002702-\U000027B0'
            u'\U000024C2-\U0001F251'
            ']+',
            flags=re.UNICODE)

    text = map(lambda x: emoji_pattern.sub(r'', x), text) 
    text = map(lambda x: x.translate(str.maketrans('', '', string.punctuation)), text) # Remove punctuations
    
#     text = text.apply(lambda x: TextBlob(x).correct()) # Spelling correction
    
    remove_digits = str.maketrans('', '', digits)
    text = [i.translate(remove_digits) for i in text]
    text = [w for w in text if not w in stop_words]
    text = ' '.join([lemmatizer.lemmatize(w) for w in text])
    text = text.strip()
    return text


In [32]:
X = X.apply(lambda x: stringprocess(x))
word_tokens = X.apply(lambda x: word_tokenize(x))

preprocess_text = word_tokens.apply(lambda x: textpreprocess(x))

LookupError: 
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
  Searched in:
    - '/root/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************

In [33]:
training_portion = 0.9
train_size = int(len(preprocess_text) * training_portion)

train_data = preprocess_text[0: train_size]
train_labels = np.array(y[0: train_size])

validation_data = preprocess_text[train_size:]
validation_labels = np.array(y[train_size:])


print(len(train_data))
print(len(train_labels))
print(len(validation_data))
print(len(validation_labels))

NameError: name 'preprocess_text' is not defined

In [34]:
vocab_size = 5000
oov_tok = '<OOV>'

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(preprocess_text)
word_index = tokenizer.word_index
dict(list(word_index.items())[0:10])

NameError: name 'preprocess_text' is not defined

In [35]:
train_sequences = tokenizer.texts_to_sequences(train_data)
print(train_sequences[10])

NameError: name 'train_data' is not defined

In [36]:
embedding_dim = 32
max_length = 70
trunc_type = 'post'  # remove or truncate last words in sentences if max_length > 50 ans "post" defined last at sentence
padding_type = 'post'

In [37]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(train_sequences[1]))
print(len(train_padded[1]))
print(train_padded.shape)

NameError: name 'train_sequences' is not defined

In [38]:
validation_sequences = tokenizer.texts_to_sequences(validation_data)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

NameError: name 'validation_data' is not defined

In [39]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_data(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_data(train_padded[10]))
print('---')
print(train_data[10])

NameError: name 'word_index' is not defined

In [40]:
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=256))
model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32,return_sequences=True)))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.4))
model.add(LSTM(32,return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [41]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 20
history = model.fit(train_padded, train_labels, epochs=num_epochs, verbose=2, validation_data=(validation_padded, validation_labels))

NameError: name 'train_padded' is not defined

In [None]:
string = ['worst biryani i have tasted in my life and food quality was also bad']
#vectorizing the tweet by the pre-fitted tokenizer instance
token = tokenizer.texts_to_sequences(string)
#padding the tweet to have exactly the same shape as `embedding_2` input
token_list = pad_sequences(token, maxlen=max_length-1, padding=padding_type, truncating=trunc_type)
# print(token_list)
sentiment = model.predict(token_list,batch_size=2,verbose = 2)[0]
print(string)
if(np.argmax(sentiment) == 0):
    print("negative")
elif (np.argmax(sentiment) == 1):
    print("positive")