# Natural Language Processing

# Install libraries

In [1]:
!pip install keras
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install nltk
!pip install sklearn



# Machine learning

## Importing the libraries

In [65]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [66]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [67]:
dataset.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


## Cleaning the texts

In [68]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jawahar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

## Splitting the dataset into the Training set and Test set

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [71]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [72]:
print(X_test.size)

300000


## Predicting the Test set results

In [73]:
y_pred = classifier.predict(X_test)

In [74]:
print(y_pred)

[1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1
 0 1 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1
 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0
 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1
 1 1 1 0 1 1 1 0 1 1 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 0 1 0
 1 0 1 0 1 1 0 1 1 1 0 1 1 1 1]


## Making the Confusion Matrix

In [78]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[55 42]
 [12 91]]


In [79]:
from sklearn.metrics import accuracy_score
print("Model accuracy using Naive Bayes model -- ",accuracy_score(y_test, y_pred))

Model accuracy using Naive Bayes model --  0.73


# Neural networks

In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd

df = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
#df.columns = ["label","text"]
x = df['Review'].values
y = df['Liked'].values

x_train, x_test, y_train, y_test = \
 train_test_split(x, y, test_size=0.1, random_state=123)
#print(x_test)

tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(x)
xtrain= tokenizer.texts_to_sequences(x_train)
xtest= tokenizer.texts_to_sequences(x_test)

vocab_size=len(tokenizer.word_index)+1

maxlen=10
xtrain=pad_sequences(xtrain,padding='post', maxlen=maxlen)
xtest=pad_sequences(xtest,padding='post', maxlen=maxlen) 
 
print(x_train[3])
print(xtrain[3])

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


The service was great, even the manager came and helped with our table.
[ 1 17  4 22 90  1 78  2 21 41]


# Model architecture

In [2]:
embedding_dim=50

model=Sequential()
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(layers.LSTM(units=50,return_sequences=True))
model.add(layers.LSTM(units=10))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(8))
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 50)            103600    
_________________________________________________________________
lstm_1 (LSTM)                (None, 10, 50)            20200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                2440      
_________________________________________________________________
dropout_1 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 88        
___________________________________________________________

# Model training

In [3]:
model.fit(xtrain,y_train, epochs=100, batch_size=32, verbose=True)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fae451bda90>

# Model predicions

In [84]:
loss, acc = model.evaluate(xtrain, y_train, verbose=False)
print("Training Accuracy: ", acc.round(2))
loss, acc = model.evaluate(xtest, y_test, verbose=False)
print("Test Accuracy: ", acc.round(2))

ypred=model.predict(xtest)

ypred[ypred>0.5]=1 
ypred[ypred<=0.5]=0 
cm = confusion_matrix(y_test, ypred)
print(cm)

result=zip(x_test, y_test, ypred)

for i in result:
    print(i)

Training Accuracy:  0.8
Test Accuracy:  0.7
[[37  8]
 [22 33]]
('Penne vodka excellent!', 1, array([0.], dtype=float32))
('Great brunch spot.', 1, array([1.], dtype=float32))
('We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.', 1, array([1.], dtype=float32))
('Great food and great service in a clean and friendly setting.', 1, array([1.], dtype=float32))
('He was extremely rude and really, there are so many other restaurants I would love to dine at during a weekend in Vegas.', 0, array([1.], dtype=float32))
('Same evening, him and I are both drastically sick.', 0, array([0.], dtype=float32))
("I go to far too many places and I've never seen any restaurant that serves a 1 egg breakfast, especially for $4.00.", 0, array([0.], dtype=float32))
('The vanilla ice cream was creamy and smooth while the profiterole (choux) pastry was fresh enough.', 1, array([0.], dtype=float32))
('However, there was so much garlic in the fondue, it was barely ed

In [5]:
model.save("model.hdf5")

# predicitions for new text

In [85]:
def predict_review_class(review: str):
    tokenizer = Tokenizer(num_words=100)
    tokenizer.fit_on_texts(x)
    review_seq= tokenizer.texts_to_sequences([review])
    vocab_size=len(tokenizer.word_index)+1
    maxlen=10
    review_seq=pad_sequences(review_seq,padding='post', maxlen=maxlen) 
    result = model.predict_classes(review_seq)
    if result == 1:
        print("Good review, well done.")
    else:
        print("Bad review.")

In [86]:
predict_review_class("this is so cool")

Good review, well done.
