<a href="https://colab.research.google.com/github/Rajeeb321123/Machine-learning-Journey/blob/master/21_natural_language_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [3]:
# quoting = 3 is csv.QUOTE_NONE: double quotes inside the reviews are read as ordinary characters instead of as field quoting
# in a .tsv file the values are separated by tabs ('\t')
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep = '\t', quoting = 3)

In [4]:
dataset.head(10) # peek at the first ten rows

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


## Cleaning the texts


>`Cleaning the reviews makes them easier for the NLP model to learn from`

In [5]:
dataset["Review"].head(4) # first 4 rows of the Review column

0                             Wow... Loved this place.
1                                   Crust is not good.
2            Not tasty and the texture was just nasty.
3    Stopped by during the late May bank holiday of...
Name: Review, dtype: object

In [6]:
import re # regular expressions, used to simplify the restaurant reviews
import nltk
nltk.download('stopwords') # stop-word list (words that are not important for the review prediction, like a, the, and)
from nltk.corpus import stopwords
# Stemming reduces a word to its stem (e.g. in "I loved the dog", "loved" becomes "love"), which keeps the reviews simple.
# It matters because the reviews end up as a sparse matrix of word counts, e.g. [1,0,3,4,0,0,9,....],
# and merging the variants of a word keeps the dimension of that matrix small.
# e.g. we do not want two separate columns for "loved" and "love", so both are stemmed to "love".
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
# Build `corpus`: one cleaned string per review, in the same order as the dataset rows.
# Cleaning = keep letters only, lowercase, drop English stop words (a, the, and, I, she, he, ...)
# except 'not', then stem each remaining word.
# Each entry of corpus later becomes one row of the sparse bag-of-words matrix.

# None of these change between reviews, so they are created once instead of on every iteration.
ps = PorterStemmer() # stemming: very important for lower dimensionality of the sparse matrix
all_stopwords = stopwords.words('english')
all_stopwords.remove('not') # we want to keep 'not' in our reviews: it flips the meaning of what follows
stopword_set = set(all_stopwords) # built once rather than once per word; set lookups are fast

corpus = []
# Iterate over the column itself rather than range(0, 1000): every review is cleaned
# whatever the number of rows, so corpus always has one entry per dataset row.
for raw_review in dataset['Review']:
  # '^' inside [...] means "not": anything that is not A-Z or a-z is replaced by a space,
  # so punctuation disappears without merging two neighbouring words
  letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
  words = letters_only.lower().split() # all lowercase, then split into a list of words
  # keep only the words that are not stop words and stem each of them (similar to filter + map in JS)
  stemmed_words = [ps.stem(word) for word in words if word not in stopword_set]
  corpus.append(' '.join(stemmed_words)) # join with a single space to get the full cleaned review

In [8]:
corpus[:10] # the first ten reviews after cleaning

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

## Creating the Bag of Words model

In [9]:
# Bag of Words: tokenize the cleaned reviews and count how often each token occurs
from sklearn.feature_extraction.text import CountVectorizer

# Some of the remaining words say nothing about whether a review is positive or negative
# (e.g. steve, holiday, textur), so only the 1500 most frequent tokens are kept;
# this also caps the width of the sparse matrix.
cv = CountVectorizer(max_features = 1500)

# learn the vocabulary and build the sparse count matrix, then convert it to a regular array
# (the Naive Bayes model trained below needs a 2D array)
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()
# labels: the last column of the dataset
y = dataset.iloc[:, -1].values

In [10]:
X.shape[1] # number of columns (tokens) in the bag-of-words matrix

1500

## Splitting the dataset into the Training set and Test set

In [11]:
from sklearn.model_selection import train_test_split

# hold out 25% of the reviews as the test set; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 0,
)

## Training the Naive Bayes model on the Training set

In [12]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes trained on the training split
classifier = GaussianNB()
classifier.fit(X = X_train, y = y_train)

## Predicting the Test set results

In [13]:
y_pred = classifier.predict(X_test) # predicted labels for the test-set reviews

In [14]:
# side by side: left column = predicted label, right column = true label
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [0 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

# confusion matrix of the true test labels against the Naive Bayes predictions
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
# bare last expression, so the notebook displays the accuracy
accuracy_score(y_true = y_test, y_pred = y_pred)

[[ 67  50]
 [ 20 113]]


0.72

## Predicting with our own review

In [16]:
# clean a single hand-written review with the same steps as the corpus (no for loop needed), then classify it
new_review = 'I love this restaurant so much'
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)
new_words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_words if word not in stopword_set)
new_corpus = [new_review]
# only transform, not fit_transform: the vocabulary already fitted on the corpus is reused
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]


In [18]:
# same cleaning and prediction steps, this time for a negative review
new_review = 'I hate this restaurant so much'
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
stopword_set = set(all_stopwords)
new_words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = ' '.join(ps.stem(word) for word in new_words if word not in stopword_set)
new_corpus = [new_review]
# transform only: the vocabulary already fitted on the corpus is reused
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]


## Exercise: try some models other than Naive Bayes

### K-Nearest Neighbors

In [20]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; the Minkowski metric with p = 2 is the Euclidean distance
classifier = KNeighborsClassifier(
    n_neighbors = 5,
    metric = 'minkowski',
    p = 2,
)
classifier.fit(X = X_train, y = y_train)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix

# evaluate the K-nearest-neighbours classifier on the test set
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = y_pred)

[[98 19]
 [67 66]]


0.656

### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# forest of 10 trees split on entropy; random_state makes the run repeatable
classifier = RandomForestClassifier(
    n_estimators = 10,
    criterion = 'entropy',
    random_state = 0,
).fit(X_train, y_train)

# evaluate on the test set
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = y_pred)

[[101  16]
 [ 54  79]]


0.72

### Kernel SVM

In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# support vector classifier with the RBF kernel
classifier = SVC(kernel = 'rbf', random_state = 0).fit(X_train, y_train)

# evaluate on the test set
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = y_pred)

[[108   9]
 [ 51  82]]


0.76

### Decision Tree

In [24]:
from sklearn.tree import DecisionTreeClassifier

# single decision tree split on entropy; random_state makes the run repeatable
classifier = DecisionTreeClassifier(
    criterion = 'entropy',
    random_state = 0,
)
classifier.fit(X = X_train, y = y_train)

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix

# evaluate the decision tree on the test set
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true = y_test, y_pred = y_pred)
print(cm)
accuracy_score(y_true = y_test, y_pred = y_pred)

[[91 26]
 [42 91]]


0.728