# Restaurant Reviews Classification with NLTK

## Business Problem

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:

data = pd.read_csv("data/Restaurant_Reviews.tsv", sep='\t', quoting=3)


In [3]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
data.tail()

Unnamed: 0,Review,Liked
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [5]:
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

## Cleaning Text Data

In [6]:
import nltk 
import re

# How to download NLTK data in Python

# 참고사이트 - [https://www.kite.com/python/answers/how-to-download-nltk-data-in-python](https://www.kite.com/python/answers/how-to-download-nltk-data-in-python)


# 불용어 제거 방법 참고하기

In [7]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MyCom\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
from nltk.corpus import stopwords

In [9]:
data['Review'][0]

'Wow... Loved this place.'

In [45]:
review = re.sub('[^a-zA-Z]', ' ', data['Review'][0]) # 소문자 a부터 z + 대문자 A부터 Z 까지 만 출력하자

In [11]:
review

'Wow    Loved this place '

In [12]:
review = review.lower()

In [13]:
review

'wow    loved this place '

In [46]:
review = review.split() #tokenized

In [15]:
review

['wow', 'loved', 'this', 'place']

In [16]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
preview = []
for word in review:
  if word not in stopwords.words('english'):
    preview.append(word) # list는 append으로 추가해줌 


In [18]:
preview

['wow', 'loved', 'place']

In [19]:
review = [word for word in review if word not in stopwords.words('english')]

In [20]:
review

['wow', 'loved', 'place']

# PorterStemmer 은 무엇일까?

The Porter stemming algorithm (or ‘Porter stemmer’) is a process for removing the commoner morphological and inflexional endings from words in English. Its main use is as part of a term normalisation process that is usually done when setting up Information Retrieval systems.

In [21]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [22]:
review = [ps.stem(word) for word in review]

In [23]:
review

['wow', 'love', 'place']

In [24]:
review = " ".join(review)

In [25]:
print(review)

wow love place


In [48]:
corpus = []

ps = PorterStemmer()

for i in range(len(data)):
  
  review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
  review = review.lower() # 2번째 -모두 소문자로 만들기
  review = review.split() # 3번째 - tokenized
  # 4번째 - 불용어 제거 후   어간 추출 해주기 
  review = [ps.stem(word) for word in review if word not in stopwords.words('english')]
  review = " ".join(review)

  corpus.append(review)

In [27]:
print(corpus)

['wow love place', 'crust good', 'tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place worth time let alon vega', 'like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could happier', 'seem like go

## Bag of Word Model 은 무엇일까?

Bag of Words란 단어들의 순서는 전혀 고려하지 않고, 단어들의 출현 빈도(frequency)에만 집중하는 텍스트 데이터의 수치화 표현 방법입니다. Bag of Words를 직역하면 단어들의 가방이라는 의미입니다. 단어들이 들어있는 가방을 상상해봅시다. 갖고있는 어떤 텍스트 문서에 있는 단어들을 가방에다가 전부 넣습니다. 그러고나서 이 가방을 흔들어 단어들을 섞습니다. 만약, 해당 문서 내에서 특정 단어가 N번 등장했다면, 이 가방에는 그 특정 단어가 N개 있게됩니다. 또한 가방을 흔들어서 단어를 섞었기 때문에 더 이상 단어의 순서는 중요하지 않습니다.

BoW를 만드는 과정을 이렇게 두 가지 과정으로 생각해보겠습니다.
(1) 우선, 각 단어에 고유한 정수 인덱스를 부여합니다.
(2) 각 인덱스의 위치에 단어 토큰의 등장 횟수를 기록한 벡터를 만듭니다.

# 참고사이트-[https://wikidocs.net/22650](https://wikidocs.net/22650)

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
cv = CountVectorizer(max_features=1500)

# CountVectorizer

# 참고사이트 - [https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

Convert a collection of text documents to a matrix of token counts.

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

텍스트 문서 모음을 토큰 수의 행렬로 변환합니다.

이 구현은 scipy.sparse.csr_matrix를 사용하여 카운트의 희소 표현을 생성합니다.

사전 사전을 제공하지 않고 일종의 기능 선택을 수행하는 분석기를 사용하지 않는 경우 기능의 수는 데이터를 분석하여 찾은 어휘 크기와 동일합니다.

In [30]:
x = cv.fit_transform(corpus).toarray()

In [31]:
x.shape

(1000, 1500)

In [32]:
y = data.iloc[:, 1].values

In [33]:
y.shape

(1000,)

In [34]:
y[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64)

## Apply Naive Bayes Algorithm

In [35]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 0)

In [36]:
x_train.shape, x_test.shape

((800, 1500), (200, 1500))

In [37]:
y_train.shape, y_test.shape

((800,), (200,))

In [38]:
from sklearn.naive_bayes import GaussianNB

In [39]:
classifier  = GaussianNB()

In [40]:
classifier.fit(x_train, y_train)

GaussianNB()

In [41]:
y_pred = classifier.predict(x_test)

In [42]:
from sklearn.metrics import accuracy_score

In [43]:
accuracy_score(y_test, y_pred)

0.73

In [44]:
200 * 0.73

146.0

# Restaurant_Reviews.tsv 데이터는 밑에 링크에서 보실 수 있습니다.

# 링크 - [https://github.com/SEONGJAE-YOO/Natural-Language-Processing-NLP-in-Python-with-8-Projects/blob/main/data/Restaurant_Reviews.tsv](https://github.com/SEONGJAE-YOO/Natural-Language-Processing-NLP-in-Python-with-8-Projects/blob/main/data/Restaurant_Reviews.tsv)