# 文字預處理：部分內容在前面章節可能已提過，這裡會將前處理所需的技巧串接起來

In [1]:
import re
import numpy as np 
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# nltk.download('stopwords')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the movie-feedback CSV (Big5-encoded, no header row). Column 0 holds
# the review text and column 1 holds its label.
# (A TSV file would be the tab-separated variant; this file is read with
# read_csv's default separator.)
dataset = pd.read_csv('./D14_movie_feedback.csv', header=None, encoding='Big5')
X, Y = dataset[0].values, dataset[1].values

print(dataset)
print('\n\nreview before preprocessing : {}'.format(X[0]))

                                                       0  1
0      the rock is destined to be the 21st century's ...  1
1      the gorgeously elaborate continuation of " the...  1
2                         effective but too-tepid biopic  1
3      if you sometimes like to go to the movies to h...  1
4      emerges as something rare , an issue movie tha...  1
...                                                  ... ..
10657  a terrible movie that some people will neverth...  0
10658  there are many definitions of 'time waster' bu...  0
10659  as it stands , crocodile hunter has the hurrie...  0
10660  the thing looks like a made-for-home-video qui...  0
10661  enigma is well-made , but it's just too dry an...  0

[10662 rows x 2 columns]


review before preprocessing : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


---

## 運用re.sub去除部分字元

In [3]:
# Replace every character that is not an ASCII letter (a-z, A-Z) with a
# space ' ', so digits and punctuation drop out of the review.
review = re.compile(r'[^a-zA-Z]').sub(' ', X[0])
print('review after re.sub : {}'.format(review))

review after re.sub : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 將所有字母轉為小寫：因為在大部分情境下，區分大小寫並不能提供額外訊息。如同在 CV 中，當顏色無法提供額外訊息時，我們會將圖像轉為灰階，藉此降低複雜度

In [4]:
# Convert the whole review to lowercase.
review = review.lower()
print('review after lower : {}'.format(review))

review after lower : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 斷詞

In [5]:
# Split the review into individual words on whitespace (display only;
# `review` itself is not modified here).
print('review after split : {}'.format(review.split()))

review after split : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


* tokenize 相較於 split 會是更好的選擇，例如 split 無法將 `word.` 這類情況中的單字與句點分開

In [6]:
# Tokenize with NLTK; from here on `review` is a list of tokens rather
# than a single string.
review = nltk.word_tokenize(review)
print('review after tokenize : {}'.format(review))

review after tokenize : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


## stopwords: 移除贅字，此步驟為前處理的重要步驟之一，過多的贅字不僅無法提供更多訊息，還會干擾到模型的訓練

In [7]:
# Remove stopwords using NLTK's ready-made English stopword list.
# The lookup set is built once up front: placing
# set(stopwords.words('english')) inside the comprehension condition would
# re-evaluate it for every single token.
english_stopwords = set(stopwords.words('english'))
review = [word for word in review if word not in english_stopwords]
print('review after removeing stopwords : {}'.format(review))

review after removeing stopwords : ['rock', 'destined', 'st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'steven', 'segal']


## Stemming: 詞幹提取
 * ex. loves,loved都變成love
 * 中文沒有詞幹提取的需求

In [8]:
# Reduce each remaining token to its stem with the Porter stemmer.
ps = PorterStemmer()
review = list(map(ps.stem, review))
print('review after stemming : {}'.format(review))

review after stemming : ['rock', 'destin', 'st', 'centuri', 'new', 'conan', 'go', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegg', 'jean', 'claud', 'van', 'damm', 'steven', 'segal']


## 練習清理所有的句子

In [9]:
def preprocessing(word, stemmer=None):
    """Clean one piece of text and return it as space-joined stems.

    Every character other than a-z/A-Z is replaced by a space, the text is
    lowercased and split on whitespace, and each token is stemmed.

    Args:
        word: The raw text to clean. Despite the parameter name this is a
            whole review/sentence, not a single word.
        stemmer: Optional object exposing ``stem(token) -> str``. Pass one
            in to reuse a single stemmer across many calls or to swap the
            stemming algorithm. Defaults to a new ``PorterStemmer``.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        the input contains no letters.
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    tokens = re.sub(r'[^a-zA-Z]', ' ', word).lower().split()
    return ' '.join(stemmer.stem(token) for token in tokens)

In [10]:
# Stopword removal is deliberately skipped here: reviews contain many
# negations that it would strip, e.g. "isn't good" would become "good".
corpus = list(map(preprocessing, X))

## 轉bag-of-words vector

In [11]:
# Labels: work on a copy so the original Y is left untouched.
Y_ = Y.copy()

# Bag-of-words model. max_features is the number of columns to build;
# terms are selected by how frequently they occur.
cv = CountVectorizer(max_features=2500)

# fit_transform yields a sparse result (mostly zeros); toarray() turns it
# into a regular matrix.
X_ = cv.fit_transform(corpus).toarray()

## 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

## Training

In [12]:
# Hold out 10% of the samples for testing. random_state pins the shuffle so
# the split (and therefore the fitted model) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_, Y_, test_size=0.1, random_state=42
)

# Fit a Gaussian Naive Bayes classifier on the training portion.
# (No feature scaling is applied to the count features.)
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

## Inference

In [13]:
# Inference must go through the same preprocessing that was used for training.
def predict_message(input_):
    """Print the label the trained classifier predicts for one raw review."""
    cleaned = preprocessing(input_)
    features = cv.transform([cleaned]).toarray()
    print(classifier.predict(features)[0])

In [14]:
# Label meaning: 1 denotes a positive review, 0 denotes a negative review.
for message in (
    'I really like this movie!!',  # 1 = positive review
    'A terrible movie  !!',        # 0 = negative review
):
    predict_message(message)

1
0
