## 本日課程-文字預處理，部分內容前面章節可能提過，這裡會將前處理所需技巧串起

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
#tsv是指用tab分開字元的檔案
dataset=pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3)

In [3]:
print('review before preprocessing : {}'.format(dataset['Review'][0]))

review before preprocessing : Wow... Loved this place.


## 運用re.sub去除部分字元

In [4]:
import re 
#re.sub用來去除不要字元，第一個參數是要去除字元，但可以透過添加＾，變成不要去除字元
#第二個參數是去除字元後這些東西要變成什麼，在這我們是希望它變成一個空格
#第三個參數則是我們要剝除的字元從哪裡來

review=re.sub('[^a-zA-Z]',' ',dataset['Review'][0])

In [5]:
print('review after re.sub : {}'.format(review))

review after re.sub : Wow    Loved this place 


## 將所有字母轉為小寫:因為大部分情境區分大小寫並不能提供而外訊息，如CV內顏色無法提供額外訊息時我們會將圖像轉為灰階，藉此降低複雜度

In [6]:
#把全部變成小寫
review=review.lower()
print('review after lower : {}'.format(review))

review after lower : wow    loved this place 


## 斷詞

In [7]:
import nltk
#把review裡面的單字切開
print('review after split : {}'.format(review.split()))

review after split : ['wow', 'loved', 'this', 'place']


### tokenize 相較於split會是更好的選擇，如 split 無法分開 word. 這種case

In [8]:
nltk.word_tokenize('Wow... Loved this place.')

['Wow', '...', 'Loved', 'this', 'place', '.']

In [9]:
review = nltk.word_tokenize(review)
print('review after tokenize : {}'.format(review))

review after tokenize : ['wow', 'loved', 'this', 'place']


## 中文使用 jieba

In [14]:
import jieba
jieba.set_dictionary('dict.txt')

In [15]:
review_ = '哇！我好喜歡這個地方'
cut_result = jieba.cut(review_, cut_all=False, HMM=False)
print("output: {}".format('|'.join(cut_result)))

Building prefix dict from C:\Users\vvvcc\TestCode\NLP_test\dict.txt ...
Dumping model to file cache C:\Users\vvvcc\AppData\Local\Temp\jieba.u09dc63091e668797c4ae9a8fad7ee20d.cache
Loading model cost 0.785 seconds.
Prefix dict has been built successfully.


output: 哇|！|我|好|喜歡|這|個|地方


## stopwords: 移除贅字，此步驟為前處理的重要步驟之一，過多的贅字不僅無法提供更多訊息，還會干擾到模型的訓練

In [16]:
#處理文字，有建立好的文字褲會幫我們移除不想要的文字
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vvvcc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
review=[word for word in review if not word in set(stopwords.words('english'))]
print('review after removeing stopwords : {}'.format(review))

review after removeing stopwords : ['wow', 'loved', 'place']


### stopwords.words('english') 是一個建立好的list，包含一些常見的英文贅字

In [18]:
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [21]:
# source:https://github.com/tomlinNTUB/Machine-Learning
with open('停用詞-繁體中文.txt','r',encoding="utf-8") as file:
    stop_words = file.readlines()
stop_words = [word.strip('\n') for word in stop_words]

In [22]:
practice_sentence = ['哈哈','!','現在','好想','睡覺','啊']
practice_sentence=[word for word in practice_sentence if not word in set(stop_words)]
print('practice_sentence after removeing stopwords : {}'.format(practice_sentence))

practice_sentence after removeing stopwords : ['現在', '好想', '睡覺']


## Stemming: 詞幹提取

ex. loves,loved都變成love

中文沒有詞幹提取的需求

In [23]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
review=[ps.stem(word) for word in review]

In [24]:
print('review after stemming : {}'.format(review))

review after stemming : ['wow', 'love', 'place']


## 練習清理所有的句子

In [25]:
#dataset=pd.read_csv('movie_feedback.csv',encoding = 'Big5',names=['feedback', 'label'] )
dataset=pd.read_csv('Restaurant_Reviews.tsv',delimiter='\t',quoting=3)

In [26]:
corpus=[]
row=len(dataset)
for i in range(0,row):
    review=re.sub('[^a-zA-Z]',' ',dataset['Review'][i])
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    ## 這裡先不用stopwords 因為 review中很多反定詞會被移掉 如isn't good, 會變成 good
    review=[ps.stem(word) for word in review ]
    review=' '.join(review)
    corpus.append(review)

手動選出現頻率較高的單字：一般來說我們不需要自己處理這個步驟，通常文字轉向量或index的api都有參數可以設定，這裡是讓大家自己練習

In [27]:
from collections import Counter

In [28]:
## 從整個corpus中取出所有的單詞
whole_words = []
for sentence in corpus:
    for words in nltk.word_tokenize(sentence):
        whole_words.append(words)

In [29]:
## 取出出現頻率top_k的單詞
top_k = 1000
top_k_words = []
for item in Counter(whole_words).most_common(top_k):
    top_k_words.append(item[0])

### 以 corpus中第一個句子為範例

In [30]:
remove_low_frequency_word=' '.join([word for word in nltk.word_tokenize(corpus[0]) if word in set(top_k_words)])

In [31]:
print('Before removing low frequency words:\n {}'.format(corpus[0]))
print('\n')
print('After removing low frequency words:\n {}'.format(remove_low_frequency_word))

Before removing low frequency words:
 wow love thi place


After removing low frequency words:
 wow love thi place


## 轉bag-of-words vector

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
#Creating bag of word model
#tokenization(符號化)
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的高低去篩選 
cv=CountVectorizer(max_features=1000)
#toarray是建造matrixs
#X現在為sparsity就是很多零的matrix
X=cv.fit_transform(corpus).toarray()
y=dataset.iloc[:,1].values

### 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

## Training

In [33]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)

# Feature Scaling

#Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Inference

In [34]:
message='I really like this!!'
## 要使用一樣的前處理
review=re.sub('[^a-zA-Z]',' ',message)
review=review.lower()
review=review.split()
ps=PorterStemmer()
review=[ps.stem(word) for word in review]
review = ' '.join(review)
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)


In [35]:
prediction ## 1代表正向評價

array([1], dtype=int64)

In [36]:
message='All dishes are disgusting !!'
review=re.sub('[^a-zA-Z]',' ',message)
review=review.lower()
review=review.split()
ps=PorterStemmer()
review=[ps.stem(word) for word in review]
review = ' '.join(review)
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)

In [37]:
prediction ## 0代表負面評價

array([0], dtype=int64)

# 測試

In [109]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
#tsv是指用tab分開字元的檔案
dataset=pd.read_csv('movie_feedback.csv', header=None, encoding='Big5')
X = dataset[0].values
Y = dataset[1].values

In [110]:
dataset.head()

Unnamed: 0,0,1
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [111]:
print('review before preprocessing : {}'.format(X[0]))

review before preprocessing : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


## 運用re.sub去除部分字元

In [112]:
import re 
# 去除a-zA-Z以外的字元，並將他們取代為空格' '
review=re.sub('[^a-zA-Z]',' ',dataset[0][0])

In [113]:
print('review after re.sub : {}'.format(review))

review after re.sub : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 將所有字母轉為小寫:因為大部分情境區分大小寫並不能提供而外訊息，如CV內顏色無法提供額外訊息時我們會將圖像轉為灰階，藉此降低複雜度

In [114]:
#把全部變成小寫
review=review.lower()
print('review after lower : {}'.format(review))

review after lower : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 斷詞

In [115]:
import nltk
#把review裡面的單字切開
print('review after split : {}'.format(review.split()))

review after split : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


In [116]:
nltk.word_tokenize('Wow... Loved this place.')

['Wow', '...', 'Loved', 'this', 'place', '.']

In [117]:
review = nltk.word_tokenize(review)
print('review after tokenize : {}'.format(review))

review after tokenize : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


## stopwords: 移除贅字

In [118]:
#處理文字，有建立好的文字褲會幫我們移除不想要的文字
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vvvcc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [119]:
review=[word for word in review if not word in set("自行填入")]
print('review after removeing stopwords : {}'.format(review))

review after removeing stopwords : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


## Stemming: 詞幹提取

In [120]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()
review=[ps.stem(word) for word in review]

In [121]:
print('review after stemming : {}'.format(review))

review after stemming : ['the', 'rock', 'is', 'destin', 'to', 'be', 'the', 'st', 'centuri', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'go', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegg', 'jean', 'claud', 'van', 'damm', 'or', 'steven', 'segal']


## 練習清理所有的句子

In [122]:
#dataset=pd.read_csv('movie_feedback.csv',encoding = 'Big5',names=['feedback', 'label'] )
X = dataset[0].values

In [123]:
corpus=[]
row=len(X)
for i in range(0,row):
    review=re.sub('[^a-zA-Z]',' ',X[i])
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()
    ## 這裡先不用stopwords 因為 review中很多反定詞會被移掉 如isn't good, 會變成 good
    review=[ps.stem(word) for word in review ]
    review=' '.join(review)
    corpus.append(review)

## 轉bag-of-words vector

In [124]:
from sklearn.feature_extraction.text import CountVectorizer
#Creating bag of word model
#tokenization(符號化)
from sklearn.feature_extraction.text import CountVectorizer
#max_features是要建造幾個column，會按造字出現的高低去篩選 
cv = CountVectorizer(max_features=1500)
#toarray是建造matrixs
#X現在為sparsity就是很多零的matrix
X_ = cv.fit_transform(corpus).toarray()
Y_ = dataset[1].values

## 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

In [125]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_, Y_, test_size = 0.1)

# Feature Scaling

#Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Inference

In [126]:
message='I really like this movie!!'
## 要使用一樣的前處理
review=re.sub('[^a-zA-Z]',' ',message)
review=review.lower()
review=review.split()
ps=PorterStemmer()
review=[ps.stem(word) for word in review]
review = ' '.join(review)
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)


In [127]:
prediction ## 1代表正向評價

array([1], dtype=int64)

In [128]:
message='A terrible movie  !!'
review=re.sub('[^a-zA-Z]',' ',message)
review=review.lower()
review=review.split()
ps=PorterStemmer()
review=[ps.stem(word) for word in review]
review = ' '.join(review)
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)

In [129]:
prediction ## 0代表負面評價

array([0], dtype=int64)