In [16]:
import re
import string
import time
import unidecode
import string
import pickle
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm, notebook
import pickle
from joblib import dump
from sklearn.model_selection import train_test_split

from keras.layers import Dense, GlobalMaxPooling1D, Dropout, Dense, BatchNormalization, Input
from matplotlib import pyplot
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM,Bidirectional
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn import datasets, svm
import matplotlib.pyplot as plt

**PhoBert Embedding**

In [85]:
def load_bert():
    v_phobert = AutoModel.from_pretrained("vinai/phobert-base")
    v_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    return v_phobert, v_tokenizer
phobert, tokenizer = load_bert()

In [86]:
def phobert_embed_sentence(padded, mask, model=phobert):
    # embed a single setence data
    # param padded: a tokenized, padded sentence
    # param mask: an attention mask of the padded sentence
    padded = torch.tensor(padded).to(torch.long)
    mask = torch.tensor(mask)
    with torch.no_grad():
        last_hidden_states = model(input_ids=padded, attention_mask=mask)[0]
    vector = last_hidden_states[:, 0, :].numpy() # [:,0,:] to get embedding vector of the first output token [CLS]
    return vector.flatten()

def phobert_embed_data(data, tokenizer=tokenizer):
    # embed the whole dataset with phobert
    # param padded_data: tokenized, padded dataset
    # param mask_data: attention masks of the padded dataset
    MAX_LENGTH = tokenizer.model_max_length # phobert default max sequence length = 256

    embedded_data = np.array([])
    for line in tqdm(data):
        tokenized_line = tokenizer.encode(line, max_length=MAX_LENGTH, truncation=True)
        # pad sentence to a pre-defined max length, no truncating since it is already truncated in the phobert tokenizing
        padded_line = pad_sequences([tokenized_line], maxlen=MAX_LENGTH, padding='post', value=1)
        # Get attention mask from padded sentence of data to make PhoBERT focus on non-padded data only
        # pad tokenized sentence with value = 1, since 1 is pre-defined padding value of PhoBERT
        mask = np.where(padded_line == 1, 0, 1)

        embedded_line = phobert_embed_sentence(padded_line, mask)
        
        if embedded_data.shape[0] == 0:
            embedded_data = np.empty((0, embedded_line.shape[0]), 'float32')
            
        embedded_data = np.concatenate((embedded_data, [embedded_line]))
    return embedded_data

**Cleaning Data**

In [87]:

# !pip install underthesea
# !git clone https://github.com/stopwords/vietnamese-stopwords
import underthesea
stopwords_path = "vietnamese-stopwords.txt"

def standardize_data(sentence):
    row = re.sub(r"[\.,\?]+$-","",sentence)
    row = row.replace(",", " ").replace(".", " ") \
        .replace(";", " ").replace("“", " ") \
        .replace(":", " ").replace("”", " ") \
        .replace('"', " ").replace("'", " ") \
        .replace("!", " ").replace("?", " ") \
        .replace("-", " ").replace("?", " ")
    row = row.strip().lower()
    return row

def cleaning_text(text, keep_punct=True):
    if keep_punct:
        return re.sub(u'[^{Latin}0-9[:punct:]]+', u' ', text)
    return re.sub(u'[^{Latin}0-9]+', u' ', text)

def load_stopwords():
    sw = []
    with open(stopwords_path, encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        sw.append(line.replace("\n",""))
    return sw


def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

sw = load_stopwords()
def preprocessing(text, keep_punct=True):
    text = standardize_data(text)
    text = remove_emoji(text)
    #tokenize
    line = underthesea.word_tokenize(text)

    #filter stopword
    filter_words = [word for word in line if word not in sw]
    #concat
    line = "".join(text)
    line = underthesea.word_tokenize(line,format = "text")
    
    return cleaning_text(line)

**Load model**

In [98]:
my_model=tf.keras.models.load_model('model_LSTM_DS307.h5', compile=False)

In [127]:
def predict(sentence):

    x = [cleaning_text(sentence)]
    
    doc = phobert_embed_data(x)
    res = my_model.predict(np.array(np.expand_dims(doc,1)))[0]
    if res[0] > res[1]:
       return "Không hài lòng"
    return "Hài lòng"

In [128]:
import random
import pandas as pd

from selenium import webdriver
from time import sleep
from selenium.webdriver.common.by import By
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException

**Kiểm tra với dữ liệu tự nhập vào**

In [131]:
while(True):
    text=input('nhập câu bình luân vào:\n')
    if text==' end ':
        break
    else:
        print(predict(text))

nhập câu bình luân vào:
hàng chất lượng, phục vụ tận tình


100%|██████████| 1/1 [00:00<00:00,  1.65it/s]






Hài lòng
nhập câu bình luân vào:
giao hàng kh đúng hẹn, đã vậy shop còn thô lỗ


100%|██████████| 1/1 [00:00<00:00,  2.18it/s]






Không hài lòng
nhập câu bình luân vào:
áo bận cũng tạm ổn so với giá tiền


100%|██████████| 1/1 [00:00<00:00,  1.86it/s]






Không hài lòng
nhập câu bình luân vào:
giá ổn , hàng ok, cho shop năm sao


100%|██████████| 1/1 [00:00<00:00,  2.22it/s]






Hài lòng
nhập câu bình luân vào:
end


100%|██████████| 1/1 [00:00<00:00,  2.11it/s]






Hài lòng
nhập câu bình luân vào:
 end 


**Truy cập vào trang web**

In [122]:
browser = webdriver.Chrome(executable_path="./chromedriver.exe")
# Mở thử một trang web
url_booking ='https://www.facebook.com/permalink.php?story_fbid=pfbid09sYn14ZRpmSKUwxTdymog8RvE43AXLbiSRsdwpkKKVscWfTde7YLftect6GttAM8l&id=100074058847693'
browser.get(url_booking)
sleep(random.randint(4,5))

  browser = webdriver.Chrome(executable_path="./chromedriver.exe")


In [129]:
from IPython.display import clear_output

**Dự đoán các bình luận của sản phẩm đó xem thử sản phậm được Hài lòng hay Không hài lòng**

In [134]:
TC=[]
RC=browser.find_elements(By.XPATH,"//div[@class='x1lliihq xjkvuk6 x1iorvi4']")
for R in RC:
    text=R.text
    p=predict(text)
    TC.append(p[0])
    TC_DF=pd.DataFrame(TC)
print(TC_DF.value_counts()/len(TC))
sleep(1)
old_len=len(RC)
while True:
        RC=browser.find_elements(By.XPATH,"//div[@class='x1lliihq xjkvuk6 x1iorvi4']")
        new_len=len(RC)
        if(new_len>old_len):
                clear_output(wait=True)
                text=RC[-1].text
                p=predict(text)
                TC.append(p[0])
                TC_DF=pd.DataFrame(TC)
                print(TC_DF.value_counts()/len(TC))
                sleep(1)
        old_len=len(RC)

100%|██████████| 1/1 [00:00<00:00,  1.07it/s]






K    0.504425
H    0.495575
dtype: float64



KeyboardInterrupt

