In [10]:
import requests
import pandas as pd
from urllib.parse import urlparse, parse_qs
import numpy as np
import os
import re

from underthesea import sent_tokenize, word_tokenize, sentiment
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from joblib import Parallel, delayed
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [4]:
def get_product_id(sendo_url):
    """Extract the product ID from a Sendo product URL.

    The ID is the last '-'-separated token of the final path segment, with
    '.html' and any 'p' characters removed.  For a URL whose path is
    ``/ao-thun-nam-12345678.html`` the result is ``'12345678'``.  The query
    string and fragment are ignored.

    Args:
        sendo_url: Product URL as a string.

    Returns:
        The product ID as a string (empty string if the URL has no path).
    """
    path = urlparse(sendo_url).path
    # Only look at the final path segment: splitting the whole path on '-'
    # leaked '/' (slug without a hyphen, trailing slash) or pieces of a
    # hyphenated parent directory into the ID.
    slug = path.rstrip('/').rsplit('/', 1)[-1]
    last_token = slug.split('-')[-1]
    # Drop the '.html' suffix and the 'p' marker, leaving the bare ID.
    return last_token.replace('.html', '').replace('p', '')

# Build the record structure used to store each comment after crawling.
def comment_parser(json):
    """Flatten one raw rating object into the record kept for each comment.

    Args:
        json: Dict-like object for a single rating (anything exposing
            ``get``).  Keys that are absent yield ``None`` in the record.

    Returns:
        dict with the keys ``id``, ``title``, ``comment``,
        ``default_sentiment``, ``like_count``, ``customer_id``,
        ``rating_star`` and ``customer_name``, in that order.
    """
    # (output field, key in the raw rating object)
    field_sources = (
        ('id', 'rating_id'),
        ('title', 'comment_title'),
        ('comment', 'comment'),
        ('default_sentiment', 'status'),
        ('like_count', 'like_count'),
        ('customer_id', 'customer_id'),
        ('rating_star', 'star'),
        ('customer_name', 'user_name'),
    )
    return {field: json.get(source) for field, source in field_sources}

# Fetch the comments of a product.
def get_comments(product_id):
    """Fetch and parse all comments for a Sendo product ID, across every page.

    Pages are requested from the Sendo rating API one at a time, starting at
    page 1, until one of the following happens:

    * a page comes back with no ``data``,
    * the API answers with a non-200 status (a message is printed),
    * a page contributes no comment that was not already collected (guards
      against looping forever if the API keeps serving the same page).

    Comments are de-duplicated on their rating ID.

    Args:
        product_id: Product ID interpolated into the rating API URL.

    Returns:
        pandas.DataFrame with one row per unique comment, built from the
        records produced by ``comment_parser``; an empty frame if nothing
        was fetched.

    Raises:
        requests.RequestException: on connection failure or when the server
            does not answer within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'vi,vi-VN;q=0.9,fr-FR;q=0.8,fr;q=0.7,en-US;q=0.6,en;q=0.5',
        'Connection': 'keep-alive',
    }

    params = {
        'page': 1,
        'limit': 10,  # Adjust limit as needed
        'sort': 'review_score',
        'v': '2',
        'star': 'all'
    }

    url = f'https://ratingapi.sendo.vn/product/{product_id}/rating'
    result = []
    # IDs already collected; a set keeps the duplicate check O(1) per comment
    # instead of rebuilding a list of every ID for each comment.
    seen_ids = set()
    while True:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Error getting comments for page {params['page']}. Status code: {response.status_code}")
            break

        data = response.json().get('data')
        if not data:  # No more comments
            break

        added = 0
        for comment in data:
            parsed_comment = comment_parser(comment)
            comment_id = parsed_comment['id']
            if comment_id not in seen_ids:
                seen_ids.add(comment_id)
                result.append(parsed_comment)
                added += 1

        if added == 0:
            # The whole page was already seen: the API is repeating itself.
            break
        params['page'] += 1

    return pd.DataFrame(result)

# Normalise a comment before sentiment analysis.
# Every character listed here is replaced by a single space.  Both curly
# quotes are included: previously only the closing one was stripped.
_COMMENT_SEPARATORS = str.maketrans({ch: ' ' for ch in '\n\r"\u201c\u201d:!?-'})


def standardize_comment(comment):
    """Normalise a comment: separators become spaces, text is lower-cased.

    Newlines, carriage returns, straight and curly double quotes, ':', '!',
    '?' and '-' are each replaced by one space; the result is lower-cased.
    Runs of spaces are not collapsed.

    Args:
        comment: Comment text (str).

    Returns:
        The normalised string.
    """
    return comment.translate(_COMMENT_SEPARATORS).lower()


# Remove emoji from a comment.
def demoji(string):
    """Return ``string`` with every run of emoji/symbol characters removed.

    The characters to strip are described by a single regex character class
    assembled from the ranges below.  Note that some ranges are very broad
    and also cover non-emoji characters.

    Args:
        string: Text to clean (str).

    Returns:
        The text with all matching characters deleted (nothing is inserted
        in their place).
    """
    char_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U00002702-\U000027B0",  # (same range again, as in the original pattern)
        "\U000024C2-\U0001F251",  # very broad: also spans CJK and much of the BMP
        "\U0001f926-\U0001f937",  # gesture / activity pictographs
        "\U00010000-\U0010ffff",  # every supplementary-plane code point
        "\u2640-\u2642",          # gender signs
        "\u2600-\u2B55",          # misc. symbols through heavy circle
        "\u200d",                 # zero-width joiner
        "\u23cf",                 # eject symbol
        "\u23e9",                 # fast-forward symbol
        "\u231a",                 # watch
        "\ufe0f",                 # variation selector-16
        "\u3030",                 # wavy dash
    )
    emoji_re = re.compile("[" + "".join(char_ranges) + "]+", flags=re.UNICODE)
    return emoji_re.sub('', string)

# Tokenise a single comment.
def tokenize_comment(comment):
    """Split a comment into sentences, and each sentence into clean tokens.

    Sentences come from ``sent_tokenize`` and tokens from ``word_tokenize``.
    Every character that is neither a word character nor whitespace is
    removed from each token.  Tokens left empty by that step (i.e. tokens
    that were pure punctuation) are dropped, so joining a sentence with
    spaces no longer produces stray double or trailing spaces.

    Args:
        comment: Comment text (str).

    Returns:
        list[list[str]]: one inner list of tokens per sentence.  A sentence
        consisting only of punctuation yields an empty inner list, so the
        number of inner lists always equals the number of sentences.
    """
    tokenized_sentences = []
    for sentence in sent_tokenize(comment):
        # Strip punctuation from each token, then discard the empties.
        cleaned = (re.sub(r'[^\w\s]', '', word) for word in word_tokenize(sentence))
        tokenized_sentences.append([word for word in cleaned if word.strip()])
    return tokenized_sentences

# Sentiment via underthesea for one row of the comment column.
def get_sentiment_by_underthesea(text):
    """Map the output of underthesea's ``sentiment`` to an upper-case label.

    Args:
        text: Comment text passed straight to ``sentiment``.

    Returns:
        ``'POSITIVE'`` if the result contains ``'positive'``, otherwise
        ``'NEGATIVE'`` if it contains ``'negative'``, otherwise ``'NEUTRAL'``
        (including when ``sentiment`` returns ``None``).
    """
    raw_label = sentiment(text)
    if raw_label is not None:
        # Checked in order: 'positive' takes precedence over 'negative'.
        for keyword, label in (('positive', 'POSITIVE'), ('negative', 'NEGATIVE')):
            if keyword in raw_label:
                return label
    return 'NEUTRAL'

# Sentiment via PhoBERT for one row of the comment column.
_PHOBERT_CHECKPOINT = "mr4/phobert-base-vi-sentiment-analysis"
# checkpoint name -> (tokenizer, model).  Filled on first use so the weights
# are loaded once per process instead of once per comment.
_PHOBERT_CACHE = {}


def _load_phobert(checkpoint=_PHOBERT_CHECKPOINT):
    """Return the cached (tokenizer, model) pair for ``checkpoint``, loading it on first use."""
    if checkpoint not in _PHOBERT_CACHE:
        _PHOBERT_CACHE[checkpoint] = (
            AutoTokenizer.from_pretrained(checkpoint),
            AutoModelForSequenceClassification.from_pretrained(checkpoint),
        )
    return _PHOBERT_CACHE[checkpoint]


def get_sentiment_scores_by_phobert(text):
    """Classify one text with the PhoBERT sentiment checkpoint.

    Despite the name, only the dominant label is returned, not the scores.

    Args:
        text: Comment text (str).  It is truncated by the tokenizer if it is
            longer than the model accepts.

    Returns:
        The label from ``model.config.id2label`` whose softmax score is the
        highest (the first one on ties).
    """
    tokenizer, model = _load_phobert()

    inputs = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

    scores = predictions[0].tolist()  # scores of the single input text
    highest_score_index = scores.index(max(scores))
    return model.config.id2label[highest_score_index]

In [None]:
# Uncomment this code to build a dataset with a wide variety of comments from different products
# <<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>>>>>
# Get the comments from the URL
def parallel_apply(df, func, n_jobs=-10):
    """Call ``func`` on every item of ``df`` through joblib and collect the results.

    Args:
        df: Iterable to walk over.  Whatever plain iteration yields is passed
            to ``func`` (the values for a Series; iterating a DataFrame would
            yield its column labels).
        func: Callable taking one item.
        n_jobs: Forwarded unchanged to ``joblib.Parallel``.
            NOTE(review): the default of -10 is unusual (-1 is joblib's
            "use all cores") -- confirm it is intended.

    Returns:
        Whatever the ``Parallel`` call returns for the submitted tasks.
    """
    worker_pool = Parallel(n_jobs=n_jobs)
    tasks = (delayed(func)(item) for item in df)
    return worker_pool(tasks)

# Output locations.  Raw strings keep the Windows backslashes literal instead
# of relying on Python leaving unknown escapes such as "\P" or "\S" untouched.
DATASET_CSV = r'D:\Personal Projects\SendoCommentsCrawler\model_evaluation\dataset\dataset.csv'
MODEL_COMPARISON_CSV = r'D:\Personal Projects\SendoCommentsCrawler\model_evaluation\output_model_dataset\model_comparision_dataset.csv'

link = input('Nhập vào URL Sendo: ')
product_id = get_product_id(link)
# Fetch the comments for this product ID
df = get_comments(product_id)
if df.empty:
    # Without this the next line fails with an opaque KeyError on 'id'.
    raise ValueError(f'No comments were fetched for product id {product_id!r}')

# Remove duplicates (by 'id'), then clean the text
df = df.drop_duplicates(subset='id', keep='first')
# Stringify before cleaning: a missing comment (None/NaN) would otherwise
# crash the string helpers below.
df['comment'] = df['comment'].fillna('').astype(str)
df['comment'] = df['comment'].apply(demoji)  # strip emoji
df['comment'] = df['comment'].apply(standardize_comment)

# Append to the accumulated dataset; write the header row only when the file
# is new or empty, so repeated runs do not scatter header lines through it.
write_header = not os.path.exists(DATASET_CSV) or os.path.getsize(DATASET_CSV) == 0
df.to_csv(DATASET_CSV, mode='a', header=write_header, encoding='utf-8-sig')

# Work on a copy so `df` keeps one row per comment
df_comment = df.copy()

# Tokenise 'comment' into a new 'tokenized_comment' column (a list of token lists)
df_comment['tokenized_comment'] = df_comment['comment'].apply(tokenize_comment)

# One row per sentence
df_comment = df_comment.explode('tokenized_comment')

# Drop rows whose comment produced no sentence at all
df_comment = df_comment.dropna(subset=['tokenized_comment'])

# Join each sentence's tokens back into text
df_comment['tokenized_comment'] = df_comment['tokenized_comment'].apply(' '.join)

# Apply sentiment analysis in parallel.
# NOTE(review): both models are run on the full 'comment' text, not on the
# per-sentence 'tokenized_comment' -- confirm that this is intended.
df_comment['underthesea_sentiment'] = parallel_apply(df_comment['comment'], get_sentiment_by_underthesea)
df_comment['phobert_sentiment_score'] = parallel_apply(df_comment['comment'], get_sentiment_scores_by_phobert)

df_comment.to_csv(MODEL_COMPARISON_CSV, encoding='utf-8-sig')

df_comment

In [12]:
# Quick manual check of underthesea's `sentiment` on a short phrase.
# NOTE(review): the output recorded for this cell is a NotFittedError raised by
# a TfidfTransformer -- possibly a scikit-learn version mismatch with the
# model shipped by underthesea; TODO confirm.
sentiment('hay qua')

NotFittedError: This TfidfTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.