In [1]:
raw_text = """
<p>This is an example text with some <b>HTML tags</b> and special characters like &amp;.</p>
It also has some extra spaces  and mixed case.
Let's see how we can clean it!
"""

In [2]:
import re

In [3]:
def clean_text(text):
    # Loại bỏ HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Loại bỏ ký tự đặc biệt và dấu câu
    text = re.sub(r'[^\w\s]', '', text)
    # Chuyển về chữ thường
    text = text.lower()
    # Xóa khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    return text
cleaned_text = clean_text(raw_text)
print("Cleaned text:", cleaned_text)

Cleaned text: this is an example text with some html tags and special characters like amp it also has some extra spaces and mixed case lets see how we can clean it


In [4]:
import unicodedata

def normalize_text(text):
    # Chuẩn hóa Unicode
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    # Ở đây có thể thêm xử lý lỗi chính tả nếu cần
    return text

normalized_text = normalize_text(cleaned_text)
print("Normalized text:", normalized_text)

Normalized text: this is an example text with some html tags and special characters like amp it also has some extra spaces and mixed case lets see how we can clean it


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

stopword_removed_text = remove_stopwords(normalized_text)
print("Stopword removed text:", stopword_removed_text)

Stopword removed text: example text html tags special characters like amp also extra spaces mixed case lets see clean


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luuvi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in word_tokens]
    return ' '.join(lemmatized_words)

lemmatized_text = lemmatize_text(stopword_removed_text)
print("Lemmatized text:", lemmatized_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luuvi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Lemmatized text: example text html tag special character like amp also extra space mixed case let see clean


In [7]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\luuvi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")

def spacy_pos_tagging(text):
    doc = nlp(text)
    pos_tags = [(token.text, token.pos_) for token in doc]
    return pos_tags

#Giả sử lemmatized_text đã được định nghĩa.
spacy_pos_tagged_text = spacy_pos_tagging(lemmatized_text)
print("spaCy POS tagged text:", spacy_pos_tagged_text)

spaCy POS tagged text: [('example', 'NOUN'), ('text', 'NOUN'), ('html', 'PROPN'), ('tag', 'NOUN'), ('special', 'ADJ'), ('character', 'NOUN'), ('like', 'ADP'), ('amp', 'PROPN'), ('also', 'ADV'), ('extra', 'ADJ'), ('space', 'NOUN'), ('mixed', 'ADJ'), ('case', 'NOUN'), ('let', 'AUX'), ('see', 'VERB'), ('clean', 'ADJ')]


In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")

def extract_ner(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

ner_entities = extract_ner(normalized_text) #Dùng normalized text thì hơn
print("NER entities:", ner_entities)

NER entities: []


In [10]:
from transformers import pipeline

def analyze_sentiment(text):
    sentiment_pipeline = pipeline("sentiment-analysis")
    result = sentiment_pipeline(text)[0]
    return result

sentiment = analyze_sentiment(normalized_text) #Dùng normalized text thì hơn
print("Sentiment:", sentiment)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment: {'label': 'NEGATIVE', 'score': 0.9427444338798523}


In [11]:
from transformers import BertTokenizerFast

def bert_tokenize(text):
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokens = tokenizer.tokenize(text)
    return tokens

tokenized_text = bert_tokenize(normalized_text) #Dùng normalized text thì hơn
print("Tokenized text:", tokenized_text)

Tokenized text: ['this', 'is', 'an', 'example', 'text', 'with', 'some', 'html', 'tags', 'and', 'special', 'characters', 'like', 'amp', 'it', 'also', 'has', 'some', 'extra', 'spaces', 'and', 'mixed', 'case', 'lets', 'see', 'how', 'we', 'can', 'clean', 'it']


In [12]:
def convert_tokens_to_ids(tokens):
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    return input_ids

token_ids = convert_tokens_to_ids(tokenized_text)
print("Token IDs:", token_ids)

Token IDs: [2023, 2003, 2019, 2742, 3793, 2007, 2070, 16129, 22073, 1998, 2569, 3494, 2066, 23713, 2009, 2036, 2038, 2070, 4469, 7258, 1998, 3816, 2553, 11082, 2156, 2129, 2057, 2064, 4550, 2009]
