In [11]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk.download('punkt')
nltk.download('wordnet')

bbc = pd.read_csv('bbc.csv')
bbc.head()
print(bbc.shape)
print(bbc.info())
print(bbc.head())   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...


(2225, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   topic   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
                                                text     topic
0  Dallaglio his own man to the end\n\nControvers...     sport
1  Best person' for top legal job\n\nThe "best pe...  politics
2  Viewers to be able to shape TV\n\nImagine edit...      tech
3  Fox attacks Blair's Tory 'lies'\n\nTony Blair ...  politics
4  Microsoft debuts security tools\n\nMicrosoft i...      tech


In [12]:
# Check for missing data and print the total number of missing values for each column
print("Missing data:", bbc.isnull().sum())

# Check for duplicate rows in the dataset and print the total number of duplicate rows
print("Duplicate data:", bbc.duplicated().sum())

# Display the unique values of the 'topic' column to examine the different categories
print("Unique values in the 'topic' column:", bbc['topic'].unique())


Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [13]:
# Display summary statistics of the cleaned dataset
bbc.describe()
print(bbc.head())


                                                text     topic
0  Dallaglio his own man to the end\n\nControvers...     sport
1  Best person' for top legal job\n\nThe "best pe...  politics
2  Viewers to be able to shape TV\n\nImagine edit...      tech
3  Fox attacks Blair's Tory 'lies'\n\nTony Blair ...  politics
4  Microsoft debuts security tools\n\nMicrosoft i...      tech


In [14]:
bbc["text"] = bbc["text"].str.lower()
print(bbc.head())

                                                text     topic
0  dallaglio his own man to the end\n\ncontrovers...     sport
1  best person' for top legal job\n\nthe "best pe...  politics
2  viewers to be able to shape tv\n\nimagine edit...      tech
3  fox attacks blair's tory 'lies'\n\ntony blair ...  politics
4  microsoft debuts security tools\n\nmicrosoft i...      tech


In [15]:
bbc.loc[:, "text"] = bbc["text"].str.replace("'s", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\n\ncontrovers...,sport
1,"best person' for top legal job\n\nthe ""best pe...",politics
2,viewers to be able to shape tv\n\nimagine edit...,tech
3,fox attacks blair tory 'lies'\n\ntony blair li...,politics
4,microsoft debuts security tools\n\nmicrosoft i...,tech
...,...,...
2220,michael film signals 'retirement'\n\nsinger ge...,entertainment
2221,ray charles studio becomes museum\n\na museum ...,entertainment
2222,chancellor rallies labour voters\n\ngordon bro...,politics
2223,oscar nominees gear up for lunch\n\nleonardo d...,entertainment


In [16]:
# Loại bỏ các ký tự đặc biệt trong cột 'text'
bbc["text"] = bbc["text"].str.replace(r"[;!?,.:()\"']", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\n\ncontrovers...,sport
1,best person for top legal job\n\nthe best pers...,politics
2,viewers to be able to shape tv\n\nimagine edit...,tech
3,fox attacks blair tory lies\n\ntony blair lied...,politics
4,microsoft debuts security tools\n\nmicrosoft i...,tech
...,...,...
2220,michael film signals retirement\n\nsinger geor...,entertainment
2221,ray charles studio becomes museum\n\na museum ...,entertainment
2222,chancellor rallies labour voters\n\ngordon bro...,politics
2223,oscar nominees gear up for lunch\n\nleonardo d...,entertainment


In [17]:
# Create set of stop words from sklearn
stop_words = set(ENGLISH_STOP_WORDS)

# This function is use for removing stop words in text 
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

bbc.loc[:, 'text'] = bbc['text'].apply(remove_stopwords)

bbc.head()

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech


In [18]:
import re

# Hàm để loại bỏ các từ viết tắt như "&", "/",":"
def remove_abbreviations_and_slash(text):
    abbreviations = r'\b(&|/)\b'
    cleaned_text = re.sub(abbreviations, '', text)
    # Loại bỏ khoảng trắng thừa sau khi xóa
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Áp dụng hàm cho cột 'text' trong DataFrame
bbc.loc[:,'text'] = bbc['text'].apply(remove_abbreviations_and_slash)
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech
...,...,...
2220,michael film signals retirement singer george ...,entertainment
2221,ray charles studio museum museum dedicated car...,entertainment
2222,chancellor rallies labour voters gordon brown ...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment


In [19]:
from nltk.stem import WordNetLemmatizer

# Tạo lemmatizer
lemmatizer = WordNetLemmatizer()
bbc["text"] = bbc["text"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appoint ...,politics
2,viewers able shape tv imagine edit titanic wat...,tech
3,fox attack blair tory lie tony blair lie take ...,politics
4,microsoft debut security tool microsoft releas...,tech
...,...,...
2220,michael film signal retirement singer george m...,entertainment
2221,ray charles studio museum museum dedicate care...,entertainment
2222,chancellor rally labour voters gordon brown is...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment


In [20]:

if 'text' in bbc.columns:
    # Hàm để loại bỏ số và đơn vị tiền tệ
    def remove_currency_and_numbers_with_logging(text):
        removed_items = re.findall(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', text)
        cleaned_text = re.sub(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text, removed_items
    
    # Áp dụng hàm cho cột 'text' và lưu lại các mục bị loại bỏ
    removed_data = []
    def process_text(row):
        cleaned_text, removed_items = remove_currency_and_numbers_with_logging(row)
        removed_data.extend(removed_items)
        return cleaned_text

    # Sử dụng .loc để thay đổi giá trị trong DataFrame
    bbc.loc[:, 'text'] = bbc['text'].apply(process_text)

    # Lưu danh sách các mục bị loại bỏ vào một biến
    removed_data = list(set(removed_data)) 

    # Xuất danh sách các mục bị loại bỏ
    print(f"Số lượng từ bị xóa: {len(removed_data)}")
    print("Các mục bị loại bỏ:", removed_data)

Số lượng từ bị xóa: 2287
Các mục bị loại bỏ: ['397', '$23000', 'â£132', '147', '$345m', '$565m', '$901m', '12', '64', '1441', '$546m', '$4451bn', '$110000', '$18bn', '$41m', '357', '104000', '$15m', '$146m', '193', 'â£150000', '$280bn', '35000', 'â£59000', 'â£98m', '198000', '29', '24', 'â£254bn', '$595', '$170', '2572', '$33bn', 'â£155bn', '1948', '900000', '1812', 'â£936bn', '$480m', 'â£279bn', '$331m', '220', '$260m', '$10', 'â£95m', '1430', '201', '12800', '$171m', '$644m', '$76m', '$675bn', '$5934bn', '$368bn', '2000', 'â£22000', '173', '440', '7', 'â£14bn', '$107m', '0900', '240', 'â£29000', 'â£391bn', 'â£88m', '3028', 'â£129', 'â£4610849', '2900', '185', 'â£13bn', 'â£213m', '56000', '$191m', '$15000', '1313', '1758', '129', '$646m', '199293', '372', 'â£65m', 'â£519bn', '102975', '$185bn', 'â£255bn', 'â£277m', 'â£23', 'â£236m', '164', '$499', '138', '77', '$4193bn', '$95m', '108', '$80m', 'â£572m', '$150', 'â£13500', 'â£37bn', '1728', 'â£160m', '1148876', '$5115', 'â£485m', '189'

In [21]:

from collections import Counter

# Kiểm tra sự tồn tại của cột 'text'
if 'text' in bbc.columns:
    # Đếm tần suất của các từ
    all_words = ' '.join(bbc['text']).split()
    word_counts = Counter(all_words)

    # Lọc ra các từ xuất hiện từ 5 lần trở lên
    common_words = {word for word, count in word_counts.items() if count >= 5}
    
    # In số lượng từ phổ biến
    print(f"Số lượng từ phổ biến: {len(common_words)}")

    # Danh sách từ bị xóa (từ hiếm gặp)
    removed_words = {word for word in word_counts.keys() if word not in common_words}
    
    # In danh sách các từ hiếm gặp
    print(f"Số lượng từ bị xóa: {len(removed_words)}")

    bbc.loc[:, 'text'] = bbc['text'].apply(lambda x: ' '.join([word for word in x.split() if word in common_words]))

Số lượng từ phổ biến: 8790
Số lượng từ bị xóa: 17375


In [22]:

# Kiểm tra sự tồn tại của cột 'text'
if 'text' in bbc.columns:
    # Hàm để loại bỏ dấu gạch ngang '-' không liên kết với từ
    def clean_text(text):
        # Loại bỏ dấu gạch ngang không có từ đi kèm
        cleaned_text = re.sub(r'(?<=\s)-(?=\s)', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text
    
    # Áp dụng hàm cho cột 'text'
    bbc.loc[:, 'text'] = bbc['text'].apply(clean_text)
bbc


Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appoint ...,politics
2,viewers able shape tv imagine edit titanic wat...,tech
3,fox attack blair tory lie tony blair lie take ...,politics
4,microsoft debut security tool microsoft releas...,tech
...,...,...
2220,michael film signal retirement singer george m...,entertainment
2221,ray charles studio museum museum dedicate care...,entertainment
2222,chancellor rally labour voters gordon brown is...,politics
2223,oscar nominees gear leonardo dicaprio jamie fo...,entertainment


In [23]:
# Find duplicate rows
duplicate_rows = bbc[bbc.duplicated(subset=['text'], keep=False)]

# Print duplicate datas
grouped_duplicates = duplicate_rows.sort_values(by=['text']).groupby('text')
for idx, (group, rows) in enumerate(grouped_duplicates):
    if idx >= 5:
        break
    print(f"Duplicate group: {group}")
    print(rows)

# Check total duplicated rows 
print("Duplicate data:", bbc.duplicated().sum())

# Start cleaning the data by removing duplicate rows
bbc = bbc.drop_duplicates()

# print(bbc)

Duplicate group: 2d metal slug offer retro fun like drill sergeant past metal slug today gamers slick visuals fancy 2d consider retro release years ago frantic shooter end year yes include halo simply choose 2d level video game blast encounter toughest game likely play enemies live pile pressure players battle soldier zombies giant alien mention huge boss guard level gameplay pepper moments genius fan robotic note title refer instead vast array vehicles offer game stuff bizarre hardware tank jet offer iraq justice joy thank ultra control tough crack addictive mere metal slug cheap slice fry man say course ignore lack do visual modern time blockbuster title offer fresh paint favour real innovation metal slug fresh air era xbox gate eye
                                                   text topic
1325  2d metal slug offer retro fun like drill serge...  tech
1314  2d metal slug offer retro fun like drill serge...  tech
Duplicate group: apple attack source row civil liberties group electr

In [28]:
# Tách text thành các từ riêng biệt
bbc.loc[:,'cleaned_text'] = bbc['text'].apply(nltk.wordpunct_tokenize)
bbc

Unnamed: 0,text,topic,cleaned_text
0,dallaglio man end controversy lawrence dallagl...,sport,"[dallaglio, man, end, controversy, lawrence, d..."
1,best person legal job best person job appoint ...,politics,"[best, person, legal, job, best, person, job, ..."
2,viewers able shape tv imagine edit titanic wat...,tech,"[viewers, able, shape, tv, imagine, edit, tita..."
3,fox attack blair tory lie tony blair lie take ...,politics,"[fox, attack, blair, tory, lie, tony, blair, l..."
4,microsoft debut security tool microsoft releas...,tech,"[microsoft, debut, security, tool, microsoft, ..."
...,...,...,...
2219,rapper cent end protege feud rapper cent end p...,entertainment,"[rapper, cent, end, protege, feud, rapper, cen..."
2220,michael film signal retirement singer george m...,entertainment,"[michael, film, signal, retirement, singer, ge..."
2221,ray charles studio museum museum dedicate care...,entertainment,"[ray, charles, studio, museum, museum, dedicat..."
2222,chancellor rally labour voters gordon brown is...,politics,"[chancellor, rally, labour, voters, gordon, br..."


In [27]:
# Save the cleaned dataset to a new CSV file
bbc.to_csv('bbc_cleaned.csv', index=False)