In [121]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
nltk.download('punkt')
nltk.download('wordnet')

bbc = pd.read_csv('bbc.csv')
bbc.head()
print(bbc.shape)
print(bbc.info())
print(bbc.head())   

(2225, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   topic   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


[nltk_data] Downloading package punkt to c:\Users\nthuy\AppData\Local\
[nltk_data]     Programs\Python\Python312\lib\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to c:\Users\nthuy\AppData\Loca
[nltk_data]     l\Programs\Python\Python312\lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [122]:
# Check for missing data and print the total number of missing values for each column
print("Missing data:", bbc.isnull().sum())

# Check for duplicate rows in the dataset and print the total number of duplicate rows
print("Duplicate data:", bbc.duplicated().sum())

# Display the unique values of the 'topic' column to examine the different categories
print("Unique values in the 'topic' column:", bbc['topic'].unique())


Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [123]:
# Display summary statistics of the cleaned dataset
bbc.describe()
print(bbc.head())


                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [124]:
bbc["text"] = bbc["text"].str.lower()
print(bbc.head())

                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person' for top legal job\r\n\r\nthe "bes...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blair's tory 'lies'\r\n\r\ntony bl...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


In [125]:
bbc.loc[:, "text"] = bbc["text"].str.replace("'s", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\r\n\r\ncontro...,sport
1,"best person' for top legal job\r\n\r\nthe ""bes...",politics
2,viewers to be able to shape tv\r\n\r\nimagine ...,tech
3,fox attacks blair tory 'lies'\r\n\r\ntony blai...,politics
4,microsoft debuts security tools\r\n\r\nmicroso...,tech
...,...,...
2220,michael film signals 'retirement'\r\n\r\nsinge...,entertainment
2221,ray charles studio becomes museum\r\n\r\na mus...,entertainment
2222,chancellor rallies labour voters\r\n\r\ngordon...,politics
2223,oscar nominees gear up for lunch\r\n\r\nleonar...,entertainment


In [126]:
# Loại bỏ các ký tự đặc biệt trong cột 'text'
bbc["text"] = bbc["text"].str.replace(r"[;!?,.:()\"']", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\r\n\r\ncontro...,sport
1,best person for top legal job\r\n\r\nthe best ...,politics
2,viewers to be able to shape tv\r\n\r\nimagine ...,tech
3,fox attacks blair tory lies\r\n\r\ntony blair ...,politics
4,microsoft debuts security tools\r\n\r\nmicroso...,tech
...,...,...
2220,michael film signals retirement\r\n\r\nsinger ...,entertainment
2221,ray charles studio becomes museum\r\n\r\na mus...,entertainment
2222,chancellor rallies labour voters\r\n\r\ngordon...,politics
2223,oscar nominees gear up for lunch\r\n\r\nleonar...,entertainment


In [127]:
# Create set of stop words from sklearn
stop_words = set(ENGLISH_STOP_WORDS)

# This function is use for removing stop words in text 
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

bbc.loc[:, 'text'] = bbc['text'].apply(remove_stopwords)

bbc.head()

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech


In [128]:
import re

# Hàm để loại bỏ các từ viết tắt như "&", "/",":"
def remove_abbreviations_and_slash(text):
    abbreviations = r'\b(&|/)\b'
    cleaned_text = re.sub(abbreviations, '', text)
    # Loại bỏ khoảng trắng thừa sau khi xóa
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Áp dụng hàm cho cột 'text' trong DataFrame
bbc.loc[:,'text'] = bbc['text'].apply(remove_abbreviations_and_slash)
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech
...,...,...
2220,michael film signals retirement singer george ...,entertainment
2221,ray charles studio museum museum dedicated car...,entertainment
2222,chancellor rallies labour voters gordon brown ...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment


In [129]:
from nltk.stem import WordNetLemmatizer

# Tạo lemmatizer
lemmatizer = WordNetLemmatizer()
bbc["text"] = bbc["text"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appoint ...,politics
2,viewers able shape tv imagine edit titanic wat...,tech
3,fox attack blair tory lie tony blair lie take ...,politics
4,microsoft debut security tool microsoft releas...,tech
...,...,...
2220,michael film signal retirement singer george m...,entertainment
2221,ray charles studio museum museum dedicate care...,entertainment
2222,chancellor rally labour voters gordon brown is...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment


In [130]:

if 'text' in bbc.columns:
    # Hàm để loại bỏ số và đơn vị tiền tệ
    def remove_currency_and_numbers_with_logging(text):
        removed_items = re.findall(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', text)
        cleaned_text = re.sub(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text, removed_items
    
    # Áp dụng hàm cho cột 'text' và lưu lại các mục bị loại bỏ
    removed_data = []
    def process_text(row):
        cleaned_text, removed_items = remove_currency_and_numbers_with_logging(row)
        removed_data.extend(removed_items)
        return cleaned_text

    # Sử dụng .loc để thay đổi giá trị trong DataFrame
    bbc.loc[:, 'text'] = bbc['text'].apply(process_text)

    # Lưu danh sách các mục bị loại bỏ vào một biến
    removed_data = list(set(removed_data)) 

    # Xuất danh sách các mục bị loại bỏ
    print(f"Số lượng từ bị xóa: {len(removed_data)}")
    print("Các mục bị loại bỏ:", removed_data)

Số lượng từ bị xóa: 2287
Các mục bị loại bỏ: ['387500', 'â£546m', 'â£205000', '$5934bn', 'â£514m', '48000', 'â£910m', '5037', '999', '$152', '10416', '$13666', 'â£39000', 'â£27473m', '79000', '$20bn', '1989', 'â£188bn', '1008', 'â£519bn', '027', '48', '537', '$4584m', '$13652', 'â£115m', '$7782m', 'â£264m', '623', '$52bn', 'â£65m', '23500', 'â£374m', '111', '$16bn', 'â£52590', '203', '$4974', '833', 'â£11724929', '267', '5298', 'â£76m', '$46bn', '$107bn', '322', '271', '83000', 'â£230bn', '$125m', '0870', '6962', 'â£1m', '$187bn', '$1756bn', '$13006', '$800m', 'â£30m', 'â£64bn', '1997', '206900', 'â£161m', '$93bn', '28041', '$148', 'â£135', '08', '158', 'â£1500', 'â£266582', '$408m', '265', '107', '2115', '$32m', '828', 'â£13bn', '3367', 'â£191m', '1951', '$186bn', 'â£3921m', '$103500', '4000', '$613bn', '1474', '1944', '0227', '2940', 'â£324bn', '771', '12', 'â£572m', '1930', 'â£79m', '9300', '$119bn', '219', '487939', 'â£152', '$280bn', '$504m', '$170', '164', '440700', 'â£44m', '44'

In [131]:

from collections import Counter

# Kiểm tra sự tồn tại của cột 'text'
if 'text' in bbc.columns:
    # Đếm tần suất của các từ
    all_words = ' '.join(bbc['text']).split()
    word_counts = Counter(all_words)

    # Lọc ra các từ xuất hiện từ 5 lần trở lên
    common_words = {word for word, count in word_counts.items() if count >= 5}
    
    # In số lượng từ phổ biến
    print(f"Số lượng từ phổ biến: {len(common_words)}")

    # Danh sách từ bị xóa (từ hiếm gặp)
    removed_words = {word for word in word_counts.keys() if word not in common_words}
    
    # In danh sách các từ hiếm gặp
    print(f"Số lượng từ bị xóa: {len(removed_words)}")

    bbc.loc[:, 'text'] = bbc['text'].apply(lambda x: ' '.join([word for word in x.split() if word in common_words]))

Số lượng từ phổ biến: 8790
Số lượng từ bị xóa: 17375


In [132]:
# Kiểm tra sự tồn tại của cột 'text'
if 'text' in bbc.columns:
    # Hàm để loại bỏ dấu gạch ngang '-' không liên kết với từ
    def clean_text(text):
        # Loại bỏ dấu gạch ngang không có từ đi kèm
        cleaned_text = re.sub(r'(?<=\s)-(?=\s)', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text
    
    # Áp dụng hàm cho cột 'text'
    bbc.loc[:, 'text'] = bbc['text'].apply(clean_text)
print(bbc)


                                                   text          topic
0     dallaglio man end controversy lawrence dallagl...          sport
1     best person legal job best person job appoint ...       politics
2     viewers able shape tv imagine edit titanic wat...           tech
3     fox attack blair tory lie tony blair lie take ...       politics
4     microsoft debut security tool microsoft releas...           tech
...                                                 ...            ...
2220  michael film signal retirement singer george m...  entertainment
2221  ray charles studio museum museum dedicate care...  entertainment
2222  chancellor rally labour voters gordon brown is...       politics
2223  oscar nominees gear leonardo dicaprio jamie fo...  entertainment
2224  california set fin spyware makers computer pro...           tech

[2225 rows x 2 columns]


In [133]:
# Find duplicate rows
duplicate_rows = bbc[bbc.duplicated(subset=['text'], keep=False)]

# Print duplicate datas
grouped_duplicates = duplicate_rows.sort_values(by=['text']).groupby('text')
for idx, (group, rows) in enumerate(grouped_duplicates):
    if idx >= 5:
        break
    print(f"Duplicate group: {group}")
    print(rows)

# Check total duplicated rows 
print("Duplicate data:", bbc.duplicated().sum())

# Start cleaning the data by removing duplicate rows
bbc = bbc.drop_duplicates()

# print(bbc)

Duplicate group: 2d metal slug offer retro fun like drill sergeant past metal slug today gamers slick visuals fancy 2d consider retro release years ago frantic shooter end year yes include halo simply choose 2d level video game blast encounter toughest game likely play enemies live pile pressure players battle soldier zombies giant alien mention huge boss guard level gameplay pepper moments genius fan robotic note title refer instead vast array vehicles offer game stuff bizarre hardware tank jet offer iraq justice joy thank ultra control tough crack addictive mere metal slug cheap slice fry man say course ignore lack do visual modern time blockbuster title offer fresh paint favour real innovation metal slug fresh air era xbox gate eye
                                                   text topic
1325  2d metal slug offer retro fun like drill serge...  tech
1314  2d metal slug offer retro fun like drill serge...  tech
Duplicate group: apple attack source row civil liberties group electr

In [134]:
# Save the cleaned dataset to a new CSV file
bbc.to_csv('bbc_cleaned.csv', index=False)