In [1]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

bbc = pd.read_csv('bbc.csv')
bbc.head()
print(bbc.shape)
print(bbc.info())
print(bbc.head())   

(2225, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   topic   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [2]:
# Check for missing data and print the total number of missing values for each column
print("Missing data:", bbc.isnull().sum())

# Check for duplicate rows in the dataset and print the total number of duplicate rows
print("Duplicate data:", bbc.duplicated().sum())

# Display the unique values of the 'topic' column to examine the different categories
print("Unique values in the 'topic' column:", bbc['topic'].unique())

Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [3]:
# Display summary statistics of the cleaned dataset
bbc.describe()
print(bbc.head())


                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [4]:
bbc["text"] = bbc["text"].str.lower()
print(bbc.head())

                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person' for top legal job\r\n\r\nthe "bes...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blair's tory 'lies'\r\n\r\ntony bl...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


In [5]:
# Loại bỏ dấu câu và ký tự đặc biệt
bbc["text"] = bbc["text"].str.replace(r"[;!?,.()\"']", "", regex=True)

print(bbc.head())


                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person for top legal job\r\n\r\nthe best ...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blairs tory lies\r\n\r\ntony blair...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


In [6]:
# Create set of stop words from sklearn
stop_words = set(ENGLISH_STOP_WORDS)

# This function is use for removing stop words in text 
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

bbc.loc[:, 'text'] = bbc['text'].apply(remove_stopwords)

bbc.head()

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blairs tory lies tony blair lied t...,politics
4,microsoft debuts security tools microsoft rele...,tech


In [7]:
import nltk
from nltk.stem import WordNetLemmatizer

# Tạo lemmatizer
lemmatizer = WordNetLemmatizer()

# Ví dụ dữ liệu
bbc["text"] = bbc["text"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))

print(bbc.head())


                                                text     topic
0  dallaglio man end controversy lawrence dallagl...     sport
1  best person legal job best person job appoint ...  politics
2  viewers able shape tv imagine edit titanic wat...      tech
3  fox attack blairs tory lie tony blair lie take...  politics
4  microsoft debut security tool microsoft releas...      tech


In [8]:
# Find duplicate rows
# Tìm các dòng trùng lặp dựa trên cột 'text'
duplicate_rows = bbc[bbc.duplicated(subset=['text'], keep=False)]

# Hiển thị các nhóm dữ liệu trùng
grouped_duplicates = duplicate_rows.sort_values(by=['text']).groupby('text')
for group, rows in grouped_duplicates:
    print(f"Duplicate group: {group}")
    print(rows)

print(duplicate_rows)

# Start cleaning the data by removing duplicate rows
bbc = bbc.drop_duplicates()

print(bbc)



Duplicate group: 2d metal slug offer retro fun like drill sergeant past metal slug 3 wake-up todays gamers molly-coddled slick visuals fancy trimmings hand-animated sprites 2d side-scrolling consider retro release arcades years ago frantic shooter end joypad year yes include halo 2 simply choose grunt wade 2d side-scrolling level hectic video game blast encounter toughest game likely play hordes enemies live pile pressure players battle soldier snowmen zombies giant crab alien mention huge screen-filling boss guard level shoot-anything-that-moves gameplay pepper moments old-school genius fan robotic gastropods note title refer instead vast array vehicles offer game stuff bizarre hardware tank jet submarine commandeer cannon-toting camels elephants ostriches - weaponry offer acre iraq doling justice joy thank ultra responsive control tough nut crack addictive gag mere â£20 metal slug 3 cheap slice fry spud man say course ignore lack do visual fireworks modern blasters time blockbuster t

In [9]:
# Save the cleaned dataset to a new CSV file
bbc.to_csv('bbc_cleaned.csv', index=False)