In [25]:
import string

import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Read the BBC articles file into a DataFrame
bbc = pd.read_csv(filepath_or_buffer='bbc.csv')

# Preview the first five rows
bbc.head(n=5)

Unnamed: 0,text,topic
0,Dallaglio his own man to the end\n\nControvers...,sport
1,"Best person' for top legal job\n\nThe ""best pe...",politics
2,Viewers to be able to shape TV\n\nImagine edit...,tech
3,Fox attacks Blair's Tory 'lies'\n\nTony Blair ...,politics
4,Microsoft debuts security tools\n\nMicrosoft i...,tech


In [26]:
# Count the missing values in every column
missing_per_column = bbc.isnull().sum()
print("Missing data:", missing_per_column)

# Count the rows flagged as duplicates
duplicate_row_count = bbc.duplicated().sum()
print("Duplicate data:", duplicate_row_count)

# List the distinct categories present in the 'topic' column
topic_labels = bbc['topic'].unique()
print("Unique values in the 'topic' column:", topic_labels)

Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [27]:
# De-duplicate: keep the first occurrence of each repeated row; the raw frame is left untouched
bbc_cleaned = bbc.drop_duplicates(keep='first')

# Summarise the de-duplicated frame
bbc_cleaned.describe()


Unnamed: 0,text,topic
count,2127,2127
unique,2127,5
top,Oscar nominees gear up for lunch\n\nLeonardo D...,sport
freq,1,505


In [28]:
# Collect sklearn's built-in English stop words into a plain set for membership tests
stop_words = set()
stop_words.update(ENGLISH_STOP_WORDS)

# Remove stop words from a piece of text
def remove_stopwords(text, stopwords=None):
    """Return ``text`` with its stop words removed.

    The text is split on whitespace. Each token is looked up case-insensitively
    after stripping leading/trailing ASCII punctuation, so stop words that touch
    punctuation (``"the,"``, ``"it."``, ``'"The'``) are dropped as well. Tokens
    that are kept are returned unchanged (original case and punctuation), joined
    by single spaces; the original whitespace (including newlines) is collapsed.

    Parameters
    ----------
    text : str
        The text to filter.
    stopwords : collection of str, optional
        Lower-case words to remove. When omitted, the notebook-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The filtered text.
    """
    vocabulary = stop_words if stopwords is None else stopwords
    kept_tokens = [
        token
        for token in text.split()
        # Compare the bare word, not the raw token, so "the," still matches "the".
        if token.strip(string.punctuation).lower() not in vocabulary
    ]
    return ' '.join(kept_tokens)

# Build a new frame with the filtered text rather than writing into the
# de-duplicated frame in place: assigning through .loc on a frame that came
# from row filtering can raise SettingWithCopyWarning, and .assign leaves the
# frame shown by the earlier cell unmodified.
bbc_cleaned = bbc_cleaned.assign(text=bbc_cleaned['text'].apply(remove_stopwords))

bbc_cleaned.head()

Unnamed: 0,text,topic
0,Dallaglio man end Controversy Lawrence Dallagl...,sport
1,"Best person' legal job ""best person job"" appoi...",politics
2,Viewers able shape TV Imagine editing Titanic ...,tech
3,Fox attacks Blair's Tory 'lies' Tony Blair lie...,politics
4,Microsoft debuts security tools Microsoft rele...,tech


In [29]:
# Persist the cleaned frame as CSV, leaving out the row index
bbc_cleaned.to_csv(path_or_buf='bbc_cleaned.csv', index=False)