In [43]:
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

bbc = pd.read_csv('bbc.csv')
bbc.head()
print(bbc.shape)
print(bbc.info())
print(bbc.head())   

(2225, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   topic   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [44]:
# Check for missing data and print the total number of missing values for each column
print("Missing data:", bbc.isnull().sum())

# Check for duplicate rows in the dataset and print the total number of duplicate rows
print("Duplicate data:", bbc.duplicated().sum())

# Display the unique values of the 'topic' column to examine the different categories
print("Unique values in the 'topic' column:", bbc['topic'].unique())


Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [45]:
# Display summary statistics of the cleaned dataset
bbc.describe()
print(bbc.head())


                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


Converts all the text in the 'text' column of the bbc DataFrame to lowercase.

In [46]:
bbc["text"] = bbc["text"].str.lower()
print(bbc.head())

                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person' for top legal job\r\n\r\nthe "bes...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blair's tory 'lies'\r\n\r\ntony bl...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


This operation helps standardize the text by eliminating possessive forms, making the text easier to process or analyze without unnecessary distinctions like possession.

The function is used to remove the possessive suffix "'s" from the words in the 'text' column of the bbc DataFrame

In [47]:
bbc.loc[:, "text"] = bbc["text"].str.replace("'s", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\r\n\r\ncontro...,sport
1,"best person' for top legal job\r\n\r\nthe ""bes...",politics
2,viewers to be able to shape tv\r\n\r\nimagine ...,tech
3,fox attacks blair tory 'lies'\r\n\r\ntony blai...,politics
4,microsoft debuts security tools\r\n\r\nmicroso...,tech
...,...,...
2220,michael film signals 'retirement'\r\n\r\nsinge...,entertainment
2221,ray charles studio becomes museum\r\n\r\na mus...,entertainment
2222,chancellor rallies labour voters\r\n\r\ngordon...,politics
2223,oscar nominees gear up for lunch\r\n\r\nleonar...,entertainment


Remove special characters from the 'text' column in the bbc DataFrame

In [48]:
bbc["text"] = bbc["text"].str.replace(r"[;!?,.:()\"']", "", regex=True)
bbc

Unnamed: 0,text,topic
0,dallaglio his own man to the end\r\n\r\ncontro...,sport
1,best person for top legal job\r\n\r\nthe best ...,politics
2,viewers to be able to shape tv\r\n\r\nimagine ...,tech
3,fox attacks blair tory lies\r\n\r\ntony blair ...,politics
4,microsoft debuts security tools\r\n\r\nmicroso...,tech
...,...,...
2220,michael film signals retirement\r\n\r\nsinger ...,entertainment
2221,ray charles studio becomes museum\r\n\r\na mus...,entertainment
2222,chancellor rallies labour voters\r\n\r\ngordon...,politics
2223,oscar nominees gear up for lunch\r\n\r\nleonar...,entertainment


The code you've provided is used to remove stop words from the 'text' column in the bbc DataFrame using a set of stop words from the sklearn library.

In [49]:
# Create set of stop words from sklearn
stop_words = set(ENGLISH_STOP_WORDS)

# This function is use for removing stop words in text 
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

bbc.loc[:, 'text'] = bbc['text'].apply(remove_stopwords)

bbc.head()

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech


The code you've provided defines a function to remove specific abbreviations like '&' and '/' from the 'text' column in the bbc DataFrame.

In [50]:
import re

def remove_abbreviations_and_slash(text):
    abbreviations = r'\b(&|/)\b'
    cleaned_text = re.sub(abbreviations, '', text)
    # Remove extra spaces after clean
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# Apply the function to the 'text' column in the DataFrame
bbc.loc[:,'text'] = bbc['text'].apply(remove_abbreviations_and_slash)
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appointe...,politics
2,viewers able shape tv imagine editing titanic ...,tech
3,fox attacks blair tory lies tony blair lied to...,politics
4,microsoft debuts security tools microsoft rele...,tech
...,...,...
2220,michael film signals retirement singer george ...,entertainment
2221,ray charles studio museum museum dedicated car...,entertainment
2222,chancellor rallies labour voters gordon brown ...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment


The code you've provided applies lemmatization to the text in the 'text' column of the bbc DataFrame using NLTK's WordNet Lemmatizer. This process reduces words to their base or root form, which can help in text normalization for natural language processing (NLP) tasks.

In [51]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
bbc["text"] = bbc["text"].apply(lambda x: ' '.join([lemmatizer.lemmatize(word, 'v') for word in x.split()]))
bbc

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appoint ...,politics
2,viewers able shape tv imagine edit titanic wat...,tech
3,fox attack blair tory lie tony blair lie take ...,politics
4,microsoft debut security tool microsoft releas...,tech
...,...,...
2220,michael film signal retirement singer george m...,entertainment
2221,ray charles studio museum museum dedicate care...,entertainment
2222,chancellor rally labour voters gordon brown is...,politics
2223,oscar nominees gear lunch leonardo dicaprio ja...,entertainment



This code is designed to remove currency symbols, numbers, and other specified patterns from the 'text' column of the bbc DataFrame.

In [52]:

if 'text' in bbc.columns:
    def remove_currency_and_numbers_with_logging(text):
        removed_items = re.findall(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', text)
        cleaned_text = re.sub(r'\b\d+\b|\$\w*|%\w*|₫\w*|â£\w*|£\w*|â\w*', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text, removed_items
    
    # Apply function to 'text' column and store the removed items
    removed_data = []
    def process_text(row):
        cleaned_text, removed_items = remove_currency_and_numbers_with_logging(row)
        removed_data.extend(removed_items)
        return cleaned_text

    # Use .loc to modify the values in the DataFrame
    bbc.loc[:, 'text'] = bbc['text'].apply(process_text)

    # Stored the list of removed items in variable
    removed_data = list(set(removed_data)) 

    print(f"Number of removed words: {len(removed_data)}")
    print("Removed words:", removed_data)

Number of removed words: 2287
Removed words: ['030', '529', 'â£2659bn', '$84bn', '$147m', '578', '277', '2005', '94', '12600', '1945', 'â£484bn', 'â£132', '1200', 'â£70m', '2850', '$1085m', 'â£275', '$150m', '$53', '$1044', '$1399bn', 'â£28bn', '$155m', '9', 'â£15bn', '$2bn', '$112m', '$130', '0619', 'â£520m', '$447m', 'â£38bn', '12', '$109bn', 'â£59000', '$144bn', '$120bn', '20034', '$8m', 'â£7960', '$818bn', '$387m', '920', '464000', '1215', '$4710', '1125', '1444', '$085', '184', 'â£150000', 'â£245m', '78000', '662', '318', '4341', '137000', 'â£48m', '11000', '$188', '80000', '400', 'â£15m', '1875', '98', 'â£19', '780', 'â£44bn', '19989', '2001', 'â£248bn', '$960m', '397', '2003', '77', 'â£306m', 'â£500m', 'â£263000', '1967', 'â£48bn', 'â£1152m', 'â£458m', '201', '169000', '$4862', '$4bn', '$600m', '1728', '$176m', '615', '$143bn', '$248bn', 'â£182920', '3217', '38000', 'â£3788m', '$50m', '$47m', '449', '$74bn', 'â£178m', 'â£102m', '3997', '6650', '101115', '623', '$38bn', 'â£317bn'

Eliminate 5 rare words:

In [53]:

from collections import Counter

if 'text' in bbc.columns:
    # Count the frequency of words
    all_words = ' '.join(bbc['text']).split()
    word_counts = Counter(all_words)

    # Filter out words that appear 5 or more times
    common_words = {word for word, count in word_counts.items() if count >= 5}
    print(f"Number of common words: {len(common_words)}")

    # List of removed words (rare words)
    removed_words = {word for word in word_counts.keys() if word not in common_words}
    print(f"Number of words deleted: {len(removed_words)}")

    bbc.loc[:, 'text'] = bbc['text'].apply(lambda x: ' '.join([word for word in x.split() if word in common_words]))

Number of common words: 8790
Number of words deleted: 17375


Removes hyphens (-) that are not connected to words

In [54]:
if 'text' in bbc.columns:
    def clean_text(text):
        # Remove hyphens that are not attached to words
        cleaned_text = re.sub(r'(?<=\s)-(?=\s)', '', text)
        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return cleaned_text
    
    # Apply the function to the 'text' column
    bbc.loc[:, 'text'] = bbc['text'].apply(clean_text)
print(bbc)


                                                   text          topic
0     dallaglio man end controversy lawrence dallagl...          sport
1     best person legal job best person job appoint ...       politics
2     viewers able shape tv imagine edit titanic wat...           tech
3     fox attack blair tory lie tony blair lie take ...       politics
4     microsoft debut security tool microsoft releas...           tech
...                                                 ...            ...
2220  michael film signal retirement singer george m...  entertainment
2221  ray charles studio museum museum dedicate care...  entertainment
2222  chancellor rally labour voters gordon brown is...       politics
2223  oscar nominees gear leonardo dicaprio jamie fo...  entertainment
2224  california set fin spyware makers computer pro...           tech

[2225 rows x 2 columns]


This script identifies, inspects, and removes duplicate rows from the bbc DataFrame:

In [55]:
# Find duplicate rows
duplicate_rows = bbc[bbc.duplicated(subset=['text'], keep=False)]

# Print duplicate datas
grouped_duplicates = duplicate_rows.sort_values(by=['text']).groupby('text')
for idx, (group, rows) in enumerate(grouped_duplicates):
    if idx >= 5:
        break
    print(f"Duplicate group: {group}")
    print(rows)

# Check total duplicated rows 
print("Duplicate data:", bbc.duplicated().sum())

# Start cleaning the data by removing duplicate rows
bbc = bbc.drop_duplicates()

bbc

Duplicate group: 2d metal slug offer retro fun like drill sergeant past metal slug today gamers slick visuals fancy 2d consider retro release years ago frantic shooter end year yes include halo simply choose 2d level video game blast encounter toughest game likely play enemies live pile pressure players battle soldier zombies giant alien mention huge boss guard level gameplay pepper moments genius fan robotic note title refer instead vast array vehicles offer game stuff bizarre hardware tank jet offer iraq justice joy thank ultra control tough crack addictive mere metal slug cheap slice fry man say course ignore lack do visual modern time blockbuster title offer fresh paint favour real innovation metal slug fresh air era xbox gate eye
                                                   text topic
1325  2d metal slug offer retro fun like drill serge...  tech
1314  2d metal slug offer retro fun like drill serge...  tech
Duplicate group: apple attack source row civil liberties group electr

Unnamed: 0,text,topic
0,dallaglio man end controversy lawrence dallagl...,sport
1,best person legal job best person job appoint ...,politics
2,viewers able shape tv imagine edit titanic wat...,tech
3,fox attack blair tory lie tony blair lie take ...,politics
4,microsoft debut security tool microsoft releas...,tech
...,...,...
2219,rapper cent end protege feud rapper cent end p...,entertainment
2220,michael film signal retirement singer george m...,entertainment
2221,ray charles studio museum museum dedicate care...,entertainment
2222,chancellor rally labour voters gordon brown is...,politics


Save the cleaned dataset to a new CSV file:

In [56]:
bbc.to_csv('bbc_cleaned.csv', index=False)