In [1]:
import pandas as pd

bbc = pd.read_csv('bbc.csv')
print(bbc.shape)
print(bbc.info())
print(bbc.head())

(2225, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2225 non-null   object
 1   topic   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB
None
                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [3]:
# Check for missing data and print the total number of missing values for each column
print("Missing data:", bbc.isnull().sum())

# Check for duplicate rows in the dataset and print the total number of duplicate rows
print("Duplicate data:", bbc.duplicated().sum())

# Display the unique values of the 'topic' column to examine the different categories
print("Unique values in the 'topic' column:", bbc['topic'].unique())

Missing data: text     0
topic    0
dtype: int64
Duplicate data: 98
Unique values in the 'topic' column: ['sport' 'politics' 'tech' 'business' 'entertainment']


- There are 98 duplicate rows in the dataset.
- There is no missing data.
- There are 5 unique values in the 'topic' column, and none of them are incorrect or inconsistent.
- Therefore, we only need to remove the duplicate rows.

In [6]:
# Start cleaning the data by removing duplicate rows
bbc = bbc.drop_duplicates()

# Display summary statistics of the cleaned dataset
bbc.describe()
print(bbc.head())


                                                text     topic
0  Dallaglio his own man to the end\r\n\r\nContro...     sport
1  Best person' for top legal job\r\n\r\nThe "bes...  politics
2  Viewers to be able to shape TV\r\n\r\nImagine ...      tech
3  Fox attacks Blair's Tory 'lies'\r\n\r\nTony Bl...  politics
4  Microsoft debuts security tools\r\n\r\nMicroso...      tech


In [7]:
bbc["text"] = bbc["text"].str.lower()
print(bbc.head())

                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person' for top legal job\r\n\r\nthe "bes...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blair's tory 'lies'\r\n\r\ntony bl...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


In [11]:
bbc["text"] = bbc["text"].str.replace(r"[;!?,.\'\"']", "", regex=True)
print(bbc.head())



                                                text     topic
0  dallaglio his own man to the end\r\n\r\ncontro...     sport
1  best person for top legal job\r\n\r\nthe best ...  politics
2  viewers to be able to shape tv\r\n\r\nimagine ...      tech
3  fox attacks blairs tory lies\r\n\r\ntony blair...  politics
4  microsoft debuts security tools\r\n\r\nmicroso...      tech


In [12]:
from sklearn.model_selection import train_test_split

# Extract the 'text' column as the input feature
texts = bbc['text']

# Extract the 'topic' column as the target label
labels = bbc['topic']

# Step 2: Split the dataset into training and testing sets with 80% for training and 20% for testing
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Display the lengths of the training and testing sets for both input features and labels
len(X_train), len(X_test), len(y_train), len(y_test)


(1701, 426, 1701, 426)

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 3: Preprocess the text data using TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit features to 5000

# Fit and transform the training data to create the TF-IDF matrix for the training set
X_train_tfidf = tfidf.fit_transform(X_train)

# Transform the testing data using the fitted TF-IDF vectorizer (no fitting on the test data)
X_test_tfidf = tfidf.transform(X_test)

# Print the TF-IDF matrix for the training set
print(X_train_tfidf)

# Print the shape of the transformed testing set
print(X_test_tfidf.shape)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 197387 stored elements and shape (1701, 5000)>
  Coords	Values
  (0, 577)	0.26222699102579267
  (0, 344)	0.16547965560011532
  (0, 3029)	0.2731293578781763
  (0, 1827)	0.4943461207361027
  (0, 2003)	0.12383354340295713
  (0, 2984)	0.06133337449010956
  (0, 1474)	0.07121690608757793
  (0, 1805)	0.22211410980029725
  (0, 1830)	0.09407526477399572
  (0, 1246)	0.1448053679818659
  (0, 2873)	0.06231864118409581
  (0, 4898)	0.1319796682408119
  (0, 3887)	0.11438039858187314
  (0, 2982)	0.07592414660000676
  (0, 59)	0.06427278386169191
  (0, 403)	0.15985616813936868
  (0, 710)	0.07592414660000676
  (0, 1418)	0.17946385272372503
  (0, 4688)	0.10292455081310799
  (0, 2194)	0.20035421705633613
  (0, 1386)	0.04556114479135536
  (0, 2802)	0.08740899700859754
  (0, 3927)	0.017370023540195442
  (0, 1793)	0.07543002979163956
  (0, 3802)	0.07085377496180187
  :	:
  (1700, 3)	0.02020068329952273
  (1700, 3843)	0.024471917924631163
  (1700, 1

In [36]:
# Save the cleaned dataset to a new CSV file
bbc_cleaned.to_csv('bbc_cleaned.csv', index=False)