In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('bbc_data.csv')
df.head()

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    2225 non-null   object
 1   labels  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [4]:
df.isnull().sum()

data      0
labels    0
dtype: int64

In [5]:
df['labels'].value_counts()

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

All the labels are of equal sizes

In [6]:
#preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text: remove stopwords, punctuation, and lemmatize
def preprocess_text(text):
    # Tokenize the text
    words = word_tokenize(text)
    
    # Lowercasing
    words = [word.lower() for word in words]
    
    # Remove punctuation
    words = [word for word in words if word not in string.punctuation]
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Return the cleaned text
    return ' '.join(words)

# Apply the preprocessing to the 'text' column
df['data'] = df['data'].apply(preprocess_text)

# Display the first few rows of the cleaned dataset
df['data'].head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\princ\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    musician tackle u red tape musician group tack...
1    u2s desire number one u2 three prestigious gra...
2    rocker doherty on-stage fight rock singer pete...
3    snicket top u box office chart film adaptation...
4    ocean twelve raid box office ocean twelve crim...
Name: data, dtype: object

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['labels'] = le.fit_transform(df['labels'])
df.head()

Unnamed: 0,data,labels
0,musician tackle u red tape musician group tack...,1
1,u2s desire number one u2 three prestigious gra...,1
2,rocker doherty on-stage fight rock singer pete...,1
3,snicket top u box office chart film adaptation...,1
4,ocean twelve raid box office ocean twelve crim...,1


In [8]:
print("Category to Label mapping:", dict(zip(le.classes_, range(len(le.classes_)))))

Category to Label mapping: {'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


In [9]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['data'])
y = df['labels']
X.shape, y.shape

((2225, 27449), (2225,))

In [10]:
#See the first five values of X
X[0:5].toarray()

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.0533808, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [11]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1780, 27449), (445, 27449), (1780,), (445,))

In [12]:
#model building
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
xgb = XGBClassifier()
lr = LogisticRegression()

In [13]:
xgb.fit(X_train, y_train)

In [14]:
#accuracy
from sklearn.metrics import accuracy_score
y_pred = xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9617977528089887


In [15]:
#predict on unseen data
text = "Tech giant unveils groundbreaking quantum computer, promising unprecedented processing power to revolutionize industries from healthcare to cybersecurity"
text = preprocess_text(text)
vector = tfidf.transform([text])

predicted_labels = xgb.predict(vector)

In [16]:
# Convert predicted labels back to class names
predicted_classes = le.inverse_transform(predicted_labels)
# Display the predicted class names
print("Predicted class names:", predicted_classes)

Predicted class names: ['tech']


In [17]:
#model with naive bayes
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

In [18]:
y_pred1 = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred1)
print("Accuracy:", accuracy)

Accuracy: 0.9685393258426966


In [20]:
#predict on unseen data
text = "After months of negotiations, the government passed a landmark healthcare reform bill aimed at expanding access to affordable medical services. Opposition parties criticized the policy, citing budget concerns, while supporters praised its focus on improving coverage for low-income families."
text = preprocess_text(text)
vector = tfidf.transform([text])

predicted_labels = model.predict(vector)
# Convert predicted labels back to class names
predicted_classes = le.inverse_transform(predicted_labels)
# Display the predicted class names
print("Predicted class names:", predicted_classes)

Predicted class names: ['politics']


In [25]:
#cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=10)
print("Cross-validation scores of Naiye Bayes:", scores.mean())


Cross-validation scores of Naiye Bayes: 0.9707893992647356
