In [58]:
import pandas as pd
df = pd.read_csv('bbc-news-data.csv',sep="\t")
df.head()

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [59]:
df.shape

(2225, 4)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   filename  2225 non-null   object
 2   title     2225 non-null   object
 3   content   2225 non-null   object
dtypes: object(4)
memory usage: 69.7+ KB


In [61]:
df["category"].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [62]:
df.isnull().sum()

category    0
filename    0
title       0
content     0
dtype: int64

In [63]:
df['title'].head()

0    Ad sales boost Time Warner profit
1     Dollar gains on Greenspan speech
2    Yukos unit buyer faces loan claim
3    High fuel prices hit BA's profits
4    Pernod takeover talk lifts Domecq
Name: title, dtype: object

In [64]:
df['content'].head()

0     Quarterly profits at US media giant TimeWarne...
1     The dollar has hit its highest level against ...
2     The owners of embattled Russian oil giant Yuk...
3     British Airways has blamed high fuel prices f...
4     Shares in UK drinks and food firm Allied Dome...
Name: content, dtype: object

In [65]:
df['content'][0]

' Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers for high-

In [66]:
df = df[['category' , 'content']]

In [67]:
df.head()

Unnamed: 0,category,content
0,business,Quarterly profits at US media giant TimeWarne...
1,business,The dollar has hit its highest level against ...
2,business,The owners of embattled Russian oil giant Yuk...
3,business,British Airways has blamed high fuel prices f...
4,business,Shares in UK drinks and food firm Allied Dome...


In [68]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [69]:
stemmer = PorterStemmer()

In [70]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [71]:
corpus = []
for i in range(0,len(df)):
    text = re.sub("[^a-zA-Z]"," ", df['content'][i])
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word,pos="v") for word in text if word not in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)

In [72]:
corpus

['quarterly profit us media giant timewarner jump bn three months december year earlier firm one biggest investors google benefit sales high speed internet connections higher advert sales timewarner say fourth quarter sales rise bn bn profit buoy one gain offset profit dip warner bros less users aol time warner say friday own search engine google internet business aol mix fortunes lose subscribers fourth quarter profit lower precede three quarter however company say aol underlie profit exceptional items rise back stronger internet advertise revenues hop increase subscribers offer online service free timewarner internet customers try sign aol exist customers high speed broadband timewarner also restate result follow probe us securities exchange commission sec close conclude time warner fourth quarter profit slightly better analysts expectations film division saw profit slump help box office flop alexander catwoman sharp contrast year earlier third final film lord ring trilogy boost resu

In [73]:
df['category']

0       business
1       business
2       business
3       business
4       business
          ...   
2220        tech
2221        tech
2222        tech
2223        tech
2224        tech
Name: category, Length: 2225, dtype: object

In [74]:
# Encode the labels into numerical format
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['category']= label_encoder.fit_transform(df['category'])

In [75]:
df['category'].value_counts()

3    511
0    510
2    417
4    401
1    386
Name: category, dtype: int64

In [76]:
#mapping label to the category
category_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(category_mapping)

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


In [77]:
y = df['category']

In [78]:
#train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)

In [42]:
#Create Bag of Models
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,1))

In [44]:
X_train = cv.fit_transform(X_train).toarray()
X_test = cv.transform(X_test).toarray()

In [47]:
cv.get_feature_names_out().shape

(20493,)

In [48]:
X_train.shape, y_train.shape

((1780, 20493), (1780,))

In [51]:
len(cv.vocabulary_)

20493

In [53]:
from sklearn.naive_bayes import MultinomialNB
bbc_news_classifier = MultinomialNB()

In [54]:
bbc_news_classifier.fit(X_train,y_train)

MultinomialNB()

In [55]:
y_pred = bbc_news_classifier.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.9707865168539326
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       113
           1       0.94      0.99      0.96        69
           2       0.97      0.94      0.95        79
           3       0.99      1.00      1.00       101
           4       0.99      0.95      0.97        83

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445



In [57]:
#Using TF-IDF Model 

In [80]:
#Crearting TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(ngram_range=(1,1))
X_train = tv.fit_transform(X_train).toarray()
X_test = tv.transform(X_test).toarray()

In [84]:
from sklearn.naive_bayes import MultinomialNB
bbc_news_classifier = MultinomialNB().fit(X_train,y_train)

In [85]:
y_pred = bbc_news_classifier.predict(X_test)

In [86]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print(accuracy_score(y_pred,y_test))
print(classification_report(y_pred,y_test))

0.9595505617977528
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       113
           1       0.90      0.98      0.94        66
           2       0.97      0.90      0.94        82
           3       0.99      0.99      0.99       102
           4       0.97      0.95      0.96        82

    accuracy                           0.96       445
   macro avg       0.96      0.96      0.96       445
weighted avg       0.96      0.96      0.96       445

