# BBC News Classification
##### Dataset: BBC text categorization
##### Dataset link: https://www.kaggle.com/code/yufengdev/bbc-text-categorization/data

- Implemented Feature Extraction using:
    - Count Vectorizer
    - TF-IDF Vectorizer
- ML Models Implemented:
    - Naive Bayes
    - Logistic Regression
    - SVM
    - Random Forest

In [1]:
# Importing required libraries.
import pandas as pd
import numpy as np
import re

In [17]:
news_df=pd.read_csv("bbc-text.csv")

In [18]:
news_df.columns

Index(['category', 'text'], dtype='object')

In [19]:
news_df.shape

(2225, 2)

In [20]:
news_df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [21]:
news_df.sample(5)

Unnamed: 0,category,text
126,tech,ibm frees 500 software patents computer giant ...
1472,entertainment,early elvis recordings go on sale some of elvi...
1551,tech,moving mobile improves golf swing a mobile pho...
1373,business,uk economy facing major risks the uk manufac...
858,entertainment,british stars denied major oscars british hope...


## Feature Extraction
Removing Stopwords
1. Adding custom stopwords to the existing ones

In [22]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))

In [23]:
# Creating list of custom stopwords
new_stopwords=["some","one","like","time","movie","film","good","even","get","would","make","really","see","will","much","great","first","people","also","way","know","watch","look","many","said","say","new","take"]

stop_words=stop_words.union(new_stopwords)

### Creating corpus 

In [24]:
from nltk.stem.wordnet import WordNetLemmatizer

In [26]:
corpus=[]
for i in range(0,news_df.shape[0]):
    text=re.sub('[^a-zA-Z]',' ',news_df['text'][i])
    text=text.lower()
    text=text.split()
    lm=WordNetLemmatizer()
    text=[lm.lemmatize(word) for word in text if word not in stop_words]
    text=" ".join(text)
    corpus.append(text)

#### Creating Bag of Words
- Feature extraction using Count Vectorizer.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=4000, ngram_range=(1,3))

X=cv.fit_transform(corpus).toarray()
y=news_df[['category']]

In [32]:
X.shape,y.shape

((2225, 4000), (2225, 1))

In [33]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3,random_state=25)

In [34]:
X_train.shape,y_train.shape

((1557, 4000), (1557, 1))

In [35]:
X_test.shape,y_test.shape

((668, 4000), (668, 1))

### Naive Bayes Classifier

In [36]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [37]:
y_pred=model.predict(X_test)
y_pred[:5]

array(['business', 'business', 'entertainment', 'entertainment',
       'business'], dtype='<U13')

In [41]:
from sklearn.metrics import confusion_matrix,accuracy_score

In [39]:
cm=confusion_matrix(y_test,y_pred)

In [40]:
cm

array([[132,   4,   5,   0,  12],
       [  1, 102,   3,   0,   4],
       [  4,   9, 115,   0,   2],
       [  2,   1,   0, 149,   1],
       [  1,   0,   1,   0, 120]], dtype=int64)

In [42]:
accuracy_score(y_test,y_pred)

0.9251497005988024

#### Logistic Regression

In [44]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression(random_state=0)
log_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [46]:
y_predlog=log_model.predict(X_test)
cm_log=confusion_matrix(y_test,y_predlog)
cm_log

array([[146,   3,   2,   0,   2],
       [  1, 105,   2,   1,   1],
       [  2,   1, 125,   1,   1],
       [  1,   0,   0, 152,   0],
       [  1,   2,   1,   0, 118]], dtype=int64)

In [47]:
accuracy_score(y_test,y_predlog)

0.9670658682634731

#### SVM

In [48]:
from sklearn.svm import SVC
model_svm=SVC(kernel='linear')
model_svm.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [49]:
y_predsvm=model_svm.predict(X_test)

In [50]:
cm_log=confusion_matrix(y_test,y_predsvm)
cm_log

array([[146,   3,   2,   0,   2],
       [  1, 105,   2,   1,   1],
       [  3,   2, 122,   2,   1],
       [  1,   0,   0, 152,   0],
       [  2,   2,   1,   0, 117]], dtype=int64)

In [51]:
accuracy_score(y_test,y_predsvm)

0.9610778443113772

#### Random Forest

In [52]:
from sklearn.ensemble import RandomForestClassifier
model_rfc=RandomForestClassifier(n_estimators=500, random_state=123, verbose=4)
model_rfc.fit(X_train,y_train)

  model_rfc.fit(X_train,y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

building tree 335 of 500
building tree 336 of 500
building tree 337 of 500
building tree 338 of 500
building tree 339 of 500
building tree 340 of 500
building tree 341 of 500
building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   13.2s finished


In [53]:
y_predrfc=model_rfc.predict(X_test)
cm=confusion_matrix(y_test,y_predrfc)
cm

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished


array([[148,   0,   2,   0,   3],
       [  1, 101,   5,   3,   0],
       [  3,   0, 122,   1,   4],
       [  1,   0,   1, 151,   0],
       [  1,   0,   1,   2, 118]], dtype=int64)

In [54]:
accuracy_score(y_test,y_predrfc)

0.9580838323353293

#### Creating Bag of Words
- Feature extraction using Tfidf Vectorizer..

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec= TfidfVectorizer(max_features=4500, ngram_range=(1,3))
X= tfidf_vec.fit_transform(corpus).toarray()
y=news_df[['category']]

In [57]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.3,random_state=25)

In [61]:
from sklearn.naive_bayes import GaussianNB
model=GaussianNB()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy_score(y_test,y_pred)

  y = column_or_1d(y, warn=True)


0.9161676646706587

In [60]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression(random_state=0)
log_model.fit(X_train,y_train)
y_predlog=log_model.predict(X_test)
accuracy_score(y_test,y_predlog)

  y = column_or_1d(y, warn=True)


0.9715568862275449

In [62]:
from sklearn.svm import SVC
model_svm=SVC(kernel='linear')
model_svm.fit(X_train,y_train)
y_predsvm=model_svm.predict(X_test)
accuracy_score(y_test,y_predsvm)

  y = column_or_1d(y, warn=True)


0.9745508982035929

In [63]:
from sklearn.ensemble import RandomForestClassifier
model_rfc=RandomForestClassifier(n_estimators=500, random_state=123, verbose=4)
model_rfc.fit(X_train,y_train)
y_predrfc=model_rfc.predict(X_test)

  model_rfc.fit(X_train,y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s


building tree 1 of 500
building tree 2 of 500
building tree 3 of 500
building tree 4 of 500
building tree 5 of 500
building tree 6 of 500
building tree 7 of 500
building tree 8 of 500
building tree 9 of 500
building tree 10 of 500
building tree 11 of 500
building tree 12 of 500
building tree 13 of 500
building tree 14 of 500
building tree 15 of 500
building tree 16 of 500
building tree 17 of 500
building tree 18 of 500
building tree 19 of 500
building tree 20 of 500
building tree 21 of 500
building tree 22 of 500
building tree 23 of 500
building tree 24 of 500
building tree 25 of 500
building tree 26 of 500
building tree 27 of 500
building tree 28 of 500
building tree 29 of 500
building tree 30 of 500
building tree 31 of 500
building tree 32 of 500
building tree 33 of 500
building tree 34 of 500
building tree 35 of 500
building tree 36 of 500
building tree 37 of 500
building tree 38 of 500
building tree 39 of 500
building tree 40 of 500
building tree 41 of 500
building tree 42 of 500
b

building tree 340 of 500
building tree 341 of 500
building tree 342 of 500
building tree 343 of 500
building tree 344 of 500
building tree 345 of 500
building tree 346 of 500
building tree 347 of 500
building tree 348 of 500
building tree 349 of 500
building tree 350 of 500
building tree 351 of 500
building tree 352 of 500
building tree 353 of 500
building tree 354 of 500
building tree 355 of 500
building tree 356 of 500
building tree 357 of 500
building tree 358 of 500
building tree 359 of 500
building tree 360 of 500
building tree 361 of 500
building tree 362 of 500
building tree 363 of 500
building tree 364 of 500
building tree 365 of 500
building tree 366 of 500
building tree 367 of 500
building tree 368 of 500
building tree 369 of 500
building tree 370 of 500
building tree 371 of 500
building tree 372 of 500
building tree 373 of 500
building tree 374 of 500
building tree 375 of 500
building tree 376 of 500
building tree 377 of 500
building tree 378 of 500
building tree 379 of 500


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   15.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    0.2s finished


0.9550898203592815

In [64]:
accuracy_score(y_test,y_predrfc)

0.9550898203592815