In [1]:
%matplotlib inline

from pathlib import Path

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Corpus location: every *.txt file found anywhere below data/bbc is one
# article, and the name of its parent directory is used as the topic label.
DATA_DIR = Path('data')
path = DATA_DIR / 'bbc'

# sorted() accepts the glob generator directly; sorting fixes the document
# order so everything derived from it is the same on every run.
files = sorted(path.glob('**/*.txt'))

doc_list = []
for file in files:
    topic = file.parts[-2]
    # Split on '\n' only; any trailing '\r' is removed by strip() below.
    article = file.read_text(encoding='latin1').split('\n')
    # The first line is treated as the heading; the remaining lines are
    # stripped and joined with single spaces to form the body.
    heading = article[0].strip()
    body = ' '.join(line.strip() for line in article[1:])
    doc_list.append([topic, heading, body])

# One row per article.
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    2225 non-null   object
 1   heading  2225 non-null   object
 2   body     2225 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [4]:
# Encode the topic names as integer class ids. pd.factorize returns
# (codes, uniques) where uniques[codes[i]] is the original value, so keeping
# `topic_labels` lets later cells map a class id back to its topic name.
y, topic_labels = pd.factorize(docs.topic)
X = docs.body

# Stratify on y so each topic keeps the same share in the train and test
# sets; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

In [5]:
# Learn the vocabulary from the training articles only, then encode both
# splits as document-term count matrices over that same vocabulary.
vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

# Display the shape of each document-term matrix (train first, then test).
tuple(dtm.shape for dtm in (X_train_dtm, X_test_dtm))

((1668, 25951), (557, 25951))

In [6]:
# Fit a multinomial Naive Bayes classifier on the training term counts
# (fit() returns the estimator itself), then predict a class id for every
# test document.
nb = MultinomialNB().fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

In [7]:
# Fraction of test documents whose predicted class id equals the true one.
accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9712746858168761

In [8]:
# Confusion matrix wrapped in a DataFrame for display; by scikit-learn's
# convention rows are true class ids and columns are predicted class ids.
pd.DataFrame(data=confusion_matrix(y_test, y_pred_class))

Unnamed: 0,0,1,2,3,4
0,120,0,6,0,2
1,0,94,2,0,1
2,1,0,103,0,0
3,0,0,1,127,0
4,0,1,2,0,97
