In [4]:
%pip install pandas scikit-learn

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.5.2-cp311-cp311-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.1.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.1-cp311-cp311-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached pandas-2.2.3-cp311-cp311-macosx_11_0_arm64.whl (11.3 MB)
Using cached scikit_learn-1.5.2-cp311-cp311-m

In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [8]:
# Path to the dataset
data_path = "/Users/devambani/Projects/data-science-pipeline/NewsSnap_quick_digestable_news_summaries/bbc-dataset"

data = []
labels = []

# Load data from each folder
for label in os.listdir(data_path):
    folder_path = os.path.join(data_path, label)
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                data.append(text)
                labels.append(label)

# Create a DataFrame
df = pd.DataFrame({'text': data, 'label': labels})

In [9]:
# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

In [10]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit the vectorizer on the training data and transform both train and test sets
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [11]:
# Encode labels as numeric values
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

In [12]:
# Train a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [13]:
# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate and print accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print detailed classification report
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Test Accuracy: 97.53%
               precision    recall  f1-score   support

     business       0.97      0.98      0.98       103
entertainment       1.00      0.96      0.98        84
     politics       0.93      0.99      0.96        80
        sport       0.99      0.99      0.99        98
         tech       0.99      0.95      0.97        80

     accuracy                           0.98       445
    macro avg       0.98      0.97      0.97       445
 weighted avg       0.98      0.98      0.98       445



In [14]:
# Test the model with a single example
example_text = "The stock market saw a significant decline today due to fears of an economic slowdown."
example_vector = vectorizer.transform([example_text])  # Transform the example text

# Predict the label
predicted_label_index = nb_classifier.predict(example_vector)[0]
predicted_label = label_encoder.inverse_transform([predicted_label_index])[0]

print(f"Example Text: {example_text}")
print(f"Predicted Label: {predicted_label}")

Example Text: The stock market saw a significant decline today due to fears of an economic slowdown.
Predicted Label: business
