# Phase 1: Text Classification with ML
This notebook focuses on training Naive Bayes and Logistic Regression classifiers using Bag-of-Words and TF-IDF features, and evaluates their performance.
Goal: Build intuition before moving on to neural networks (NNs).

In [3]:
# Data fetching
from sklearn.datasets import fetch_20newsgroups
# Train/test split
from sklearn.model_selection import train_test_split
# Text vectorization
from sklearn.feature_extraction.text import CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Evaluation
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Load every post (subset='all') with headers, footers, and quoted replies
# removed from each document.
dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
news_data, news_target, target_names = (
    dataset.data,          # text documents (news articles)
    dataset.target,        # numeric label for each document
    dataset.target_names,  # category names corresponding to the labels
)
print(f"Number of documents: {len(news_data)}")
print(f"Number of categories: {len(target_names)}")

Number of documents: 18846
Number of categories: 20


In [12]:
# Hold out 20% of the documents for evaluation; the fixed random_state keeps
# the split identical across runs.
X_train, x_test, y_train, y_test = train_test_split(
    news_data,
    news_target,
    test_size=0.2,
    random_state=42,
)
print(f"Training documents: {len(X_train)}")
print(f"Test samples: {len(x_test)}")

Training documents: 15076
Test samples: 3770


In [16]:
# Bag of Words (BoW) vectorization
# The vocabulary is learned from the training documents only; the test
# documents are then encoded with that same fitted vocabulary.
bow_vectorizer = CountVectorizer(stop_words='english')
X_train_bow = bow_vectorizer.fit_transform(X_train)
x_test_bow = bow_vectorizer.transform(x_test)

# Columns of the document-term matrix correspond to vocabulary entries.
print(f"Number of features (BoW): {X_train_bow.shape[1]}")

Number of features (BoW): 111275
