# Load 20 Newsgroups Dataset
This cell fetches the training and test subsets of the 20 Newsgroups dataset for text classification.

In [144]:
from sklearn.datasets import fetch_20newsgroups

# Pull both official splits of the corpus in one pass: "train" first, then "test"
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name) for split_name in ("train", "test")
)


# Create or Load Preprocessor
The preprocessor transforms raw text into TF-IDF features and applies PCA for dimensionality reduction.
You can either create a new preprocessor (first cell below; the creation line is commented out by default) or load an existing one (second cell below).

In [145]:
from text_helper import Preprocessor

# Optional: uncomment the line below to build a new preprocessor from the
# training texts instead of loading a saved one.
# NOTE(review): the saved output under this cell mentions "500 components",
# which matches n_components_pca=500, so it was presumably produced with this
# line uncommented — confirm before relying on that log.
# preprocessor = Preprocessor(newsgroups_train.data, force_create=True, n_components_pca=500)

Start creating preprocessor. It would be saved to models\Preprocessor_pretrained.pkl

Start creating TfIdf tokenizer.

Tokenizer already created. Loaded from models\TfIdf_pretrained.pkl
Start creating pca transformer with 500 components.

PCA transformer already exists. Loaded from models\pca_transformer.pkl


In [141]:
from text_helper import load_preprocessor

# Load the pretrained preprocessor.
# NOTE(review): the location it is read from is decided inside
# text_helper.load_preprocessor and is not visible in this notebook — confirm.
preprocessor = load_preprocessor()

# Fail fast in this cell rather than in a later one that calls
# preprocessor.preprocess(...)
assert preprocessor is not None, "Failed to load preprocessor"

# Preprocess Data
Transform the training and test data using the preprocessor.

In [103]:
# Run the raw documents of each split through the same preprocessor,
# training split first, then the test split
x_train, x_test = (
    preprocessor.preprocess(dataset.data)
    for dataset in (newsgroups_train, newsgroups_test)
)

# Train CNN Model
Train a convolutional neural network (CNN) model on the preprocessed data.

In [134]:
from dl_functions import train

# Fit the CNN variant for 10 epochs.
# Positional arguments, in order: train features, test features,
# train labels, test labels.
# Note: the logistic-regression cell further down also assigns to `model`,
# so keep a separate reference if the CNN result is needed afterwards.
model = train(
    x_train,
    x_test,
    newsgroups_train.target,
    newsgroups_test.target,
    model_type="CNN",
    epoch=10,
)

# Train Logistic Regression Model
Train a logistic regression model on the preprocessed data.

In [137]:
from dl_functions import train

# Fit the logistic-regression variant on the same features and labels.
# Note: this rebinds `model`, replacing whatever an earlier cell stored in it.
model = train(
    x_train,
    x_test,
    newsgroups_train.target,
    newsgroups_test.target,
    model_type="LogReg",
)

# Predict on a Single Sample
Use the trained model to predict the category of a test sample.

In [140]:
from dl_functions import predict

# Preprocess a single test sample and predict its category
# NOTE(review): this passes a single element (data[0]), whereas the earlier
# preprocess calls pass the full `.data` collections — confirm that
# Preprocessor.preprocess accepts one bare document.
sample = preprocessor.preprocess(newsgroups_test.data[0])
# NOTE(review): model_type is "Linear" here, but the training cells use
# "CNN" and "LogReg" — confirm which trained model this selects.
prediction = predict(sample, model_type="Linear")

print(f"Predicted class: {prediction}")

# Run Flask Server
Start the Flask server to handle PDF uploads and predict their categories.

In [143]:
# `!` runs the command in a foreground subshell, so this cell stays busy until
# server.py exits; interrupt the kernel to stop it.
# NOTE(review): `python` is resolved via the shell's PATH, which may not be the
# interpreter the kernel is running — confirm the environment matches.
!python server.py