# News Classification Project
# Author: Thabang Mmakola

*Packages*

In [129]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

*Loading the dataset*

In [131]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print(train_df.head())

Train shape: (5520, 5)
Test shape: (2000, 5)
                                           headlines  \
0  RBI revises definition of politically-exposed ...   
1  NDTV Q2 net profit falls 57.4% to Rs 5.55 cror...   
2  Akasa Air ‘well capitalised’, can grow much fa...   
3  India’s current account deficit declines sharp...   
4  States borrowing cost soars to 7.68%, highest ...   

                                         description  \
0  The central bank has also asked chairpersons a...   
1  NDTV's consolidated revenue from operations wa...   
2  The initial share sale will be open for public...   
3  The current account deficit (CAD) was 3.8 per ...   
4  The prices shot up reflecting the overall high...   

                                             content  \
0  The Reserve Bank of India (RBI) has changed th...   
1  Broadcaster New Delhi Television Ltd on Monday...   
2  Homegrown server maker Netweb Technologies Ind...   
3  India’s current account deficit declined sharp...   
4

In [132]:
# Cleaning function
def clean_text(text):
    if pd.isnull(text):
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove URLs and keep only domain (e.g., bbc.com)
    text = re.sub(r'http\S+', '', text)
    
    # Remove emails
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove website mentions (like “www.cnn.com”)
    text = re.sub(r'www\.\S+', '', text)
    
    # Remove numbers and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [133]:
# Apply cleaning
for col in ['Headlines', 'Description', 'Content']:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype(str).apply(clean_text)
    if col in test_df.columns:
        test_df[col] = test_df[col].astype(str).apply(clean_text)

In [134]:
# Combine text columns into one for better representation
train_df['text'] = train_df['headlines'] + ' ' + train_df['description'] + ' ' + train_df['content']
test_df['text'] = test_df['headlines'] + ' ' + test_df['description'] + ' ' + test_df['content']

X = train_df['text']
y = train_df['category']

In [146]:
# Train/validation split
X = train_df["content"]
y = train_df["category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [164]:
# TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

# Use the SAME vectorizer for val & test
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [166]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": LinearSVC()
}


In [168]:
# Models and parameter grids
param_grids = {
    "Logistic Regression": {
        "model": LogisticRegression(max_iter=300),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["liblinear", "saga"]
        }
    },
    "Naive Bayes": {
        "model": MultinomialNB(),
        "params": {
            "alpha": [0.1, 0.5, 1.0]
        }
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        }
    },
    "SVM": {
        "model": LinearSVC(),
        "params": {
            "C": [0.1, 1, 10]
        }
    }
}

In [170]:
#  Train & evaluate
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_val_tfidf)
    acc = accuracy_score(y_val, preds)
    print(f"{name} Accuracy: {acc:.4f}")
    results[name] = acc
    print(classification_report(y_val, preds))


Training Logistic Regression...
Logistic Regression Accuracy: 0.9918
               precision    recall  f1-score   support

     business       0.98      0.99      0.99       245
    education       1.00      0.99      1.00       274
entertainment       1.00      1.00      1.00       178
       sports       0.99      0.99      0.99       137
   technology       0.99      0.99      0.99       270

     accuracy                           0.99      1104
    macro avg       0.99      0.99      0.99      1104
 weighted avg       0.99      0.99      0.99      1104


Training Naive Bayes...
Naive Bayes Accuracy: 0.9864
               precision    recall  f1-score   support

     business       0.99      0.97      0.98       245
    education       0.99      1.00      0.99       274
entertainment       0.98      1.00      0.99       178
       sports       0.99      0.99      0.99       137
   technology       0.98      0.98      0.98       270

     accuracy                           0.99  

In [172]:
# Train and tune each model
results = {}
best_models = {}

for name, config in param_grids.items():
    print(f"\n🔍 Tuning {name}...")
    grid = GridSearchCV(config["model"], config["params"], cv=3, n_jobs=-1, scoring="accuracy")
    grid.fit(X_train_tfidf, y_train)

    best_model = grid.best_estimator_
    best_params = grid.best_params_

    
    best_model = grid.best_estimator_
    preds = best_model.predict(X_val_tfidf)
    acc = accuracy_score(y_val, preds)
    
    results[name] = acc
    best_models[name] = best_model
    
    print(f"✅ Best Params for {name}: {grid.best_params_}")
    print(f"✅ Validation Accuracy: {acc:.4f}")
    print(classification_report(y_val, preds))



🔍 Tuning Logistic Regression...
✅ Best Params for Logistic Regression: {'C': 10, 'solver': 'saga'}
✅ Validation Accuracy: 0.9982
               precision    recall  f1-score   support

     business       1.00      1.00      1.00       245
    education       1.00      1.00      1.00       274
entertainment       1.00      1.00      1.00       178
       sports       1.00      1.00      1.00       137
   technology       1.00      1.00      1.00       270

     accuracy                           1.00      1104
    macro avg       1.00      1.00      1.00      1104
 weighted avg       1.00      1.00      1.00      1104


🔍 Tuning Naive Bayes...
✅ Best Params for Naive Bayes: {'alpha': 0.1}
✅ Validation Accuracy: 0.9900
               precision    recall  f1-score   support

     business       0.99      0.98      0.98       245
    education       1.00      1.00      1.00       274
entertainment       0.99      1.00      0.99       178
       sports       0.99      1.00      0.99      

In [173]:
# Pick the best-performing model
best_model_name = max(results, key=results.get)
final_model = best_models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name} (Accuracy = {results[best_model_name]:.4f})")


🏆 Best Model: Logistic Regression (Accuracy = 0.9982)


In [176]:
# Save vectorizer and model
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(final_model, f"{best_model_name.replace(' ', '_')}_best_model.pkl")

['Logistic_Regression_best_model.pkl']

In [178]:
# Predict on test set (optional)
test_tfidf = vectorizer.transform(test_df["text"])
test_preds = final_model.predict(test_tfidf)
test_df["Predicted_Category"] = test_preds
test_df.to_csv("test_predictions_tuned.csv", index=False)
print("\n💾 Predictions saved to test_predictions_tuned.csv")


💾 Predictions saved to test_predictions_tuned.csv


In [180]:
!pip install mlflow streamlit scikit-learn pandas numpy



In [182]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score

# Start an MLflow experiment
mlflow.set_experiment("text_classification_experiment")

with mlflow.start_run(run_name="RandomForest_tuned"):
    # Log parameters
    mlflow.log_params(best_params)

    # Train and evaluate
    y_pred = best_model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, y_pred)

    # Log metric
    mlflow.log_metric("accuracy", acc)

    # Log model
    mlflow.sklearn.log_model(best_model, "model")

    print(f"✅ Model logged with accuracy: {acc:.4f}")



✅ Model logged with accuracy: 0.9810


In [194]:
import joblib

joblib.dump(best_model, "best_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']