In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# Load dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
X = vectorizer.fit_transform(newsgroups.data)  # Features (18846 samples, 2000 features each)
y = newsgroups.target  # Labels (digits 0 to19)

# Convert sparse to dense for PCA
X_dense = X.toarray()

# Split into training and test sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_dense, y, test_size=0.2, random_state=42, stratify=y)


In [6]:
# --- Feature Selector using RF Importance ---
def select_top_rf_features(X, y, k):
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    importances = rf.feature_importances_
    top_indices = np.argsort(importances)[::-1][:k]
    return top_indices



In [7]:
# --- Evaluate Classifiers ---
def evaluate_classifiers(X_train_k, X_test_k, y_train, y_test):
    models = {
        "Logistic": LogisticRegression(max_iter=2000, random_state=42),
        "Linear SVM": LinearSVC(max_iter=2000, random_state=42),
        "RBF SVM": SVC(kernel='rbf', gamma='scale', random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
    }
    
    results = []
    for name, clf in models.items():
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', clf)
        ])
        pipe.fit(X_train_k, y_train)
        acc = pipe.score(X_test_k, y_test)
        results.append((name, acc))
        print(f"{name} → Accuracy: {acc * 100:.2f}%")
    
    return results

# --- Hyperparameter Tuning for RBF SVM ---
def tune_rbf(X_train_k, y_train):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(kernel='rbf'))
    ])
    param_grid = {
        'clf__C': [0.1, 1, 10],
        'clf__gamma': [0.001, 0.01, 0.1]
    }
    grid = GridSearchCV(pipe, param_grid, cv=3, verbose=1, n_jobs=-1)
    grid.fit(X_train_k, y_train)
    return grid.best_estimator_, grid.best_params_



In [13]:
# --- Find Minimum Features ---
def find_min_features_rf(X_train, X_test, y_train, y_test, threshold=0.625):
    acc_list = []
    best_k = None

    for k in range(60, 201, 10):
        indices = select_top_rf_features(X_train, y_train, k)
        X_train_k = X_train[:, indices]
        X_test_k = X_test[:, indices]

        model = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', SVC(kernel='rbf', C=1, gamma=0.01))  # Reasonable default, can update later
        ])
        model.fit(X_train_k, y_train)
        acc = model.score(X_test_k, y_test)
        acc_list.append((k, acc))
        print(f"top_k = {k} → Test Accuracy = {acc * 100:.2f}%")

        if acc >= threshold:
            best_k = k
            print(f" Minimum features for ≥{threshold * 100}%: top_k = {k}")
            break

    return acc_list, best_k



In [14]:
# --- Plot Accuracy vs Feature Count ---
def plot_accuracy_curve(acc_list, best_k, threshold):
    top_ks, accuracies = zip(*acc_list)
    plt.figure(figsize=(10, 6))
    plt.plot(top_ks, np.array(accuracies) * 100, marker='o')
    plt.axhline(threshold * 100, color='r', linestyle='--', label=f'{threshold * 100}% Threshold')
    if best_k:
        plt.axvline(best_k, color='g', linestyle='--', label=f'Min top_k = {best_k}')
    plt.title("Test Accuracy vs. Number of Top RF Features")
    plt.xlabel("Number of Top Features")
    plt.ylabel("Test Accuracy (%)")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:

# --- Run Everything ---
def run_text_pipeline():
    print("\n Finding minimum features...")
    acc_list, best_k = find_min_features_rf(X_train, X_test, y_train, y_test, threshold=0.625)

    print("\n Final Evaluation with best_k =", best_k)
    final_indices = select_top_rf_features(X_train, y_train, best_k)
    X_train_final = X_train[:, final_indices]
    X_test_final = X_test[:, final_indices]

    print("\n Evaluating all classifiers:")
    results = evaluate_classifiers(X_train_final, X_test_final, y_train, y_test)

    print("\n Tuning RBF SVM:")
    best_model, best_params = tune_rbf(X_train_final, y_train)
    acc = best_model.score(X_test_final, y_test)
    print(f"\n Final RBF Accuracy with tuned params: {acc * 100:.2f}%")
    print("Best Parameters:", best_params)

    plot_accuracy_curve(acc_list, best_k, threshold=0.625)

# Run the pipeline
run_text_pipeline()



🔍 Finding minimum features...
top_k = 60 → Test Accuracy = 40.64%
top_k = 70 → Test Accuracy = 41.46%
top_k = 80 → Test Accuracy = 42.20%
top_k = 90 → Test Accuracy = 43.45%
top_k = 100 → Test Accuracy = 44.75%
top_k = 110 → Test Accuracy = 46.53%
top_k = 120 → Test Accuracy = 47.37%
top_k = 130 → Test Accuracy = 48.59%
top_k = 140 → Test Accuracy = 48.86%
top_k = 150 → Test Accuracy = 49.66%
top_k = 160 → Test Accuracy = 48.99%
top_k = 170 → Test Accuracy = 49.39%
top_k = 180 → Test Accuracy = 49.31%
top_k = 190 → Test Accuracy = 49.92%
top_k = 200 → Test Accuracy = 49.92%

🎯 Final Evaluation with best_k = None

📈 Evaluating all classifiers:
Logistic → Accuracy: 54.88%
