# Import Libraries and Load Data
Import required libraries and load a subset of 20 Newsgroups data to reduce memory usage. Use categories parameter to limit categories.

In [1]:
# Import required libraries
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load a subset of 20 Newsgroups data to reduce memory usage
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

# Configure Feature Extractors
Set up CountVectorizer with max_features limit, Word2Vec with smaller vector size, and Doc2Vec with reduced dimensions. Include memory-efficient parameters.

In [None]:
# Configure Feature Extractors

# Set up CountVectorizer with max_features limit
count_vectorizer = CountVectorizer(max_features=1000)

# Set up Word2Vec with smaller vector size
from gensim.models import Word2Vec
word2vec_model = Word2Vec(sentences=[text.split() for text in newsgroups_train.data], vector_size=50, window=5, min_count=2, workers=4)

# Set up Doc2Vec with reduced dimensions
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate(newsgroups_train.data)]
doc2vec_model = Doc2Vec(documents, vector_size=50, window=5, min_count=2, workers=4)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# Optimize Text Preprocessing
Implement efficient text preprocessing pipeline using stop words removal and limiting vocabulary size. Use min_df to remove rare terms.

In [None]:
# Optimize Text Preprocessing

# Import necessary libraries
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Configure CountVectorizer with stop words removal and min_df to remove rare terms
count_vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)

# Configure TfidfVectorizer with stop words removal and min_df to remove rare terms
tfidf_vectorizer = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)

# Fit and transform the training data using CountVectorizer
X_train_count = count_vectorizer.fit_transform(newsgroups_train.data)
X_test_count = count_vectorizer.transform(newsgroups_test.data)

# Fit and transform the training data using TfidfVectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(newsgroups_train.data)
X_test_tfidf = tfidf_vectorizer.transform(newsgroups_test.data)

# Display the shape of the transformed data
print("CountVectorizer - Training data shape:", X_train_count.shape)
print("CountVectorizer - Test data shape:", X_test_count.shape)
print("TfidfVectorizer - Training data shape:", X_train_tfidf.shape)
print("TfidfVectorizer - Test data shape:", X_test_tfidf.shape)

# Create Efficient Model Pipeline
Initialize models (MultinomialNB, LogisticRegression, LinearSVC, DecisionTreeClassifier) with optimized parameters. Use smaller subset for initial testing.

In [None]:
# Create Efficient Model Pipeline

# Initialize models with optimized parameters
models = {
    'Multinomial Naive Bayes': MultinomialNB(alpha=0.1),
    'Logistic Regression': LogisticRegression(max_iter=100, solver='liblinear'),
    'Support Vector Machine': SVC(kernel='linear', C=1),
    'Decision Tree': DecisionTreeClassifier(max_depth=10)
}

# Function to evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Evaluate models using CountVectorizer features
results_count = {}
for name, model in models.items():
    accuracy = evaluate_model(model, X_train_count, X_test_count, newsgroups_train.target, newsgroups_test.target)
    results_count[name] = accuracy

# Evaluate models using TfidfVectorizer features
results_tfidf = {}
for name, model in models.items():
    accuracy = evaluate_model(model, X_train_tfidf, X_test_tfidf, newsgroups_train.target, newsgroups_test.target)
    results_tfidf[name] = accuracy

# Display results
print("Results using CountVectorizer features:")
for name, accuracy in results_count.items():
    print(f"{name}: {accuracy:.4f}")

print("\nResults using TfidfVectorizer features:")
for name, accuracy in results_tfidf.items():
    print(f"{name}: {accuracy:.4f}")

# Run Benchmark Comparison
Execute benchmark using scikit-learn's Pipeline for memory efficiency. Implement early stopping where possible.

In [None]:
# Run Benchmark Comparison

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Split the data to reduce memory usage
X_train_small, _, y_train_small, _ = train_test_split(newsgroups_train.data, newsgroups_train.target, train_size=0.1, random_state=42)
X_test_small, _, y_test_small, _ = train_test_split(newsgroups_test.data, newsgroups_test.target, train_size=0.1, random_state=42)

# Define pipelines for each model and feature extractor
pipelines = {
    'Multinomial Naive Bayes with CountVectorizer': Pipeline([
        ('vect', CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', MultinomialNB(alpha=0.1))
    ]),
    'Logistic Regression with CountVectorizer': Pipeline([
        ('vect', CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', LogisticRegression(max_iter=100, solver='liblinear'))
    ]),
    'Support Vector Machine with CountVectorizer': Pipeline([
        ('vect', CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', SVC(kernel='linear', C=1))
    ]),
    'Decision Tree with CountVectorizer': Pipeline([
        ('vect', CountVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', DecisionTreeClassifier(max_depth=10))
    ]),
    'Multinomial Naive Bayes with TfidfVectorizer': Pipeline([
        ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', MultinomialNB(alpha=0.1))
    ]),
    'Logistic Regression with TfidfVectorizer': Pipeline([
        ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', LogisticRegression(max_iter=100, solver='liblinear'))
    ]),
    'Support Vector Machine with TfidfVectorizer': Pipeline([
        ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', SVC(kernel='linear', C=1))
    ]),
    'Decision Tree with TfidfVectorizer': Pipeline([
        ('vect', TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, max_features=1000, min_df=5)),
        ('clf', DecisionTreeClassifier(max_depth=10))
    ])
}

# Evaluate pipelines
results = {}
for name, pipeline in pipelines.items():
    pipeline.fit(X_train_small, y_train_small)
    y_pred = pipeline.predict(X_test_small)
    accuracy = accuracy_score(y_test_small, y_pred)
    results[name] = accuracy

# Display results
print("Benchmark Results:")
for name, accuracy in results.items():
    print(f"{name}: {accuracy:.4f}")

# Visualize Results
Create performance comparison plots using lightweight plotting libraries. Include execution time and memory usage metrics.

In [None]:
# Visualize Results

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Convert results to DataFrame for easier plotting
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'Accuracy'])

# Plot accuracy comparison
plt.figure(figsize=(12, 6))
sns.barplot(x='Accuracy', y='Model', data=results_df, palette='viridis')
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
plt.ylabel('Model')
plt.show()

# Assuming execution time and memory usage metrics are collected in a dictionary
# Example: metrics = {'Model': ['Model1', 'Model2'], 'Execution Time': [0.5, 0.7], 'Memory Usage': [100, 150]}
# Convert metrics to DataFrame for easier plotting
metrics = {
    'Model': list(results.keys()),
    'Execution Time': [0.5, 0.7, 0.6, 0.8, 0.55, 0.75, 0.65, 0.85],  # Example values
    'Memory Usage': [100, 150, 120, 160, 110, 140, 130, 170]  # Example values
}
metrics_df = pd.DataFrame(metrics)

# Plot execution time comparison
plt.figure(figsize=(12, 6))
sns.barplot(x='Execution Time', y='Model', data=metrics_df, palette='magma')
plt.title('Model Execution Time Comparison')
plt.xlabel('Execution Time (seconds)')
plt.ylabel('Model')
plt.show()

# Plot memory usage comparison
plt.figure(figsize=(12, 6))
sns.barplot(x='Memory Usage', y='Model', data=metrics_df, palette='coolwarm')
plt.title('Model Memory Usage Comparison')
plt.xlabel('Memory Usage (MB)')
plt.ylabel('Model')
plt.show()

In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from time import time
import psutil
import os

# Load smaller subset of data
categories = ['alt.atheism', 'comp.graphics']  # Reduced categories
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=('headers', 'footers', 'quotes'))

# Optimized vectorizers
count_vectorizer = CountVectorizer(
    max_features=500,  # Reduced features
    stop_words='english',
    min_df=0.01,
    max_df=0.9,
    strip_accents='unicode',
    lowercase=True
)

tfidf_vectorizer = TfidfVectorizer(
    max_features=500,  # Reduced features
    stop_words='english',
    min_df=0.01,
    max_df=0.9,
    strip_accents='unicode',
    lowercase=True
)

# Transform data
X_train_count = count_vectorizer.fit_transform(newsgroups_train.data)
X_test_count = count_vectorizer.transform(newsgroups_test.data)
X_train_tfidf = tfidf_vectorizer.fit_transform(newsgroups_train.data)
X_test_tfidf = tfidf_vectorizer.transform(newsgroups_test.data)

# Fast classifiers
classifiers = {
    'MultinomialNB': MultinomialNB(alpha=1.0),
    'LogisticRegression': LogisticRegression(max_iter=100, solver='liblinear', tol=0.01),
    'LinearSVC': LinearSVC(max_iter=100, tol=0.01),
    'DecisionTree': DecisionTreeClassifier(max_depth=10)
}

results = []

# Benchmark with performance tracking
for feature_name, (X_train, X_test) in {
    'CountVectorizer': (X_train_count, X_test_count),
    'TfidfVectorizer': (X_train_tfidf, X_test_tfidf)
}.items():
    for clf_name, clf in classifiers.items():
        start_time = time()
        start_memory = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024
        
        clf.fit(X_train, newsgroups_train.target)
        y_pred = clf.predict(X_test)
        
        accuracy = accuracy_score(newsgroups_test.target, y_pred)
        exec_time = time() - start_time
        memory_used = psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 - start_memory
        
        results.append({
            'Model': f'{clf_name} with {feature_name}',
            'Accuracy': accuracy,
            'Time (s)': exec_time,
            'Memory (MB)': memory_used
        })



print("\nPerformance Summary:")
print(results_df[['Model', 'Accuracy', 'Time (s)', 'Memory (MB)']])


Performance Summary:


NameError: name 'results_df' is not defined

In [6]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from time import time

# Load minimal dataset
categories = ['alt.atheism', 'comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, 
                                     remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, 
                                    remove=('headers', 'footers', 'quotes'))

# Configure vectorizers
vectorizers = {
    'Count': CountVectorizer(max_features=500, stop_words='english'),
    'TF-IDF': TfidfVectorizer(max_features=500, stop_words='english')
}

# Configure classifiers
classifiers = {
    'NB': MultinomialNB(),
    'LR': LogisticRegression(max_iter=100, solver='liblinear')
}

# Benchmark results
results = []

for vec_name, vectorizer in vectorizers.items():
    # Transform data
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)
    
    for clf_name, clf in classifiers.items():
        start_time = time()
        clf.fit(X_train, newsgroups_train.target)
        y_pred = clf.predict(X_test)
        train_time = time() - start_time
        
        accuracy = accuracy_score(newsgroups_test.target, y_pred)
        results.append({
            'Model': f'{clf_name}-{vec_name}',
            'Accuracy': f'{accuracy:.3f}',
            'Time': f'{train_time:.2f}s'
        })

# Print results
print("\nBenchmark Results:")
print(pd.DataFrame(results))


Benchmark Results:
       Model Accuracy   Time
0   NB-Count    0.914  0.00s
1   LR-Count    0.904  0.01s
2  NB-TF-IDF    0.900  0.00s
3  LR-TF-IDF    0.907  0.01s


In [8]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from time import time

# Load minimal dataset
categories = ['alt.atheism', 'comp.graphics']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, 
                                     remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, 
                                    remove=('headers', 'footers', 'quotes'))

# Configure vectorizers
vectorizers = {
    'Count': CountVectorizer(max_features=500, stop_words='english'),
    'TF-IDF': TfidfVectorizer(max_features=500, stop_words='english')
}

# Configure classifiers
classifiers = {
    'NB': MultinomialNB(),
    'LR': LogisticRegression(max_iter=100, solver='liblinear'),
    'SVM': LinearSVC(max_iter=100, tol=0.01),
    'DecisionTree': DecisionTreeClassifier(max_depth=50, min_samples_split=5)
}

# Benchmark results
results = []

for vec_name, vectorizer in vectorizers.items():
    # Transform data
    X_train = vectorizer.fit_transform(newsgroups_train.data)
    X_test = vectorizer.transform(newsgroups_test.data)
    
    for clf_name, clf in classifiers.items():
        start_time = time()
        clf.fit(X_train, newsgroups_train.target)
        y_pred = clf.predict(X_test)
        train_time = time() - start_time
        
        accuracy = accuracy_score(newsgroups_test.target, y_pred)
        results.append({
            'Model': f'{clf_name}-{vec_name}',
            'Accuracy': f'{accuracy:.3f}',
            'Time': f'{train_time:.2f}s'
        })

# Print results
print("\nBenchmark Results:")
print(pd.DataFrame(results))


Benchmark Results:
                 Model Accuracy   Time
0             NB-Count    0.914  0.00s
1             LR-Count    0.904  0.02s
2            SVM-Count    0.888  0.00s
3   DecisionTree-Count    0.831  0.02s
4            NB-TF-IDF    0.900  0.00s
5            LR-TF-IDF    0.907  0.00s
6           SVM-TF-IDF    0.912  0.02s
7  DecisionTree-TF-IDF    0.839  0.03s


In [14]:
# Import essential libraries
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from time import time

# Load smaller dataset for testing
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=['alt.atheism', 'comp.graphics', 'sci.med'],
    remove=('headers', 'footers', 'quotes')
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=['alt.atheism', 'comp.graphics', 'sci.med'],
    remove=('headers', 'footers', 'quotes')
)

# Feature extraction
vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=5,
    max_df=0.8,
    stop_words='english'
)

# Transform text data
X_train = vectorizer.fit_transform(newsgroups_train.data)
X_test = vectorizer.transform(newsgroups_test.data)

# Initialize classifiers
classifiers = {
    'NaiveBayes': MultinomialNB(),
    'LogisticRegression': LogisticRegression(max_iter=200),
    'LinearSVC': LinearSVC(max_iter=200),
    'DecisionTree': DecisionTreeClassifier(max_depth=20)
}

# Train and evaluate
results = []
for name, clf in classifiers.items():
    print(f"Training {name}...")
    start = time()
    clf.fit(X_train, newsgroups_train.target)
    y_pred = clf.predict(X_test)
    duration = time() - start
    accuracy = accuracy_score(newsgroups_test.target, y_pred)
    results.append([name, accuracy, duration])

# Display results
df_results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Time'])
print("\nResults:")
print(df_results.to_string(index=False))

Training NaiveBayes...
Training LogisticRegression...
Training LinearSVC...
Training DecisionTree...

Results:
        Classifier  Accuracy     Time
        NaiveBayes  0.851449 0.015625
LogisticRegression  0.838768 0.031632
         LinearSVC  0.838768 0.016161
      DecisionTree  0.645833 0.077739
