# **Install Required Libraries**

In [1]:
!pip install scikit-learn numpy pandas





[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# **Import Libraries**

In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


# **Load the Data**

In [None]:
# Fetch the complete 20 Newsgroups corpus, asking the loader to strip
# headers, footers and quoted replies from each post.
newsgroups = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers", "quotes"),
)

# X holds the documents, y the matching target labels.
X, y = newsgroups.data, newsgroups.target


# **Split the Data**

In [13]:
# Hold out 20% of the documents for testing and train on the remaining 80%.
# The fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# **TF-IDF Vectorization**

In [None]:
# TF-IDF features: English stop words are dropped and the vocabulary is
# capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# Fit the vocabulary on the training documents only, then reuse the fitted
# vectorizer to transform the test documents.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# **Train Classification Models: Multinomial Naive Bayes**

In [None]:
# Create a Multinomial Naive Bayes model and fit it on the TF-IDF training features.
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Bare expression so the cell still displays the fitted estimator.
nb_classifier


# **Logistic Regression**

In [None]:
# Create a Logistic Regression model (allowing up to 1000 solver iterations)
# and fit it on the TF-IDF training features.
lr_classifier = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Bare expression so the cell still displays the fitted estimator.
lr_classifier


# **Support Vector Machine (SVM)**

In [17]:
# Create a support vector classifier with a linear kernel and fit it on the
# TF-IDF training features.
# NOTE(review): SVC training time grows quickly with the number of samples;
# LinearSVC may be worth evaluating here, though its results would not be
# identical to SVC(kernel="linear").
svm_classifier = SVC(kernel="linear").fit(X_train_tfidf, y_train)

# Bare expression so the cell still displays the fitted estimator.
svm_classifier


# **Evaluate Model Performance**

In [18]:
# Run each fitted model over the TF-IDF test features, in the order
# Naive Bayes, Logistic Regression, SVM.
nb_pred, lr_pred, svm_pred = (
    model.predict(X_test_tfidf)
    for model in (nb_classifier, lr_classifier, svm_classifier)
)


# **Calculate Accuracy**

In [19]:
# Score each model's test-set predictions against the true labels.
nb_accuracy, lr_accuracy, svm_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (nb_pred, lr_pred, svm_pred)
)

# Report the three accuracies to four decimal places.
print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")
print(f"SVM Accuracy: {svm_accuracy:.4f}")


Naive Bayes Accuracy: 0.6841
Logistic Regression Accuracy: 0.6907
SVM Accuracy: 0.6700


# **Generate Classification Reports**

In [21]:
# Pair each report heading with the predictions it describes, in display order.
report_inputs = (
    ("Naive Bayes Classification Report:", nb_pred),
    ("Logistic Regression Classification Report:", lr_pred),
    ("SVM Classification Report:", svm_pred),
)

# Print a per-class report for every model, labelling rows with the
# newsgroup names instead of numeric targets.
for heading, predictions in report_inputs:
    print(heading)
    print(classification_report(y_test, predictions, target_names=newsgroups.target_names))


Naive Bayes Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.59      0.46      0.52       151
           comp.graphics       0.58      0.63      0.61       202
 comp.os.ms-windows.misc       0.61      0.63      0.62       195
comp.sys.ibm.pc.hardware       0.54      0.69      0.61       183
   comp.sys.mac.hardware       0.77      0.62      0.69       205
          comp.windows.x       0.81      0.79      0.80       215
            misc.forsale       0.74      0.69      0.72       193
               rec.autos       0.70      0.68      0.69       196
         rec.motorcycles       0.42      0.73      0.54       168
      rec.sport.baseball       0.83      0.79      0.81       211
        rec.sport.hockey       0.90      0.87      0.88       198
               sci.crypt       0.77      0.75      0.76       201
         sci.electronics       0.73      0.58      0.65       202
                 sci.med       0.80     