In [None]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier

# Load the 20 Newsgroups dataset (all documents; the train/test split is made below).
# NOTE(review): headers, footers and quoted replies are left in the text. The
# scikit-learn dataset guide suggests remove=('headers', 'footers', 'quotes')
# for more realistic scores -- confirm which setting this exercise expects.
newsgroups = fetch_20newsgroups(subset='all')
y = newsgroups.target

# Split the RAW documents before vectorizing. The TF-IDF vocabulary and IDF
# weights must be learned from the training documents only; fitting the
# vectorizer on the whole corpus would leak test-set statistics into the
# features and bias every score computed from X_test.
docs_train, docs_test, y_train, y_test = train_test_split(
    newsgroups.data, y, test_size=0.2, random_state=42
)

# Preprocess the data: fit TF-IDF on the training documents, then reuse that
# fitted vocabulary/IDF to encode the held-out test documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)

# Full-corpus matrix, kept so that any code referring to `X` still works.
# It is encoded with the training-only vocabulary, so it carries no leakage.
X = vectorizer.transform(newsgroups.data)

# 4.1 Naive Bayes Classifier
# Train the Naive Bayes classifier
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = naive_bayes_classifier.predict(X_test)

# Calculate F-score for Naive Bayes (weighted by per-class support), then show
# the per-class report and the confusion matrix for the same predictions.
f1_nb = f1_score(y_test, y_pred_nb, average='weighted')
print("Naive Bayes Classifier F-Score:", f1_nb)
print(classification_report(y_test, y_pred_nb, target_names=newsgroups.target_names))
print(confusion_matrix(y_test,y_pred_nb))






Naive Bayes Classifier F-Score: 0.8391099208032284
                          precision    recall  f1-score   support

             alt.atheism       0.86      0.73      0.79       151
           comp.graphics       0.88      0.79      0.83       202
 comp.os.ms-windows.misc       0.89      0.81      0.85       195
comp.sys.ibm.pc.hardware       0.62      0.86      0.72       183
   comp.sys.mac.hardware       0.95      0.82      0.88       205
          comp.windows.x       0.96      0.82      0.89       215
            misc.forsale       0.96      0.62      0.75       193
               rec.autos       0.88      0.93      0.90       196
         rec.motorcycles       0.88      0.95      0.91       168
      rec.sport.baseball       0.96      0.95      0.95       211
        rec.sport.hockey       0.90      0.98      0.94       198
               sci.crypt       0.80      0.97      0.88       201
         sci.electronics       0.91      0.79      0.85       202
                 sci.med

In [None]:
# 4.2 Rocchio Classifier
# NearestCentroid serves as the Rocchio classifier here; fit() returns the
# fitted estimator, so construction and training are done in one expression.
rocchio_classifier = NearestCentroid().fit(X_train, y_train)

# Label every held-out document.
y_pred_rocchio = rocchio_classifier.predict(X_test)

# Compute all three evaluation artefacts first, then print them together.
f1_rocchio = f1_score(y_true=y_test, y_pred=y_pred_rocchio, average='weighted')
rocchio_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_rocchio,
    target_names=newsgroups.target_names,
)
rocchio_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_rocchio)

print("Rocchio Classifier F-Score:", f1_rocchio)
print(rocchio_report)
print(rocchio_confusion)

Rocchio Classifier F-Score: 0.7620053009138394
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.68      0.73       151
           comp.graphics       0.56      0.80      0.66       202
 comp.os.ms-windows.misc       0.78      0.70      0.74       195
comp.sys.ibm.pc.hardware       0.68      0.58      0.63       183
   comp.sys.mac.hardware       0.87      0.76      0.81       205
          comp.windows.x       0.80      0.76      0.78       215
            misc.forsale       0.43      0.79      0.55       193
               rec.autos       0.89      0.80      0.84       196
         rec.motorcycles       0.97      0.85      0.90       168
      rec.sport.baseball       0.94      0.84      0.89       211
        rec.sport.hockey       0.98      0.80      0.88       198
               sci.crypt       0.98      0.78      0.87       201
         sci.electronics       0.51      0.75      0.60       202
                 sci.med    

In [None]:
# 4.3 k-Nearest Neighbor Classifier
# Number of neighbours that vote on each prediction; change this value to
# experiment with other settings of k.
KNN_K = 3

# Build and train the classifier in one step (fit() returns the estimator).
knn_classifier = KNeighborsClassifier(n_neighbors=KNN_K).fit(X_train, y_train)

# Label every held-out document.
y_pred_knn = knn_classifier.predict(X_test)

# Compute all three evaluation artefacts first, then print them together.
f1_knn = f1_score(y_true=y_test, y_pred=y_pred_knn, average='weighted')
knn_report = classification_report(
    y_true=y_test,
    y_pred=y_pred_knn,
    target_names=newsgroups.target_names,
)
knn_confusion = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

print("k-Nearest Neighbor Classifier F-Score:", f1_knn)
print(knn_report)
print(knn_confusion)

k-Nearest Neighbor Classifier F-Score: 0.8116557854506878
                          precision    recall  f1-score   support

             alt.atheism       0.65      0.88      0.75       151
           comp.graphics       0.61      0.75      0.67       202
 comp.os.ms-windows.misc       0.68      0.76      0.72       195
comp.sys.ibm.pc.hardware       0.62      0.66      0.64       183
   comp.sys.mac.hardware       0.75      0.75      0.75       205
          comp.windows.x       0.84      0.73      0.79       215
            misc.forsale       0.78      0.54      0.64       193
               rec.autos       0.87      0.86      0.86       196
         rec.motorcycles       0.91      0.91      0.91       168
      rec.sport.baseball       0.91      0.90      0.90       211
        rec.sport.hockey       0.89      0.93      0.91       198
               sci.crypt       0.86      0.94      0.90       201
         sci.electronics       0.83      0.68      0.75       202
                 