In [1]:
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC

In [2]:
# Load the shuffled training split of the 20 Newsgroups corpus.
# NOTE(review): headers, footers and quoted replies are kept in the text;
# confirm that is intended, since the loader offers a `remove=` option.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [3]:
# Learn the TF-IDF vocabulary on the training documents and vectorise them
# in the same call.
tfidf_Vect = TfidfVectorizer()
X_train_tfidf = tfidf_Vect.fit_transform(twenty_train.data)

# Baseline model: multinomial Naive Bayes trained on the TF-IDF features.
# `fit` returns the estimator itself, so it is chained onto construction.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Bare last expression so the notebook still displays the fitted estimator.
clf

MultinomialNB()

In [4]:
# Load the shuffled test split of the same corpus for evaluation.
twenty_test = fetch_20newsgroups(
    shuffle=True,
    subset='test',
)

In [5]:
# Vectorise the test documents with the already-fitted vectoriser
# (transform only -- the vocabulary is not refitted on test data).
X_test_tfidf = tfidf_Vect.transform(raw_documents=twenty_test.data)

In [6]:
# Predict a newsgroup label for every test document with the Naive Bayes model.
predicted = clf.predict(X=X_test_tfidf)

In [7]:
# Accuracy of the Naive Bayes predictions against the test labels.
score = metrics.accuracy_score(
    y_true=twenty_test.target,
    y_pred=predicted,
)
print(score)

0.7738980350504514


# SVM implementation and its Classification report

In [8]:
# Linear support-vector classifier trained on the TF-IDF features.
# `LinearSVC` and `metrics` come from the notebook's top import cell, so
# this cell no longer carries its own (partly duplicated) imports.
svc = LinearSVC()
svc.fit(X_train_tfidf, twenty_train.target)

# Predicted newsgroup label for each test document.
Y_Pred = svc.predict(X_test_tfidf)

# Finding the accuracy of SVM model (as a percentage, two decimals)
acc_svc = round(metrics.accuracy_score(twenty_test.target, Y_Pred) * 100, 2)
print("SVM accuracy is:",acc_svc)

# Classification report (per-class precision / recall / F1) for SVM algorithm
print(
    f"Classification report for classifier {svc}:\n"
    f"{metrics.classification_report(twenty_test.target, Y_Pred)}\n"
)

SVM accuracy is: 85.32
Classification report for classifier LinearSVC():
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       319
           1       0.76      0.80      0.78       389
           2       0.77      0.73      0.75       394
           3       0.71      0.76      0.74       392
           4       0.84      0.86      0.85       385
           5       0.87      0.76      0.81       395
           6       0.83      0.91      0.87       390
           7       0.92      0.91      0.91       396
           8       0.95      0.95      0.95       398
           9       0.92      0.95      0.93       397
          10       0.96      0.98      0.97       399
          11       0.93      0.94      0.93       396
          12       0.81      0.79      0.80       393
          13       0.90      0.87      0.88       396
          14       0.90      0.93      0.92       394
          15       0.84      0.93      0.88       398
        

# KNN implementation and its Classification report

In [9]:
# k-nearest-neighbours classifier on the TF-IDF features, using the 3
# nearest neighbours. `KNeighborsClassifier` comes from the notebook's top
# import cell rather than being imported mid-notebook.
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_tfidf, twenty_train.target)

# Predicted newsgroup label for each test document.
Y_Pred = knn.predict(X_test_tfidf)

# Finding the accuracy of KNN model (as a percentage, two decimals)
acc_knn = round(metrics.accuracy_score(twenty_test.target, Y_Pred) * 100, 2)
print("knn accuracy is:",acc_knn)

# Classification report (per-class precision / recall / F1) for KNN algorithm
print(
    f"Classification report for classifier {knn}:\n"
    f"{metrics.classification_report(twenty_test.target, Y_Pred)}\n"
)

knn accuracy is: 65.79
Classification report for classifier KNeighborsClassifier(n_neighbors=3):
              precision    recall  f1-score   support

           0       0.43      0.75      0.54       319
           1       0.45      0.65      0.53       389
           2       0.52      0.56      0.54       394
           3       0.52      0.62      0.57       392
           4       0.55      0.58      0.56       385
           5       0.69      0.59      0.64       395
           6       0.59      0.48      0.53       390
           7       0.75      0.68      0.71       396
           8       0.83      0.80      0.82       398
           9       0.77      0.75      0.76       397
          10       0.88      0.82      0.85       399
          11       0.74      0.83      0.78       396
          12       0.72      0.47      0.57       393
          13       0.78      0.51      0.62       396
          14       0.81      0.77      0.79       394
          15       0.80      0.72     

# SVM with unigram + bigram TF-IDF features and the resulting accuracy change

In [10]:
# Experiment: TF-IDF over unigrams and bigrams (ngram_range=(1, 2)).
# The vectoriser and feature matrices get experiment-specific names so that
# `X_train_tfidf` / `X_test_tfidf` from the earlier cells are not overwritten:
# the models fitted on those matrices would otherwise be left paired with
# features from a different vocabulary if an earlier cell were re-run.
tfidf_vect_bigram = TfidfVectorizer(ngram_range=(1, 2))
X_train_bigram = tfidf_vect_bigram.fit_transform(twenty_train.data)

# `twenty_test` is reused from the earlier loading cell; fetching the same
# split again with the same arguments is unnecessary.
X_test_bigram = tfidf_vect_bigram.transform(twenty_test.data)

# Using the SVM model
svc = LinearSVC()
svc.fit(X_train_bigram, twenty_train.target)

# Predicting the values using SVM model
Y_Pred = svc.predict(X_test_bigram)

# Finding the accuracy of SVM model when used with bigrams
# (as a percentage, two decimals)
acc_svc = round(metrics.accuracy_score(twenty_test.target, Y_Pred) * 100, 2)
print("SVM accuracy is:",acc_svc)

SVM accuracy is: 85.74


Observation: adding bigrams (`ngram_range=(1, 2)`) raises the SVM test accuracy slightly, from 85.32% to 85.74%.

# SVM with English stop words removed (`TfidfVectorizer(stop_words='english')`)

In [11]:
# Experiment: TF-IDF with scikit-learn's built-in English stop-word list.
# The vectoriser and feature matrices get experiment-specific names so that
# `X_train_tfidf` / `X_test_tfidf` are not overwritten by this cell; models
# fitted on those matrices elsewhere in the notebook stay paired with the
# features they were trained on.
tfidf_vect_stop = TfidfVectorizer(stop_words = 'english')
X_train_stop = tfidf_vect_stop.fit_transform(twenty_train.data)

# `twenty_test` is reused from the earlier loading cell; fetching the same
# split again with the same arguments is unnecessary.
X_test_stop = tfidf_vect_stop.transform(twenty_test.data)

# Train the linear SVM on the stop-word-filtered features.
svc = LinearSVC()
svc.fit(X_train_stop, twenty_train.target)

# Predicted newsgroup label for each test document.
Y_Pred = svc.predict(X_test_stop)

# Accuracy as a percentage, rounded to two decimals.
acc_svc = round(metrics.accuracy_score(twenty_test.target, Y_Pred) * 100,2)
print("SVM Accuracy is:",acc_svc)

SVM Accuracy is: 85.1


Observation: removing English stop words lowers the SVM test accuracy slightly, from 85.32% to 85.10%.