In [1]:
import numpy as np

from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Dataset loading and preproccessing

news_test = datasets.load_files('./category', encoding='utf-8')

X_train, X_test, y_train, y_test = train_test_split(news_test.data, news_test.target, test_size=0.2)
print(len(X_test))


20420


In [3]:
# Multinomial Naive Bayes Classifier

clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
    ])

clf.fit(X_train, y_train)
predicted = clf.predict(X_test)

# Accuracy

print('Accuracy of MultinomialNB: ',clf.score(X_test, y_test))


Accuracy of MultinomialNB:  0.4944662095984329


In [4]:
# SVM Classifier

clf = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,max_iter=5, tol=None)),
    ])

clf.fit(X_train, y_train)  

predicted = clf.predict(X_test)

# Accuracy 
print('Accuracy of SVC: ',np.mean(predicted == y_test))            

Accuracy of SVC:  0.6786483839373163


In [5]:
# Decision Tree

clf = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier(min_samples_split=50)),
    ])

clf.fit(X_train, y_train)  

predicted = clf.predict(X_test)

# Accuracy 
print('Accuracy of Decision tree: ',np.mean(predicted == y_test))   

Accuracy of Decision tree:  0.5791380999020568


In [3]:
# KNN

clf = Pipeline([
    ('vect', CountVectorizer()), 
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=5,algorithm='kd_tree')),
    ])

clf.fit(X_test, y_test)  

predicted = clf.predict(X_test)

# Accuracy 
print('Accuracy of K Nearest Neighbors: ',np.mean(predicted == y_test))



Accuracy of K Nearest Neighbors:  0.42365328109696376
