In [21]:
import os
import re
import nltk
import glob
import random
import pickle
import io
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [22]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\petro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\petro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Read and preprocess data. Distribute data sets.

In [23]:
raw_files = glob.glob("*.txt")
print(raw_files)

['business.txt', 'entertainment.txt', 'health.txt', 'politics.txt', 'sports.txt', 'technology.txt']


In [24]:
documents= []
all_words  = []
stp = stopwords.words('english')

for f in raw_files:
    t = open(f, encoding='utf-8').read()
    for p in t.split('\n'):
        p = re.sub(r'[^\w\s]','',p)
        p = re.sub(" \d+", " ", p)
        p = [i.lower() for i in list(set(nltk.word_tokenize(p)) - set(stp))]
        all_words+=p
        documents.append((p, f[:-4]))

random.shuffle(documents)
print(documents[0:5])

[(['unlicensed', 'platform', 'spectrum', 'get', 'iot'], 'technology'), (['digital', 'storage', 'hard', 'drive', 'heliumfilled', 'western', 'tb', 'crammed'], 'technology'), (['ad', 'republican', 'scare', 'health', 'every', 'house', 'care', 'bill', 'this', 'voted', 'tv'], 'politics'), (['months', 'cup', 'go', 'come', 'wait', 'u', 'india', 'host', 'world', 'tournament', 'fifa', 'first'], 'sports'), (['sports', 'association', 'ministry', 'rightly', 'test', 'olympic', 'failed', 'recognises', 'much', 'vaunted', 'indian', 'autonomy', 'put'], 'sports')]


In [25]:
word_features = list(all_words)
print(word_features[0:5])

['studying', 'report', 'financial', 'govt', 'changing']


In [26]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % str(word)] = (word in document_words)
    return features

In [27]:
featuresets = [(document_features(d), c) for (d,c) in documents]

train_set, test_set = featuresets[300:], featuresets[:300]

print(len(train_set), len(test_set))

2850 300


In [28]:
print(test_set[0])



Train and test

In [29]:
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("NaiveBayes accuracy:", (nltk.classify.accuracy(classifier, test_set)) * 100)

NaiveBayes accuracy: 74.0


In [30]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_set)
print("LinearSVC_classifier accuracy:", (nltk.classify.accuracy(LinearSVC_classifier, test_set)) * 100)

LinearSVC_classifier accuracy: 69.66666666666667


In [31]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("MNB_classifier accuracy:", (nltk.classify.accuracy(MNB_classifier, test_set)) * 100)

MNB_classifier accuracy: 75.33333333333333


Save

save_word_features = open("word_features.pickle", "wb")
pickle.dump(word_features, save_word_features)
save_word_features.close()

save_classifier = open("NaiveBayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

save_classifier = open("SVC.pickle", "wb")
pickle.dump(LinearSVC_classifier, save_classifier)
save_classifier.close()

save_classifier = open("MNB.pickle","wb")
pickle.dump(MNB_classifier, save_classifier)
save_classifier.close()
