# Automated Classification of Abstracts (Math and Physics)

In [1]:
import matplotlib.pyplot as plt
import urllib
import feedparser
import random
import nltk
import os

from nltk.tokenize import word_tokenize
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import ClassifierI
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from statistics import mode
from wordcloud import WordCloud

## Functions

In [3]:
def search_abstracts(search_query, start, max_results):
    """Query the arXiv API and return the parsed Atom feed.

    Args:
        search_query: arXiv query expression, e.g. ``'all:physics'``. It is
            placed in the URL exactly as given, so it must already be
            URL-safe.
        start: Index of the first result to return (sent as the API's
            ``start`` parameter).
        max_results: Maximum number of entries to request.

    Returns:
        The object produced by ``feedparser.parse`` for the response body.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    # calling the api: http://export.arxiv.org/api/{method_name}?{parameters}
    base_url = 'http://export.arxiv.org/api/query?'
    query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                         start,
                                                         max_results)

    # Map the OpenSearch and arXiv namespace URIs to short prefixes in
    # feedparser's namespace table.
    # NOTE(review): ``_FeedParserMixin`` is a private feedparser attribute;
    # confirm it exists in the installed feedparser version.
    feedparser._FeedParserMixin.namespaces['http://a9.com/-/spec/opensearch/1.1/'] = 'opensearch'
    feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

    # perform a GET request using the base_url and query; the context
    # manager closes the connection once the body has been read
    with urllib.request.urlopen(base_url + query) as response:
        payload = response.read()
    return feedparser.parse(payload)

In [9]:
# English stop words from NLTK; tokens found here are dropped by tokenize_abst.
stop_words = set(stopwords.words("english"))

def tokenize_abst(area, abst_list):
    """Tokenize abstracts, lower-case the tokens and drop stop words.

    Args:
        area: Label attached to every abstract in ``abst_list``
            (e.g. ``'physics'``).
        abst_list: Iterable of abstract strings.

    Returns:
        A pair ``(tokenized, add_words)``: ``tokenized`` is a list of
        ``(words, area)`` tuples, one per abstract, and ``add_words`` is the
        flat list of all kept words across the abstracts.
    """
    tokenized = []
    add_words = []
    for abst in abst_list:
        # Lower-case BEFORE the stop-word test: NLTK's English stop-word list
        # is lower-case, so testing the raw token would let capitalised stop
        # words such as "The" or "We" through.
        lowered = (w.lower() for w in word_tokenize(abst))
        words = [w for w in lowered if w not in stop_words]
        tokenized.append((words, area))
        add_words.extend(words)
    return tokenized, add_words

In [10]:
def feature_selection(abstracts):
    """Build the word-presence feature dict for one tokenized abstract.

    Every word in the module-level ``word_features`` becomes a key whose
    value is True when that word occurs in ``abstracts`` and False otherwise.

    Args:
        abstracts: Iterable of the words of a single abstract.

    Returns:
        dict mapping each feature word to a bool.
    """
    present = set(abstracts)
    return {feature: feature in present for feature in word_features}

In [11]:
def scikit_classifier(name, classf, train_set, test_set):
    """Train a scikit-learn estimator through NLTK's wrapper and print its accuracy.

    Args:
        name: Label printed in front of the accuracy figure.
        classf: Zero-argument callable (the estimator class) that creates the
            scikit-learn estimator to wrap.
        train_set: Labelled feature sets used for training.
        test_set: Labelled feature sets used to measure accuracy.

    Returns:
        The trained ``SklearnClassifier`` wrapper.
    """
    wrapped = SklearnClassifier(classf())
    wrapped.train(train_set)
    accuracy_pct = nltk.classify.accuracy(wrapped, test_set) * 100
    print(name, "accuracy percent:", accuracy_pct)
    return wrapped

In [13]:
def create_files():
    """Download sample abstracts and write each one to its own text file.

    Fetches 50 physics and 50 math abstracts starting at offset 1500,
    shuffles them together and stores them as ``file_<n>.txt`` inside a
    ``math+physics_files`` folder under the current working directory.

    Returns:
        Path of the folder that holds the written files.
    """
    physics_feed = search_abstracts('all:physics', start=1500, max_results=50)
    math_feed = search_abstracts('all:math', start=1500, max_results=50)

    all_abst = [entry.summary for entry in physics_feed.entries]
    all_abst += [entry.summary for entry in math_feed.entries]

    # Mix the two areas so the file number gives no hint of the label.
    random.shuffle(all_abst)

    new_folder_path = os.path.join(os.getcwd(), 'math+physics_files')
    # exist_ok: re-running the function reuses the folder instead of failing.
    os.makedirs(new_folder_path, exist_ok=True)

    for index, abstract in enumerate(all_abst):
        name = 'file' + '_' + str(index) + '.txt'
        # Write UTF-8 explicitly: abstracts may contain non-ASCII characters
        # that the platform default encoding cannot represent.
        with open(os.path.join(new_folder_path, name), 'w', encoding='utf-8') as f:
            f.write(abstract)
    return new_folder_path

In [14]:
def classify_abstract(path_to_files, classifier):
    """Classify every abstract file in a folder and move it into a label folder.

    Each regular file directly inside ``path_to_files`` is tokenized, turned
    into features with ``feature_selection`` and labelled by ``classifier``.
    The file is then moved to ``<path_to_files>/<label>_abstracts/`` and
    renamed to ``<label>_<original name>``.

    Args:
        path_to_files: Folder containing the abstract text files.
        classifier: Object with a ``classify(features)`` method returning the
            label as a string.
    """
    for file_name in os.listdir(path_to_files):
        file_path = os.path.join(path_to_files, file_name)
        # Skip the ``<label>_abstracts`` folders left behind by an earlier run.
        if not os.path.isfile(file_path):
            continue

        # errors='replace' keeps a file written in another encoding from
        # aborting the whole run.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            abstr_words = [w.lower() for w in word_tokenize(f.read())]

        abstract_features = feature_selection(abstr_words)
        # Use the classifier passed in, not a module-level global.
        abstr_class = classifier.classify(abstract_features)

        abstr_class_path = os.path.join(path_to_files, abstr_class + '_abstracts')
        os.makedirs(abstr_class_path, exist_ok=True)

        # Move only after the file has been closed; renaming an open file
        # fails on Windows.
        os.rename(file_path,
                  os.path.join(abstr_class_path, abstr_class + '_' + file_name))

In [None]:
def createWordCloud(words, stopwords):
    """Display a word cloud built from ``words``.

    Args:
        words: Iterable of word strings; they are joined with spaces and
            handed to ``WordCloud.generate``.
        stopwords: Collection of words passed to ``WordCloud`` as its
            ``stopwords`` option.
    """
    wordcloud = WordCloud(max_words=20, stopwords=stopwords)
    wordcloud.generate(' '.join(words))

    # Open the figure before drawing so the image lands on it; creating the
    # figure after imshow left an extra, empty figure on every call.
    plt.figure()
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

## Class

In [12]:
class best_classifier(ClassifierI):
    """Majority-vote ensemble over several already-trained classifiers.

    Every wrapped classifier must provide ``classify(features)``. The
    ensemble's label is the most common label among the individual votes.
    """

    def __init__(self, *classifiers):
        """Store the classifiers whose votes are combined."""
        self.classifiers_ = classifiers

    def _collect_votes(self, features):
        """Return the label predicted by each wrapped classifier, in order."""
        return [c.classify(features) for c in self.classifiers_]

    def classify(self, features):
        """Return the label chosen by most of the wrapped classifiers.

        Raises:
            statistics.StatisticsError: if no classifiers were supplied.
        """
        # NOTE(review): with an even number of voters a tie is possible; how
        # ``statistics.mode`` resolves it depends on the Python version.
        return mode(self._collect_votes(features))

    def confidence(self, features):
        """Return the fraction of classifiers that voted for the winning label.

        Raises:
            statistics.StatisticsError: if no classifiers were supplied.
        """
        # Every classifier's vote must be recorded; appending only once after
        # the loop kept a single vote and always reported a confidence of 1.0.
        votes = self._collect_votes(features)
        choice_votes = votes.count(mode(votes))
        return choice_votes / len(votes)

## Main

In [None]:
# Fetch the first 1500 results of each query from arXiv.
physics_feed = search_abstracts('all:physics', start = 0, max_results = 1500)
math_feed = search_abstracts('all:math', start = 0, max_results = 1500)

physics_abstracts = [entry.summary for entry in physics_feed.entries]
math_abstracts = [entry.summary for entry in math_feed.entries]

# Tokenize each area; the *_words lists hold every kept word of that area.
physics_tokenized, phys_words = tokenize_abst('physics', physics_abstracts)
math_tokenized, math_words = tokenize_abst('math', math_abstracts)

all_words = phys_words + math_words
all_abstracts = physics_tokenized + math_tokenized 

# shuffle all_abstracts so both areas are mixed before the train/test split
random.shuffle(all_abstracts)

all_words = nltk.FreqDist(all_words)

# Use the 5000 most frequent words as features. Slicing ``keys()`` would take
# the first 5000 words in insertion order, regardless of their frequency.
word_features = [word for word, _count in all_words.most_common(5000)]

abstract_features = [(feature_selection(abst), area) for (abst, area) in all_abstracts]

# NOTE(review): the fixed split assumes roughly 3000 abstracts were fetched;
# if the API returned fewer, the test slice shrinks or is empty.
training_set = abstract_features[:2500]
testing_set = abstract_features[2500:]

# Baseline: NLTK's own Naive Bayes classifier.
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes NLTK accurary percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

# scikit-learn classifiers, each trained and scored on the same split.
MNB_classif = scikit_classifier('MultinomialNB', MultinomialNB, training_set, testing_set)
BernoulliNB_classif = scikit_classifier('BernoulliNB', BernoulliNB, training_set, testing_set)
LogisticRegr_classif = scikit_classifier('LogisticRegression', LogisticRegression, training_set, testing_set)
SGD_classif = scikit_classifier('SGDClassifier', SGDClassifier, training_set, testing_set)
LinearSVC_classif = scikit_classifier('LinearSVC', LinearSVC, training_set, testing_set)
NuSVC_classif = scikit_classifier('NuSVC', NuSVC, training_set, testing_set)

# Majority vote over all seven classifiers (an odd number of voters).
selected_classifier = best_classifier(classifier, MNB_classif, BernoulliNB_classif, LogisticRegr_classif, 
                                    SGD_classif, LinearSVC_classif, NuSVC_classif)

In [None]:
# Accuracy of the voting classifier on the held-out abstracts.
ensemble_accuracy = nltk.classify.accuracy(selected_classifier, testing_set) * 100
print("Selected classifier accuracy percent:", ensemble_accuracy)

# Label and share of agreeing votes for the first test abstract.
sample_features = testing_set[0][0]
sample_label = selected_classifier.classify(sample_features)
sample_confidence = selected_classifier.confidence(sample_features) * 100
print("Classified:", sample_label, "with", "Confidence:", sample_confidence, '%')

In [None]:
# Download fresh abstracts to disk, then sort the files by predicted area.
path = create_files()
classify_abstract(path, selected_classifier)

# One word cloud per area: physics first, then math.
for area_words in (phys_words, math_words):
    createWordCloud(area_words, stop_words)