In [159]:
import zipfile
import requests
import os
import codecs
import time
import nltk
import unicodedata
import pickle
import unicodedata
import numpy as np
import tabulate
from collections import defaultdict
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from yellowbrick.text.freqdist import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

In [31]:
dirpath = r'D:\NLP\BBC'
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
zipFile = url[url.rfind("/",0) + 1:]


In [32]:
def DownloadandExtract(dir_path,file_name,url):
    try:
        resp = requests.get(url)
        file_path = os.path.join(dir_path, file_name)
        zfile = open(file_path, 'wb')
        zfile.write(resp.content)
        zfile.close()
        with zipfile.ZipFile(file_path,"r") as zip_ref:
            zip_ref.extractall(dir_path)
    except Exception as e:
        print("Error: {}".format(e))

In [33]:
DownloadandExtract(dirpath,zipFile,url)

In [34]:
class BBCNewsCorpusReader(CategorizedPlaintextCorpusReader):
    
    def resolve(self, fileids, categories):
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids
    
    def docs(self, fileids=None, categories=None):
        """
        Returns the complete text of the document, closing the document
        after we are done reading it and yielding it in a memory safe fashion.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()
    
    def sizes(self, fileids=None, categories=None):
        """
        Returns a list of tuples, the fileid and size on disk of the file.
        This function is used to detect oddly large files in the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)
    
    def describe(self, fileids=None, categories=None):
        """
        Performs a single pass of the corpus and
        returns a dictionary with a variety of metrics
        concerning the state of the corpus.
        """
        started = time.time()

        # Structures to perform counting.
        counts  = nltk.FreqDist()
        tokens  = nltk.FreqDist()

        # Perform single pass over paragraphs, tokenize and count
        for para in self.paras(fileids, categories):
            counts['paras'] += 1

            for sent in para:
                counts['sents'] += 1

                for word in sent:
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files and categories in the corpus
        n_fileids = len(self.resolve(fileids, categories) or self.fileids())
        n_topics  = len(self.categories(self.resolve(fileids, categories)))

        # Return data structure with information
        return {
            'files':  n_fileids,
            'topics': n_topics,
            'paras':  counts['paras'],
            'sents':  counts['sents'],
            'words':  counts['words'],
            'vocab':  len(tokens),
            'lexdiv': float(counts['words']) / float(len(tokens)),
            'ppdoc':  float(counts['paras']) / float(n_fileids),
            'sppar':  float(counts['sents']) / float(counts['paras']),
            'secs':   time.time() - started,
        }            

In [35]:
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'
corpus = BBCNewsCorpusReader(r'D:\NLP\BBC\bbc',DOC_PATTERN, cat_pattern=CAT_PATTERN,encoding='latin-1')

In [36]:
corpus.describe()

{'files': 2225,
 'topics': 5,
 'paras': 12772,
 'sents': 43990,
 'words': 1007937,
 'vocab': 33754,
 'lexdiv': 29.861260887598508,
 'ppdoc': 5.740224719101124,
 'sppar': 3.4442530535546507,
 'secs': 4.6499903202056885}

In [37]:
class Preprocessor(object):
    
    def __init__(self, corpus, target=None,**kwargs):
        self.corpus = corpus
        self.target = target
        
    def fileids(self, fileids=None, categories=None):
        fileids = self.corpus.resolve(fileids, categories)
        if fileids:
            return fileids
        return self.corpus.fileids()
    
    def abspath(self, fileid):
        # Find the directory, relative to the corpus root.
        parent = os.path.relpath(
            os.path.dirname(self.corpus.abspath(fileid)), self.corpus.root
        )

        # Compute the name parts to reconstruct
        basename  = os.path.basename(fileid)
        name, ext = os.path.splitext(basename)

        # Create the pickle file extension
        basename  = name + '.pickle'

        # Return the path to the file relative to the target.
        return os.path.normpath(os.path.join(self.target, parent, basename))
    
    def process(self, fileid):
        """For a single file, checks the location on disk to ensure no errors,
        uses +tokenize()+ to perform the preprocessing, and writes transformed
        document as a pickle to target location.
        """
        # Compute the outpath to write the file to.
        target = self.abspath(fileid)
        parent = os.path.dirname(target)

        # Make sure the directory exists
        if not os.path.exists(parent):
            os.makedirs(parent)

        # Make sure that the parent is a directory and not a file
        if not os.path.isdir(parent):
            raise ValueError(
                "Please supply a directory to write preprocessed data to."
            )

        # Create a data structure for the pickle
        document = list(self.tokenize(fileid))
        # Open and serialize the pickle to disk
        with open(target, 'wb') as f:
            pickle.dump(document, f, pickle.HIGHEST_PROTOCOL)

        # Clean up the document
        del document
    
    def transform(self, fileids=None, categories=None):
        # Make the target directory if it doesn't already exist
        if not os.path.exists(self.target):
            os.makedirs(self.target)

        # Resolve the fileids to start processing
        for fileid in self.fileids(fileids, categories):
            yield self.process(fileid)
            
    def tokenize(self, fileid):
        for paragraph in self.corpus.paras(fileids=fileid):
            yield [
                pos_tag(sent) 
                for sent in paragraph
            ]

In [38]:
target = r'D:\NLP\BBC\processed'
processed_corpus = Preprocessor(corpus,target)
vect = np.vectorize(processed_corpus.process)
vect(processed_corpus.fileids())

array([None, None, None, ..., None, None, None], dtype=object)

In [39]:
class PickledCorpusReader(BBCNewsCorpusReader):
    def docs(self, fileids=None, categories=None):
        fileids = self.resolve(fileids, categories)
        # Load one pickled document into memory at a time.
        for path in self.abspaths(fileids):
            with open(path, 'rb') as f:
                yield pickle.load(f)
            
    def paras(self, fileids=None, categories=None):
        for doc in self.docs(fileids, categories):
            for para in doc:
                yield para
    
    def tagged(self, fileids=None, categories=None):
        for sent in self.sents(fileids, categories):
            for tagged_token in sent:
                yield tagged_token
                
    def words(self, fileids=None, categories=None):
        for tagged in self.tagged(fileids, categories):
            yield tagged[0]

In [46]:
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.pickle'
CAT_PATTERN = r'([\w_\s]+)/.*'
target = r'D:\NLP\BBC\processed'
processed_corpus = PickledCorpusReader(target,DOC_PATTERN, cat_pattern=CAT_PATTERN)

In [47]:
class TextNormalizer(BaseEstimator, TransformerMixin):

    def __init__(self, language='english'):
        self.stopwords  = set(nltk.corpus.stopwords.words(language))
        self.lemmatizer = WordNetLemmatizer()

    def is_punct(self, token):
        return all(unicodedata.category(char).startswith('P') for char in token)

    def is_stopword(self, token):
        return token.lower() in self.stopwords
    
    def normalize(self, allDocuments):
        docsAsList = list()
        for catDocs in allDocuments:
            for document in catDocs:
                wordsList = list()
                for paragraph in document:
                    for sentence in paragraph:
                        for token, tag in sentence:
                            if not self.is_punct(token) and not self.is_stopword(token) and not token.isdigit() and len(token) > 1:
                                wordsList.append(self.lemmatize(token, tag).lower())
                docsAsList.append(" ".join(wordsList))
        return docsAsList
    
    def lemmatize(self, token, pos_tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(pos_tag[0], wn.NOUN)
        return self.lemmatizer.lemmatize(token, tag)

    def fit(self,X,y):
        return self

    def transform(self, X):
        return self.normalize(X)

In [165]:
class CorpusLoader(object):
    def __init__(self, reader, folds=12, shuffle=True, categories=None):
        self.reader = reader
        self.folds  = KFold(n_splits=folds, shuffle=shuffle)
        self.all_files  = np.asarray(self.reader.fileids(categories=categories))
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.fileids(), self.labels(), test_size=0.33,shuffle=shuffle)
        self.cv_files = np.asarray(self.X_train)
        self.test_files = np.asarray(self.X_test)
        
    def fileids(self,file_type="All",idx=None):
        files = []
        if file_type == "TEST":
            files = self.test_files 
        elif file_type == "CV":
            files = self.cv_files
        else:
            files = self.all_files
        if idx is None:
            return files
        return files[idx]
    
    def documents(self,file_type="All",idx=None):
        for fileid in self.fileids(file_type,idx):
            yield list(self.reader.docs(fileids=[fileid]))
            
    def labels(self,file_type="All",idx=None):
        return [
            self.reader.categories(fileids=[fileid])[0]
            for fileid in self.fileids(file_type,idx)
        ]
    def __iter__(self):
        file_type = "CV"
        for train_index, test_index in self.folds.split(self.X_train):
            X_train_fold = self.documents(file_type,train_index)
            y_train_fold = self.labels(file_type,train_index)

            X_test_fold = self.documents(file_type,test_index)
            y_test_fold = self.labels(file_type,test_index)

            yield X_train_fold, X_test_fold, y_train_fold, y_test_fold

In [None]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=0.1,smooth_idf=True,norm='l2',sublinear_tf=True)
def create_pipeline(estimator):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', vectorizer),
        ('classifier',estimator)
    ]
    return Pipeline(steps)

models = []
for form in (MultinomialNB,):
    models.append(create_pipeline(form()))
loader = CorpusLoader(processed_corpus)
scores = []
for model in models:
    search = GridSearchCV(model, cv=12,param_grid={
        'vectorize__max_df' : [1.0,0.9,0.8,0.7,0.6,0.5],
        'vectorize__min_df' : [0.4,0.3,0.2,0.15,0.1,0.05],
        'vectorize__smooth_idf' : [True,False],
        'vectorize__norm' : ['l1','l2'],
        'vectorize__sublinear_tf' : [True,False]
        })
    search.fit([list([doc]) for doc in loader.reader.docs(loader.X_train)],y_train)
    y_pred = search.predict([list([doc]) for doc in loader.reader.docs(loader.X_test)])
    print(classification_report(y_test, y_pred, labels=corpus.categories()))