In [1]:
# Start logging process at root level
import logging
# Path of the log file; only needed when the file-based configuration is enabled.
log_dir = "logs/build-dict-ponctuation.log"
# File-based alternative: additionally pass filename=log_dir to basicConfig below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
# basicConfig does nothing when the root logger already has handlers,
# so the root level is also set explicitly.
logging.getLogger().setLevel(logging.INFO)

In [2]:
from collections import Counter
class Vocabulary(object):
    """
    Wrapper class for vocabulary

    Keeps a two-way mapping between lower-cased tokens and integer ids together
    with a frequency counter. The special tokens '<pad>' and '<unknown>' are
    registered first and therefore receive ids 0 and 1. Punctuation characters
    are turned into placeholder tokens (e.g. ',' -> '<comma>') so that they are
    handled like ordinary words.
    """
    def __init__(self):
        # Punctuation character(s) -> placeholder token. Both "'" and "’" map to
        # the same placeholder, so the reverse mapping always yields "'".
        self._punctuation2token = {';': "<semicolon>",
                                   ':': "<colon>",
                                   "'": "<inverted_comma>",
                                   '"': "<quotation_mark>",
                                   ',': "<comma>",
                                   '\n': "<new_line>",
                                   '!': "<exclamation_mark>",
                                   '-': "<hyphen>",
                                   '--': "<hyphens>",
                                   '.': "<period>",
                                   '?': "<question_mark>",
                                   '(': "<left_paren>",
                                   ')': "<right_paren>",
                                   '♪': "<music_note>",
                                   '[': "<left_square>",
                                   ']': "<right_square>",
                                   "’": "<inverted_comma>",
                                   }
        self._reset()

    def _reset(self):
        """
        Empties all mappings and re-registers the special tokens
        :return: None
        """
        self._word2idx = {}
        self._idx2word = {}
        self._counter = Counter()
        self._size = 0
        # Order matters: '<pad>' gets id 0 and '<unknown>' gets id 1.
        self.add_text('<pad>')
        self.add_text('<unknown>')

    def add_word(self, word):
        """
        Adds a token to the vocabulary (lower-cased) and increments its count
        :param word: (str) word to add to vocabulary
        :return: None
        """
        word = word.lower()
        if word not in self._word2idx:
            self._idx2word[self._size] = word
            self._word2idx[word] = self._size
            self._size += 1
        self._counter[word] += 1

    def add_text(self, text):
        """
        Splits text into tokens and adds to the vocabulary
        :param text: (str) text to add to vocabulary
        :return: None
        """
        text = self.clean_text(text)
        tokens = self.tokenize(text)
        for token in tokens:
            self.add_word(token)

    def clean_text(self, text):
        """
        Cleans text for processing: lower-cases it, strips surrounding
        whitespace, replaces punctuation with space-separated placeholder
        tokens and collapses runs of spaces
        :param text: (str) text to be cleaned
        :return: (str) cleaned text
        """
        text = text.lower().strip()
        # Replace longer punctuation keys first; otherwise '-' would consume
        # both characters of '--' and '<hyphens>' could never be produced.
        # sorted() is stable, so keys of equal length keep their original order.
        replacements = sorted(self._punctuation2token.items(),
                              key=lambda pair: len(pair[0]),
                              reverse=True)
        for key, token in replacements:
            text = text.replace(key, ' {} '.format(token))
        text = text.strip()
        while '  ' in text:
            text = text.replace('  ', ' ')
        return text

    def tokenize(self, text):
        """
        Splits text into individual tokens on single spaces
        :param text: (str) text to be tokenized
        :return: (list) list of tokens in text; empty strings are dropped, so an
                 empty text yields an empty list
        """
        return [token for token in text.split(' ') if token]

    def set_vocab(self, vocab):
        """
        Discards the current content and rebuilds the vocabulary from given words
        :param vocab: (iterable of str) words to register after the special tokens
        :return: None
        """
        self._reset()
        for word in vocab:
            self.add_word(word)

    def most_common(self, n):
        """
        Creates a new vocabulary object containing the n most frequent tokens from current vocabulary
        The new object also holds '<pad>' and '<unknown>', which every
        Vocabulary registers on construction.
        :param n: (int) number of most frequent tokens to keep
        :return: (Vocabulary) vocabulary containing n most frequent tokens
        """
        tmp = Vocabulary()
        for w in self._counter.most_common(n):
            tmp.add_word(w[0])
            # Carry over the original frequency instead of the count of 1 from add_word.
            tmp._counter[w[0]] = w[1]
        return tmp

    def load(self, path='vocab.pkl'):
        """
        Loads vocabulary from given path
        :param path: (str) path to pkl object
        :return: None
        """
        # Imported locally so the method works without a module-level import.
        import pickle
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, 'rb') as f:
            state = pickle.load(f)
        # Replace the state only after unpickling succeeded, so a failed load
        # does not leave the object empty.
        self.__dict__.clear()
        self.__dict__.update(state)
        print("\nVocabulary successfully loaded from [{}]\n".format(path))

    def save(self, path='vocab.pkl'):
        """
        Saves vocabulary to given path
        :param path: (str) path where vocabulary should be stored
        :return: None
        """
        # Imported locally so the method works without a module-level import.
        import pickle
        with open(path, 'wb') as f:
            pickle.dump(self.__dict__, f)
        print("\nVocabulary successfully stored as [{}]\n".format(path))

    def add_punctuation(self, text):
        """
        Replaces punctuation tokens with corresponding characters
        :param text: (str) text to process
        :return: text with punctuation tokens replaced with characters
        """
        for key, token in self._punctuation2token.items():
            text = text.replace(token, ' {} '.format(key))
        text = text.strip()
        while '  ' in text:
            text = text.replace('  ', ' ')
        # Re-attach punctuation to the neighbouring words.
        text = text.replace(' :', ':')
        text = text.replace(" ' ", "'")
        text = text.replace("[ ", "[")
        text = text.replace(" ]", "]")
        text = text.replace(" .", ".")
        text = text.replace(" ,", ",")
        text = text.replace(" !", "!")
        text = text.replace(" ?", "?")
        text = text.replace(" ’ ", "’")
        return text

    def __len__(self):
        """
        Number of unique words in vocabulary
        """
        return self._size

    def __str__(self):
        """
        Summary with the vocabulary size and the ten most frequent tokens
        """
        s = "Vocabulary contains {} tokens\nMost frequent tokens:\n".format(self._size)
        for w in self._counter.most_common(10):
            s += "{} : {}\n".format(w[0], w[1])
        return s

    def __getitem__(self, item):
        """
        Returns the word corresponding to an id or an id corresponding to a word in the vocabulary.
        Returns '<unknown>' (for ids) or the id of '<unknown>' (for words) if the
        id/word is not present in the vocabulary. Word lookup is case-sensitive,
        while add_word stores lower-cased words. Any other key type yields None.
        """
        if isinstance(item, int):
            # Honour the documented contract instead of raising KeyError.
            return self._idx2word.get(item, '<unknown>')
        elif isinstance(item, str):
            if item in self._word2idx:
                return self._word2idx[item]
            else:
                return self._word2idx['<unknown>']
        return None

In [3]:
import bz2
# Read the whole compressed dump chunk into memory as a list of raw byte lines.
with bz2.open('datasets/enwiki-chunk-999-1.xml.bz2', 'rb') as f:
    text = list(f)

In [None]:
from gensim.corpora import WikiCorpus


In [10]:
# Fresh vocabulary; the constructor already registers '<pad>' and '<unknown>'.
vocab = Vocabulary()

In [25]:
# Boilerplate lines of the MediaWiki XML dump that carry no article text.
# A raw line is skipped when it contains any of these byte strings.
skipped_markers = (
    b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n',
    b'  <siteinfo>\n',
    b'    <sitename>Wikipedia</sitename>\n',
    b'    <dbname>enwiki</dbname>\n',
    b'    <base>https://en.wikipedia.org/wiki/Main_Page</base>\n',
    b'    <generator>MediaWiki 1.33.0-wmf.19</generator>\n',
    b'    <case>first-letter</case>\n',
    b'    <namespaces>\n',
    b'      <namespace key="-2" case="first-letter">Media</namespace>\n',
    b'</mediawiki>',
)

# Tokenize every remaining line of the dump; each entry of `sentences` is a
# list of tokens terminated by the '<new_line>' placeholder.
sentences = []
for raw_line in text:
    if any(marker in raw_line for marker in skipped_markers):
        continue
    try:
        line = raw_line.decode()
    except AttributeError as err:
        # The element has no .decode(), i.e. `text` no longer holds raw byte
        # lines. Report the actual error and stop.
        print(err)
        break
    # clean_text strips the trailing line break, so the new-line token is
    # appended explicitly.
    sentence = vocab.tokenize(vocab.clean_text(line)) + [vocab._punctuation2token['\n']]
    sentences.append(sentence)

# Show only the last tokenized sentence instead of echoing every dump line.
if sentences:
    print(sentences[-1])

NameError: name 'data' is not defined

In [None]:
from gensim.corpora.wikicorpus import *

def tokenize(content):
    """
    Splits content on whitespace and returns the UTF-8 encoded tokens that are
    at most 15 characters long and do not start with an underscore.

    Meant as a replacement for the tokenizer in gensim's wikicorpus.py.
    NOTE(review): defining it here only rebinds the notebook-level name; confirm
    that WikiCorpus actually picks it up. A later cell defines `tokenize` again.

    :param content: (str) text to tokenize
    :return: (list of bytes) encoded tokens that passed the filter
    """
    kept = []
    for token in content.split():
        if len(token) > 15 or token.startswith('_'):
            continue
        kept.append(token.encode('utf8'))
    return kept

# Build a gensim corpus over the dump chunk with lemmatization switched off.
wiki = WikiCorpus('datasets/enwiki-chunk-999-1.xml.bz2', lemmatize=False)

# Peek at the first article only. The loop variable is deliberately not named
# `text`, so the list of raw dump lines stored in `text` earlier in the
# notebook is not overwritten.
for article_tokens in wiki.get_texts():
    print(article_tokens)
    break

In [None]:
import sys
import os

#parent = os.path.dirname(os.path.realpath(__file__))
#sys.path.append(parent + '/../venv/lib/python2.7/site-packages/gensim/corpora/')

from gensim.corpora.wikicorpus import *

def tokenize(content):
    """
    Tokenizes content with gensim's ``utils.tokenize`` (called with lower=True
    and errors='ignore') and returns the UTF-8 encoded tokens that are at most
    15 characters long and do not start with an underscore.

    Meant as a replacement for the tokenizer in gensim's wikicorpus.py; within
    this notebook it is called by `process_article`.

    :param content: (str) text to tokenize
    :return: (list of bytes) encoded tokens that passed the filter
    """
    encoded = []
    for token in utils.tokenize(content, lower=True, errors='ignore'):
        if len(token) > 15 or token.startswith('_'):
            continue
        encoded.append(token.encode('utf8'))
    return encoded

def process_article(args):
    """
    Converts one raw article into tokens while keeping its title and page id.

    Meant as a replacement for the function of the same purpose in gensim's
    wikicorpus.py. The markup is removed with `filter_wiki`; the remaining text
    is either lemmatized with `utils.lemmatize` or tokenized with `tokenize`.

    :param args: (tuple) (text, lemmatize, title, pageid)
    :return: (tuple) (result, title, pageid)
    """
    raw_text, use_lemmatizer, title, pageid = args
    plain_text = filter_wiki(raw_text)
    result = utils.lemmatize(plain_text) if use_lemmatizer else tokenize(plain_text)
    return result, title, pageid


class MyWikiCorpus(WikiCorpus):
    """
    WikiCorpus variant whose `get_texts` runs every article through the
    notebook-level `process_article` and keeps title and page id available.
    """
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        """
        Forwards all arguments unchanged to `WikiCorpus.__init__`.
        """
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)

    def get_texts(self):
        """
        Iterates over the articles of the dump and yields their tokens.

        Articles with fewer than ARTICLE_MIN_WORDS tokens and articles whose
        title starts with one of IGNORED_NAMESPACES followed by ':' are skipped.

        :return: generator yielding `tokens`, or `(tokens, (pageid, title))`
                 when `self.metadata` is truthy
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        finally:
            # Always release the worker processes, including when the consumer
            # stops iterating early (e.g. `break` after the first article).
            pool.terminate()

        # Only reached after a complete pass over the corpus.
        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length

In [None]:
# Instantiate the customised corpus on the dump chunk, with lemmatization disabled.
mywiki = MyWikiCorpus('datasets/enwiki-chunk-999-1.xml.bz2', lemmatize=False)

In [None]:
# Peek at the first article produced by the customised corpus `mywiki`
# (the original loop iterated the plain `wiki` object again). The loop
# variable is not named `text`, so the raw dump lines stored in `text`
# earlier in the notebook stay intact.
for article_tokens in mywiki.get_texts():
    print(article_tokens)
    break