## tCoIR - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [4]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re
import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import text_corpus
import textacy.keyterms
import gui_utility

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

logger = utility.getLogger('corpus_text_analysis')

utility.setup_default_pd_display(pd)

DATA_FOLDER = '../../data'
DF_TAGSET = pd.read_csv(os.path.join(DATA_FOLDER, 'tagset.csv'), sep='\t').fillna('')

%matplotlib inline

current_corpus_container = lambda: textacy_utility.CorpusContainer.container()
current_corpus = lambda: textacy_utility.CorpusContainer.corpus()
current_document_index = lambda: current_corpus_container().document_index

import domain_logic_vatican as domain_logic

extract_args = dict(
    args=dict(
        ngrams=[1],
        named_entities=False,
        normalize='lemma',
        as_strings=True
    ),
    kwargs=dict(
        min_freq=1,
        include_pos=['NOUN'],
        filter_stops=True,
        filter_punct=True
    ),
    extra_stop_words=None,
    substitutions=None
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [5]:
import textacy_corpus_utility as textacy_utility
import textacy_corpus_gui

try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, container=container)
except Exception as ex:
    raise
    logger.error(ex)


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

In [77]:
import array
import scipy.sparse as sp
import itertools

        
def sliding_window(seq, n):
    "Returns a sliding centered window of size +/-n from (of width 2 * n + 1) "
    y = [None] * n + list(seq) + [None] * n
    for i in range(n, len(y)+n):
        yield y[i-n:i+n+1]

def sliding_window_it(it, n):
    it = itertools.chain([None] * n, it, [None] * n * 2)
    tail = tuple(itertools.islice(it, n))
    head = tuple(itertools.islice(it, n+1))
    for v in it:
        yield list(tail + head + (v,))
        tail = tail[1:] + (head[0],)
        head = head[1:] + (v,)

def sliding_window3(seq, n=2):
    "Returns a sliding window (of width n) over data from the iterable"
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(seq)
    result = tuple(itertools.islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

class HyperspaceAnalogueToLanguageVectorizer():
    
    def __init__(self, corpus=None, token2id=None):
        """
        Build vocabulary and create P_ij term-term matrix and P_i term global occurence vector
        
        Parameter:
            corpus Iterable[Iterable[str]]

        """
        self.token2id = token2id
        self.corpus = corpus
        
        if corpus is not None and token2id is None:
            self.token2id = self._build_vocabulary(corpus)
        
        self.p_ij = None
        self.p_i = None

        self._id2token = None
        
    def _build_vocabulary(self, corpus):
        ''' Iterates corpus and add distict terms to vocabulary '''
        logger.info('Builiding vocabulary...')
        token2id = collections.defaultdict()
        token2id.default_factory = token2id.__len__
        term_count = 0
        for doc in corpus:
            for term in doc:
                token2id[term]
                term_count += 1
        self.term_count = term_count
        logger.info('Vocabulary of size {} built from {} terms.'.format(len(token2id), term_count))
        return dict(token2id)
    
    @property
    def id2token(self):
        if self._id2token is None:
            if self.token2id is not None:
                self._id2token = { v:k for k,v in self.token2id.items() }
        return self._id2token
    
    def sliding_window(self, seq, n):
        it = itertools.chain(iter(seq), [None] * n)
        memory = tuple(itertools.islice(it, n+1))
        if len(memory) == n+1:
            yield memory
        for x in it:
            memory = memory[1:] + (x,)
            yield memory
        
    def fit(self, corpus=None, size=2, weighing=0):
        
        '''Trains HAL for a document. Note that sentence borders (for now) are ignored'''
        
        if corpus is not None:
            self.corpus = corpus
            self.token2id = self._build_vocabulary(corpus)
            
        assert self.token2id is not None, "Fit with no vocabulary!"
        assert self.corpus is not None, "Fit with no corpus!"

        p_ij = sp.lil_matrix ((len(self.token2id), len(self.token2id)), dtype=np.float64)
        p_i = np.zeros(len(self.token2id), dtype=np.int32)
        
        for terms in corpus:
            
            id_terms = ( self.token2id[size] for size in terms)

            for win in self.sliding_window(id_terms, size):
                
                logger.info([ self.id2token[x] if x is not None else None for x in win])
                
                if win[0] is None:
                    continue
                    
                for x in win:
                    if x is not None:
                        p_i[x] += 1

                for i in range(1, size+1):

                    if win[i] is None:
                        continue
                        
                    d = i # abs(n - i)
                    if weighing == 0: #  linear i.e. adjacent equals window size, then decreasing by one
                        w = size - d + 1
                    elif weighing == 1: # f(d) = 1 / d
                        w = 1.0 / d
                    elif weighing == 2: # Constant value of 1
                        w = 1

                    #print('*', i, self.id2token[win[0]], self.id2token[win[i]], w, [ self.id2token[x] if x is not None else None for x in win])
                    p_ij[win[0], win[i]] += w

        self.p_i = p_i
        self.p_ij = p_ij

        return self
    
    def to_df(self):
        columns = [ self.id2token[i] for i in range(0,len(self.token2id))]
        return pd.DataFrame(
            data=self.p_ij.todense(),
            index=list(columns),
            columns=list(columns),
            dtype=np.float64
        ).T
    
    def cooccurence(self):
        
        coo = self.p_ij.tocoo(copy=False)
        df_p_i = pd.DataFrame(self.p_i, columns=['p_i_count'])
        df = pd.DataFrame({'x_id': coo.row, 'y_id': coo.col, 'p_xy': coo.data})[['x_id', 'y_id', 'p_xy']].sort_values(['x_id', 'y_id']).reset_index(drop=True)
        df = df.assign(x_term=df.x_id.apply(lambda x: self.id2token[x]), y_term=df.y_id.apply(lambda x: self.id2token[x]))
        df = df.merge(df_p_i, left_on='x_id', right_index=True, how='inner').rename(columns={'p_i_count': 'p_x'})
        df = df.merge(df_p_i, left_on='y_id', right_index=True, how='inner').rename(columns={'p_i_count': 'p_y'})
        df = df[['x_id', 'y_id', 'x_term', 'y_term', 'p_xy', 'p_x', 'p_y']]
        df.p_xy = df.p_xy / self.term_count
        df.p_x = df.p_x / self.term_count
        df.p_y = df.p_y / self.term_count
        
        df = df.assign(score=df.p_xy / (df.p_x + df.p_y - df.p_xy))
        
        return df
    
test_burgess_litmus_test()

2019-02-24 08:05:13,262 : INFO : Builiding vocabulary...
2019-02-24 08:05:13,264 : INFO : Vocabulary of size 7 built from 8 terms.
2019-02-24 08:05:13,266 : INFO : ['the', 'horse', 'raced', 'past', 'the', 'barn']
2019-02-24 08:05:13,268 : INFO : ['horse', 'raced', 'past', 'the', 'barn', 'fell']
2019-02-24 08:05:13,269 : INFO : ['raced', 'past', 'the', 'barn', 'fell', '.']
2019-02-24 08:05:13,271 : INFO : ['past', 'the', 'barn', 'fell', '.', None]
2019-02-24 08:05:13,272 : INFO : ['the', 'barn', 'fell', '.', None, None]
2019-02-24 08:05:13,273 : INFO : ['barn', 'fell', '.', None, None, None]
2019-02-24 08:05:13,275 : INFO : ['fell', '.', None, None, None, None]
2019-02-24 08:05:13,276 : INFO : ['.', None, None, None, None, None]


Test run OK


In [66]:
def test_burgess_litmus_test():
    terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
    answer = {
     'barn':  {'.': 4,  'barn': 0,  'fell': 5,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
     'fell':  {'.': 5,  'barn': 0,  'fell': 0,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
     'horse': {'.': 0,  'barn': 2,  'fell': 1,  'horse': 0,  'past': 4,  'raced': 5,  'the': 3},
     'past':  {'.': 2,  'barn': 4,  'fell': 3,  'horse': 0,  'past': 0,  'raced': 0,  'the': 5},
     'raced': {'.': 1,  'barn': 3,  'fell': 2,  'horse': 0,  'past': 5,  'raced': 0,  'the': 4},
     'the':   {'.': 3,  'barn': 6,  'fell': 4,  'horse': 5,  'past': 3,  'raced': 4,  'the': 2}
    }
    df_answer = pd.DataFrame(answer).astype(np.int32)[['the', 'horse', 'raced', 'past', 'barn', 'fell']].sort_index()
    #display(df_answer)
    vectorizer = HyperspaceAnalogueToLanguageVectorizer()
    vectorizer.fit([terms], size=5)
    df_imp = vectorizer.to_df().astype(np.int32)[['the', 'horse', 'raced', 'past', 'barn', 'fell']].sort_index()
    assert df_imp.equals(df_answer), "Test failed"
    #df_imp == df_answer
    print('Test run OK')
        

test_burgess_litmus_test()

2019-02-24 07:59:39,018 : INFO : Builiding vocabulary...
2019-02-24 07:59:39,019 : INFO : Vocabulary of size 7 built from 8 terms.


* 1 the horse 5 ['the', 'horse', 'raced', 'past', 'the', 'barn']
* 2 the raced 4 ['the', 'horse', 'raced', 'past', 'the', 'barn']
* 3 the past 3 ['the', 'horse', 'raced', 'past', 'the', 'barn']
* 4 the the 2 ['the', 'horse', 'raced', 'past', 'the', 'barn']
* 5 the barn 1 ['the', 'horse', 'raced', 'past', 'the', 'barn']
* 1 horse raced 5 ['horse', 'raced', 'past', 'the', 'barn', 'fell']
* 2 horse past 4 ['horse', 'raced', 'past', 'the', 'barn', 'fell']
* 3 horse the 3 ['horse', 'raced', 'past', 'the', 'barn', 'fell']
* 4 horse barn 2 ['horse', 'raced', 'past', 'the', 'barn', 'fell']
* 5 horse fell 1 ['horse', 'raced', 'past', 'the', 'barn', 'fell']
* 1 raced past 5 ['raced', 'past', 'the', 'barn', 'fell', '.']
* 2 raced the 4 ['raced', 'past', 'the', 'barn', 'fell', '.']
* 3 raced barn 3 ['raced', 'past', 'the', 'barn', 'fell', '.']
* 4 raced fell 2 ['raced', 'past', 'the', 'barn', 'fell', '.']
* 5 raced . 1 ['raced', 'past', 'the', 'barn', 'fell', '.']
* 1 past the 5 ['past', 'the', 'b

In [20]:
corpus = [ current_corpus()[0] ]
terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, extract_args) ]
#terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
vectorizer = HyperspaceAnalogueToLanguageVectorizer()
vectorizer.fit(terms)
df = vectorizer.cooccurence()
df

2019-02-24 07:15:03,129 : INFO : Builiding vocabulary...
2019-02-24 07:15:03,132 : INFO : Vocabulary of size 115 built from 171 terms.


In [16]:
df

In [59]:
def sliding_window(seq, n):
    "Returns a sliding centered window of size +/-n from (of width 2 * n + 1) "
    y = [None] * n + list(seq) + [None] * n
    #y = itertools.chain([pad_value] * pad_left, it, [pad_value] * n * 2)
    for i in range(n, len(y)+n):
        yield y[i-n:i+n+1]

def sliding_window2(seq, n):
    it = itertools.chain(iter(seq), [None] * n)
    memory = tuple(itertools.islice(it, n+1))
    if len(memory) == n+1:
        yield memory
    for x in it:
        memory = memory[1:] + (x,)
        yield memory
                    
terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
print(list(sliding_window2(terms, 5)))
print(list(sliding_window_it(terms, 2)))


[('the', 'horse', 'raced', 'past', 'the', 'barn'), ('horse', 'raced', 'past', 'the', 'barn', 'fell'), ('raced', 'past', 'the', 'barn', 'fell', '.'), ('past', 'the', 'barn', 'fell', '.', None), ('the', 'barn', 'fell', '.', None, None), ('barn', 'fell', '.', None, None, None), ('fell', '.', None, None, None, None), ('.', None, None, None, None, None)]
[[None, None, 'the', 'horse', 'raced', 'past'], [None, 'the', 'horse', 'raced', 'past', 'the'], ['the', 'horse', 'raced', 'past', 'the', 'barn'], ['horse', 'raced', 'past', 'the', 'barn', 'fell'], ['raced', 'past', 'the', 'barn', 'fell', '.'], ['past', 'the', 'barn', 'fell', '.', None], ['the', 'barn', 'fell', '.', None, None], ['barn', 'fell', '.', None, None, None], ['fell', '.', None, None, None, None]]
