# BOW

In [408]:
import itertools
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.io
import spacy
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, HYPHENS
from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.lang.en import English
from spacy.util import compile_infix_regex

In [83]:
# Project root. The default is the original author's machine-specific path;
# set the CC_ROOT_DIR environment variable to run the notebook elsewhere.
ROOT_DIR = os.environ.get('CC_ROOT_DIR', 'C:/Users/rossz/OneDrive/CC')
# All input/output feather files addressed through DATA_DIR live under `<ROOT_DIR>/data`.
DATA_DIR = f'{ROOT_DIR}/data'

## init nlp

In [3]:
%%time

# Load the large English model with the listed pipeline components disabled.
disabled_pipes = ['ner', 'tagger', 'sentencizer', 'parser']
nlp = spacy.load('en_core_web_lg', disable=disabled_pipes)

# Infix patterns added on top of spaCy's ellipsis and icon lists.
custom_infixes = [
    r"(?<=[0-9])[+\-\*^](?=[0-9-])",
    r"(?<=[{al}{q}])\.(?=[{au}{q}])".format(
        al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES
    ),
    r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
    # EDIT: commented out regex that splits on hyphens between letters:
    # r"(?<=[{a}])(?:{h})(?=[{a}])".format(a=ALPHA, h=HYPHENS),
    # EDIT: '/' not considered as a separator:
    r"(?<=[{a}0-9])[:<>=](?=[{a}])".format(a=ALPHA),
]
infixes = LIST_ELLIPSES + LIST_ICONS + custom_infixes

# Install the custom infix matcher on the tokenizer.
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

# Raise the maximum text length the pipeline accepts.
nlp.max_length = 8_000_000

Wall time: 10 s


## tokenize

In [84]:
# Load the input table; its `text_present`, `text_qa` and `text_all` columns are tokenized in the cells below.
cc = pd.read_feather(f'{DATA_DIR}/f_sue_keydevid_car_finratio_transcriptid_text.feather')

In [5]:
# Run every `text_present` entry through the spaCy pipeline and store the results back in the same column.
# NOTE(review): this overwrites the input column in place, so re-running the cell feeds its own
# output back into `nlp.pipe`; reload `cc` first.
%time cc['text_present'] = list(nlp.pipe(cc['text_present'])) # 2min 4s

Wall time: 2min 4s


In [61]:
# Same in-place tokenization for the `text_qa` column (reload `cc` before re-running).
# NOTE(review): the trailing timing comment differs from the recorded wall time of this cell (7min 50s).
%time cc['text_qa'] = list(nlp.pipe(cc['text_qa'])) # 8min 38s

Wall time: 7min 50s


In [63]:
# Same in-place tokenization for the `text_all` column (reload `cc` before re-running).
%time cc['text_all'] = list(nlp.pipe(cc['text_all']))

Wall time: 26min 4s


## get n_gram

Note:
- N-grams are counted on spaCy's normalized token form (`use_norm=True`), which is typically the lowercased text.

In [59]:
def get_bigram_count(doc, use_norm=False):
    '''
    Count adjacent token pairs in `doc`.

    A pair is skipped when either token is flagged as punctuation, whitespace,
    number-like, a currency symbol, URL-like or email-like.

    doc: sequence of tokens (each token must provide `nbor()`)
    use_norm: if True, keys are built from each token's `norm_`; otherwise from `str(token)`

    Returns a `defaultdict(int)` mapping 'left right' -> count.
    '''
    skip_flags = ('is_punct', 'is_space', 'like_num', 'is_currency', 'like_url', 'like_email')
    counts = defaultdict(int)

    # the last token has no right-hand neighbour, so stop one short
    for left in doc[:(len(doc)-1)]:
        right = left.nbor()
        if any(getattr(left, flag) or getattr(right, flag) for flag in skip_flags):
            continue
        if use_norm:
            key = f'{left.norm_} {right.norm_}'
        else:
            key = f'{left} {right}'
        counts[key] += 1
    return counts

def get_unigram_count(doc, use_norm=False):
    '''
    Count single tokens in `doc`.

    Tokens flagged as punctuation, whitespace, number-like, a currency symbol,
    URL-like or email-like are not counted.

    doc: iterable of tokens
    use_norm: if True, keys are each token's `norm_`; otherwise `str(token)`

    Returns a `defaultdict(int)` mapping term -> count.
    '''
    counts = defaultdict(int)
    for tok in doc:
        is_noise = (
            tok.is_punct or tok.is_space or tok.like_num
            or tok.is_currency or tok.like_url or tok.like_email
        )
        if not is_noise:
            key = f'{tok.norm_}' if use_norm else f'{tok}'
            counts[key] += 1
    return counts

In [56]:
%%time
# Per-document unigram and bigram counts for the `text_present` column (normalized token form).
cc['text_present_unigram_count'] = cc['text_present'].apply(get_unigram_count, use_norm=True)
cc['text_present_bigram_count'] = cc['text_present'].apply(get_bigram_count, use_norm=True)
# "allgram" = unigram and bigram counts merged into one dict per document.
cc['text_present_allgram_count'] = [
    {**unigrams, **bigrams}
    for unigrams, bigrams in zip(cc['text_present_unigram_count'], cc['text_present_bigram_count'])
]

In [62]:
%%time
# Per-document unigram and bigram counts for the `text_qa` column (normalized token form).
cc['text_qa_unigram_count'] = cc['text_qa'].apply(get_unigram_count, use_norm=True)
cc['text_qa_bigram_count'] = cc['text_qa'].apply(get_bigram_count, use_norm=True)
# "allgram" = unigram and bigram counts merged into one dict per document.
cc['text_qa_allgram_count'] = [
    {**unigrams, **bigrams}
    for unigrams, bigrams in zip(cc['text_qa_unigram_count'], cc['text_qa_bigram_count'])
]

Wall time: 9min 7s


In [64]:
%%time 
# Per-document unigram and bigram counts for the `text_all` column (normalized token form).
cc['text_all_unigram_count'] = cc['text_all'].apply(get_unigram_count, use_norm=True)
cc['text_all_bigram_count'] = cc['text_all'].apply(get_bigram_count, use_norm=True)
# "allgram" = unigram and bigram counts merged into one dict per document.
cc['text_all_allgram_count'] = [
    {**unigrams, **bigrams}
    for unigrams, bigrams in zip(cc['text_all_unigram_count'], cc['text_all_bigram_count'])
]

Wall time: 0 ns


In [73]:
%%time 
# Persist `docid` plus the nine n-gram count columns (present/qa/all x unigram/bigram/allgram).
# NOTE(review): the output path is relative to the working directory, not to DATA_DIR.
ngram_count_cols = [
    f'text_{section}_{gram}_count'
    for section in ('present', 'qa', 'all')
    for gram in ('unigram', 'bigram', 'allgram')
]
cc[['docid', *ngram_count_cols]].to_pickle('data/cc_ngram.pkl')

Wall time: 4min 28s


## get `term_freq`

> Make sure `cc_ngram` is already loaded into the global namespace: `CCCorpus` reads it as a global and asserts that it exists.

In [31]:
# `ld` is a helper defined outside this section; per the output below it loads `cc_ngram`.
# `CCCorpus` requires `cc_ngram` to be present in the global namespace.
ld('cc_ngram')

-cc_ngram- loaded


In [76]:
class CCCorpus():
    '''
    Vocabulary and sparse document-term matrices (DTMs) built from the global `cc_ngram` table.

    `cc_ngram` must provide a `docid` column and, for the chosen `ngram_type`, columns named
    `text_{cc_type}_{ngram_type}_count` whose entries are dicts mapping term -> count.

    Attributes set on construction: `ngram_type`, `docid`, `ndoc`, `vocab`, `nvocab`, `word2idx`.
    Attributes set by `get_dtm_by_cctype`: `dtm_{cc_type}` (e.g. `dtm_present`, `dtm_qa`, `dtm_all`).
    '''
    def __init__(self, ngram_type: str):
        '''
        ngram_type: 'unigram', 'bigram' or 'allgram'
        '''
        assert 'cc_ngram' in globals(), 'Load `cc_ngram` first!'

        self.ngram_type = ngram_type
        self.docid = cc_ngram.docid.to_numpy()
        self.ndoc = len(self.docid)

        # create vocab
        self.get_vocab()

    def get_vocab(self):
        '''
        Build `vocab` (sorted array of terms), `nvocab` and `word2idx` (term -> column index).

        The vocabulary is the union of the terms in the `all` column and, when present, the
        `present` and `qa` columns. Building it from `all` alone would leave any term that only
        occurs in `present`/`qa` without a column. Sorting makes the column order reproducible
        across runs (plain set iteration order is not).
        '''
        text_cols = [f'text_all_{self.ngram_type}_count']
        for cc_type in ('present', 'qa'):
            col = f'text_{cc_type}_{self.ngram_type}_count'
            if col in cc_ngram.columns:
                text_cols.append(col)

        terms = set()
        for col in text_cols:
            for counts in cc_ngram[col]:
                terms.update(counts)    # adds the dict's keys

        sorted_terms = sorted(terms)
        self.vocab = np.array(sorted_terms)
        self.nvocab = len(sorted_terms)
        self.word2idx = {word: idx for idx, word in enumerate(sorted_terms)}

    def get_dtm_by_cctype(self, cc_type: str):
        '''
        Build the (ndoc x nvocab) term-count matrix for one text column and store it as
        `self.dtm_{cc_type}` (a scipy COO matrix).

        cc_type: 'present', 'qa' or 'all'

        Returns `self` so calls can be chained. A term that is missing from the vocabulary
        raises `KeyError` instead of being silently counted under another column.
        '''
        text_col = f'text_{cc_type}_{self.ngram_type}_count'
        doc_counts = cc_ngram[text_col]

        # total number of (document, term) pairs = number of stored entries
        n_nonzero = sum(len(d) for d in doc_counts)

        data = np.empty(n_nonzero, dtype=np.intc)     # term frequency of the kth entry
        rows = np.empty(n_nonzero, dtype=np.intc)     # document (row) index of the kth entry
        cols = np.empty(n_nonzero, dtype=np.intc)     # vocabulary (column) index of the kth entry

        ind = 0     # next free position in data/rows/cols

        # fill dtm
        for doc_i, counts in tqdm(enumerate(doc_counts), total=len(doc_counts), desc=f'Building {cc_type.upper()} dtm'):
            ind_end = ind + len(counts)   # entries ind..ind_end-1 belong to this document

            cols[ind:ind_end] = np.fromiter((self.word2idx[term] for term in counts), dtype=np.intc)
            data[ind:ind_end] = np.fromiter(counts.values(), dtype=np.intc)
            rows[ind:ind_end] = doc_i

            ind = ind_end  # resume with next document -> add data to the end

        # final dtm
        dtm = coo_matrix((data, (rows, cols)), shape=(self.ndoc, self.nvocab), dtype=np.intc)
        setattr(self, f'dtm_{cc_type}', dtm)

        return self

    def get_dtm_all(self):
        '''Build `dtm_present`, `dtm_qa` and `dtm_all`; returns `self`.'''
        for cc_type in ('present', 'qa', 'all'):
            self.get_dtm_by_cctype(cc_type)
        return self

In [77]:
%%time
# Build the present/qa/all document-term matrices for unigrams, then persist the corpus.
# `sv` is a helper defined outside this section; the output below shows it reporting "-corp_unigram- saved".
corp_unigram = CCCorpus('unigram').get_dtm_all()
sv('corp_unigram')

HBox(children=(FloatProgress(value=0.0, description='Building PRESENT dtm', max=25652.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Building QA dtm', max=25652.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Building ALL dtm', max=25652.0, style=ProgressStyle(descr…


-corp_unigram- saved
Wall time: 2min 45s


In [105]:
%%time
# Same as the unigram corpus, for bigrams (recorded wall time below: about 29 minutes).
corp_bigram = CCCorpus('bigram').get_dtm_all()
sv('corp_bigram')

HBox(children=(FloatProgress(value=0.0, description='Building PRESENT dtm', max=25652.0, style=ProgressStyle(d…




HBox(children=(FloatProgress(value=0.0, description='Building QA dtm', max=25652.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Building ALL dtm', max=25652.0, style=ProgressStyle(descr…


-corp_bigram- saved
Wall time: 28min 53s


In [None]:
# Same as above, for the merged unigram+bigram ("allgram") counts.
# NOTE(review): this cell has no execution count or output in the saved notebook, yet
# `corp_allgram` is loaded by later cells; confirm it was run and saved.
corp_allgram = CCCorpus('allgram').get_dtm_all()
sv('corp_allgram')

## similarity

In [436]:
# Load the three saved corpora; per the output below, `ld` skips names that already exist in the session.
ld('corp_unigram')
ld('corp_bigram')
ld('corp_allgram')

def get_similarity(corp_list, out_path='data/similarity.feather'):
    '''
    Per-document overlap score between the `present` and `qa` term counts of each corpus.

    For every corpus the score of a document is the dot product of its `dtm_present` and
    `dtm_qa` rows divided by `corp.nvocab`. Note that this is not length-normalized
    (it is not a cosine similarity).

    corp_list: iterable of corpus objects providing `dtm_present`, `dtm_qa`, `nvocab`,
        `docid` and `ngram_type`
    out_path: feather file the result is written to; pass None to skip writing

    Returns a DataFrame with a `docid` column and one `similarity_{ngram_type}` column per corpus.
    '''
    sim_cols = []
    for corp in corp_list:
        overlap = corp.dtm_present.multiply(corp.dtm_qa).sum(axis=1)    # (N, 1) matrix
        # ravel (not squeeze) so a single-document corpus still yields a length-1 vector
        sim = np.asarray(overlap).ravel() / corp.nvocab
        sim_cols.append(pd.Series(
            sim,
            index=pd.Index(corp.docid, name='docid'),
            name=f'similarity_{corp.ngram_type}'))

    # columns are aligned on docid
    sim_df = pd.concat(sim_cols, axis=1).reset_index()
    if out_path is not None:
        sim_df.to_feather(out_path)
    return sim_df
    
# Compute the similarity table for all three corpora (the function also writes it to a feather file).
sim_df = get_similarity([corp_unigram, corp_bigram, corp_allgram])

-corp_unigram- already exists, will not load again!
-corp_bigram- already exists, will not load again!
-corp_allgram- already exists, will not load again!


> Add `sim_ngram` to `cc`

In [386]:
# Attach the similarity scores in `sim_df` (returned by `get_similarity`) to the input table.
cc = pd.read_feather(f'{DATA_DIR}/f_sue_keydevid_car_finratio_transcriptid_text.feather')

# The scores are assigned by position, so first verify that `sim_df` lists the same
# documents in the same order as `cc`.
assert len(cc) == len(sim_df) and (cc['docid'].to_numpy() == sim_df['docid'].to_numpy()).all(), \
    '`sim_df` is not aligned with `cc` on docid'

cc['sim_unigram'] = sim_df['similarity_unigram'].to_numpy()
cc['sim_bigram'] = sim_df['similarity_bigram'].to_numpy()
cc['sim_allgram'] = sim_df['similarity_allgram'].to_numpy()
cc.to_feather(f'{DATA_DIR}/f_sue_keydevid_car_finratio_transcriptid_text_sim.feather')

## filter vocab

Note:
- I didn't use `STOPWORDS`, because our project is very domain-specific and common stopwords might carry important meaning.
- I only remove the least frequent words, i.e., keep the `keep_size` most frequent terms.

In [381]:
# Reload the unfiltered corpora; per the output below, `force=True` makes `ld` load even if the name already exists.
ld('corp_unigram', force=True)
ld('corp_bigram', force=True)
ld('corp_allgram', force=True)

-corp_unigram- loaded as -corp_unigram- (forced)
-corp_bigram- loaded as -corp_bigram- (forced)
-corp_allgram- loaded as -corp_allgram- (forced)


In [382]:
%%time
def filter_corpus(corpus, keep_size):
    '''
    Restrict a corpus to its `keep_size` most frequent terms. Modifies `corpus` IN PLACE.

    Term frequency is the column sum of `corpus.dtm_all`. The vocabulary, its size, the
    term -> column mapping and the three DTMs are all overwritten so they stay consistent.

    corpus: object providing `vocab` (numpy array), `dtm_present`, `dtm_qa` and `dtm_all`
    keep_size: maximum number of terms to keep

    Returns the same `corpus` object; the DTMs are stored in CSC format afterwards.
    '''
    # ravel (not squeeze) so a one-term vocabulary still yields a 1-d vector
    term_freq = np.asarray(corpus.dtm_all.sum(0)).ravel()

    # column indices of the most frequent terms, most frequent first
    keep_vocab_idx = np.argsort(term_freq)[::-1][:keep_size]
    keep_vocab = corpus.vocab[keep_vocab_idx]

    # overwrite the vocabulary and everything derived from it
    corpus.vocab = keep_vocab
    corpus.nvocab = len(keep_vocab)
    corpus.word2idx = {word: idx for idx, word in enumerate(keep_vocab)}

    # overwrite corpus.dtm: keep the same columns, in the same order, in every matrix
    for dtm_name in ('dtm_present', 'dtm_qa', 'dtm_all'):
        setattr(corpus, dtm_name, getattr(corpus, dtm_name).tocsc()[:, keep_vocab_idx])
    print(f'Keep top {len(keep_vocab)} words')

    return corpus

# Number of most frequent terms kept per corpus.
keep_size = 10000

# NOTE: `filter_corpus` modifies its argument and returns it, so each `f_corp_*` name refers to
# the same (now filtered) object as the corresponding `corp_*`; reload the originals to undo.
f_corp_unigram = filter_corpus(corp_unigram, keep_size)
f_corp_bigram = filter_corpus(corp_bigram, keep_size)
f_corp_allgram = filter_corpus(corp_allgram, keep_size)

Keep top 10000 words
Keep top 10000 words
Keep top 10000 words


In [383]:
# Persist the filtered corpora under their `f_corp_*` names.
sv('f_corp_unigram')
sv('f_corp_bigram')
sv('f_corp_allgram')

-f_corp_unigram- saved
-f_corp_bigram- saved
-f_corp_allgram- saved


## tf-idf

In [389]:
# Force-reload the filtered corpora used for the TF-IDF matrices below.
ld('f_corp_unigram', force=True)
ld('f_corp_bigram', force=True)
ld('f_corp_allgram', force=True)

-corp_unigram- loaded as -corp_unigram- (forced)
-corp_bigram- loaded as -corp_bigram- (forced)
-corp_allgram- loaded as -corp_allgram- (forced)


In [392]:
%%time
def get_tfidf(dtm):
    '''Fit a default `TfidfTransformer` on `dtm` and return the transformed matrix.'''
    transformer = TfidfTransformer()
    return transformer.fit_transform(dtm)

# One TF-IDF matrix per (ngram_type, cc_type) pair, each fitted on its own DTM and bound to a
# global named `tfidf_{cc_type}_{ngram_type}` (the export cell looks the matrices up by that name).
for _ngram_type, _corp in (
        ('unigram', f_corp_unigram),
        ('bigram', f_corp_bigram),
        ('allgram', f_corp_allgram)):
    for _cc_type in ('present', 'qa', 'all'):
        globals()[f'tfidf_{_cc_type}_{_ngram_type}'] = get_tfidf(getattr(_corp, f'dtm_{_cc_type}'))

Wall time: 18.5 s


In [410]:
# Export every TF-IDF matrix as a MatrixMarket file for the regressions in R.
# The matrices are looked up in the global namespace by name.
cc_types = ('present', 'qa', 'all')
ngram_types = ('unigram', 'bigram', 'allgram')
for cc_type in cc_types:
    for ngram_type in ngram_types:
        tfidf_name = f'tfidf_{cc_type}_{ngram_type}'
        tfidf_matrix = globals()[tfidf_name]
        scipy.io.mmwrite(f'data/{tfidf_name}.mtx', tfidf_matrix)