# Find specific collocations
Find all collocations whose two words stand in a VB-NN (verb-noun) relation.
Read the file *citeseerx100000.tag.txt*, in which every word is POS-tagged (each token has the form `word/TAG`).

In [None]:
from nltk.tokenize import  wordpunct_tokenize 
from nltk.corpus import stopwords 
from collections import defaultdict, Counter
from nltk.tokenize import  wordpunct_tokenize 

from progressbar import AnimatedMarker, Bar, BouncingBar, ETA, \
    AdaptiveETA, FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer

# NLTK's English stop-word list, held as a set for fast membership tests.
eng_stopwords = set(stopwords.words('english'))
# Any word containing one of these characters invalidates its n-gram.
eng_symbols = '{}"\'()[].,:;+!?-*/&|<>=~$'
# Collapse fine-grained verb / noun POS tags into the coarse classes VB and NN.
# NOTE(review): 'VBD' and 'NNPS' are not mapped here -- confirm whether that
# omission is intentional.
TAGGING = dict.fromkeys(('VBZ', 'VBN', 'VBP', 'VB', 'VBG'), 'VB')
TAGGING.update(dict.fromkeys(('NNP', 'NNS', 'NN'), 'NN'))

def ngram_is_valid(ngram):
    """Tell whether an n-gram is worth counting.

    ``ngram`` is a ``(words, tags)`` pair; only the words are inspected.
    The n-gram is rejected when

    * its first or last word is an English stop word,
    * its first or last word contains a decimal digit, or
    * any of its words contains a character from ``eng_symbols``.

    Returns True when none of these apply, False otherwise.
    """
    tokens = ngram[0]
    head, tail = tokens[0], tokens[-1]

    if head in eng_stopwords or tail in eng_stopwords:
        return False

    for digit in '0123456789':
        if digit in head or digit in tail:
            return False

    for token in tokens:
        for symbol in eng_symbols:
            if symbol in token:
                return False

    return True

def to_ngrams(unigrams, length):
    """Return every contiguous n-gram of ``length`` tokens from ``unigrams``.

    Args:
        unigrams: iterable of ``(word, tag)`` pairs for one sentence.
        length: number of tokens per n-gram.

    Returns:
        A list of ``(words, tags)`` pairs in sentence order, where ``words``
        and ``tags`` are tuples of ``length`` items each. The list is empty
        when the sentence has fewer than ``length`` tokens (including an
        empty sentence) or when ``length`` is not positive.
    """
    if length <= 0:
        return []
    unigrams = list(unigrams)
    # Transpose the sentence once. Subscripting the result of zip() only
    # works on Python 2 and fails outright on an empty sentence.
    words = tuple(token[0] for token in unigrams)
    tags = tuple(token[1] for token in unigrams)
    # range() is empty when the sentence is shorter than the window.
    return [(words[start:start + length], tags[start:start + length])
            for start in range(len(unigrams) - length + 1)]

# n-gram length -> Counter of every valid (words, tags) n-gram of that length.
ngram_counts = defaultdict(Counter)
widgets      = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
pbar         = ProgressBar(widgets = widgets)
# NOTE(review): max_distance is not assigned in any cell visible here; it must
# already be defined (largest word distance to consider) before this cell runs.
with open('citeseerx100000.tag.txt') as text_file:
    for line in pbar(text_file):
        words = []
        # split() without an argument also drops the empty tokens produced by
        # repeated blanks, tabs or blank lines.
        for token in line.split():
            # Split on the LAST slash so a word that itself contains '/'
            # (e.g. "and/or/CC") keeps its real tag.
            word, slash, tag = token.rpartition('/')
            if not slash:
                # Untagged token: keep it in place with an empty tag so the
                # distances between its neighbours stay correct.
                word, tag = token, ''
            words.append((word.lower(), TAGGING.get(tag, tag)))
        if not words:
            continue
        # An n-gram of n tokens encodes a word pair at distance n - 1.
        for n in range(2, max_distance + 2):
            ngram_counts[n].update(filter(ngram_is_valid, to_ngrams(words, n)))
    pbar.finish()
            
# print ngram_counts[5]

In [None]:
import pprint
# base word -> (partner word, 'TAG-TAG') -> Counter {signed distance: frequency}.
skip_bigram_info = defaultdict(lambda: defaultdict(Counter))
for dist in range(2, max_distance + 2):
    print('Processing skip bigram for distance {}...'.format(dist))
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval = len(ngram_counts[dist])).start()
    for index, (ngram, count) in enumerate(ngram_counts[dist].items(), 1):
        words, tags = ngram[0], ngram[1]
        first_word, last_word = words[0], words[-1]
        first_tag, last_tag = tags[0], tags[-1]
        # Seen from the first word, the last word lies dist - 1 positions ahead.
        skip_bigram_info[first_word][(last_word, first_tag + '-' + last_tag)][dist - 1] += count
        # Seen from the last word, the first word lies dist - 1 positions behind.
        skip_bigram_info[last_word][(first_word, last_tag + '-' + first_tag)][1 - dist] += count
        pbar.update(index)
    pbar.finish()

In [None]:
from itertools import groupby
import numpy as np
# Statistics table with two kinds of keys:
#   (word, collocate, 'freq' | 'spread')  -- per word / collocate pair
#   (word, 'avg_freq' | 'dev')            -- per base word
skip_bigram_abc = defaultdict(lambda: 0)
pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval = len(skip_bigram_info)).start()
# Number of relative positions considered: max_distance on either side.
positions = 2 * max_distance
for index, (word, vals) in enumerate(skip_bigram_info.items(), 1):
    count = []
    for coll, val in vals.items():
        c = list(val.values())
        freq = sum(c)
        # float() forces true division; on Python 2 the plain int / int form
        # truncated the mean and thereby corrupted the spread.
        c_bar = float(freq) / positions
        skip_bigram_abc[(word, coll, 'freq')] = freq
        # Variance of the counts over all positions, including the positions
        # with zero occurrences: sum((x - c_bar)**2) / positions, expanded.
        skip_bigram_abc[(word, coll, 'spread')] = (
            sum(x**2 for x in c) - 2*c_bar*freq + positions*c_bar**2
        ) / positions
        count.append(freq)
    # Mean and standard deviation of the collocate frequencies of this word.
    skip_bigram_abc[(word, 'avg_freq')] = np.mean(count)
    skip_bigram_abc[(word, 'dev')] = np.std(count)
    pbar.update(index)
pbar.finish()

# pprint.pprint(skip_bigram_abc)

In [None]:
import math
def skip_bigram_filter(skip_bigram_info, skip_bigram_abc):
    """Select collocation candidates by strength and spread.

    For every base word and each of its collocates:

    * strength = (freq - avg_freq) / dev, or 0 when dev is below 1e-6;
      collocates with strength < k0 are dropped.
    * collocates with spread < U0 are dropped.
    * peak = mean count per position + k1 * sqrt(spread).

    The thresholds ``k0``, ``U0``, ``k1`` and the window size
    ``max_distance`` are read from module-level globals.

    Args:
        skip_bigram_info: base word -> (collocate, POS pair) ->
            Counter {signed distance: count}.
        skip_bigram_abc: statistics table holding the 'freq', 'spread',
            'avg_freq' and 'dev' entries for those words and collocates.

    Returns:
        A list of tuples ``(word, collocate, POS pair, distance, strength,
        spread, peak, count)``, one per distance of every surviving collocate.
    """
    cc = []
    print('Filtering...')
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval = len(skip_bigram_info)).start()
    positions = 2 * max_distance
    for index, (word, vals) in enumerate(skip_bigram_info.items(), 1):
        f = skip_bigram_abc[(word, 'avg_freq')]
        dev = skip_bigram_abc[(word, 'dev')]
        for coll, val in vals.items():
            # A (near-)zero deviation would blow up the z-score; treat the
            # collocate as having no strength instead.
            if dev < 1E-6:
                strength = 0
            else:
                strength = (skip_bigram_abc[(word, coll, 'freq')] - f) / dev
            if strength < k0:
                continue
            spread = skip_bigram_abc[(word, coll, 'spread')]
            if spread < U0:
                continue
            # float() forces true division; int / int truncates on Python 2.
            c_bar = float(sum(val.values())) / positions
            # max() guards against a spread that rounding pushed just below 0.
            peak = c_bar + k1 * math.sqrt(max(spread, 0.0))
            for dist, count in val.items():
                # NOTE(review): this test is always true, so `peak` is only
                # reported, never used to filter. Confirm whether
                # `count >= peak` was intended.
                if count >= 0:
                    cc.append((word, coll[0], coll[1], dist, strength, spread, peak, count))
        pbar.update(index)
    pbar.finish()
    return cc

# Run the filter; each entry of cc is one
# (base word, collocate, POS pair, distance, strength, spread, peak, count) tuple.
cc = skip_bigram_filter(skip_bigram_info, skip_bigram_abc)
# print cc

In [None]:
import pandas
# Tabulate the filtered collocations: one row per
# (base word, collocate, POS pair, distance), sorted by that index.
collocations_df = (
    pandas.DataFrame(
        cc,
        columns = ['base word', 'collocate', 'POS', 'distance', 'strength', 'spread', 'peak', 'p']
    )
    .set_index(['base word', 'collocate', 'POS', 'distance'])
    .sort_index()
)

# Show VB-NN relation collocations

In [None]:
# Keep the rows whose POS index level is 'VB-NN' and show the 100 strongest.
collocations_df[
    collocations_df.index.get_level_values('POS') == 'VB-NN'
].sort_values(by = 'strength', ascending=False).head(100)