In [261]:
from collections import defaultdict, Counter 
from operator import itemgetter
from pprint import pprint
from akl import akl
import operator
import math

### Step 1: Generate ngrams for a given corpus

In [262]:
def read_ngrams():
    nGrams = defaultdict(int) 
    for line in open('citeseerx.ngms','r'):
        ngram, count = line.split('\t')
        count = int(count)
        nGrams[ngram] += count
                
    return nGrams

In [263]:
nGrams = read_ngrams()

In [264]:
print(nGrams['play-v a-det role-n'])
print(nGrams['play-v a-det important-adj role-n'])

6715
29


### Step 2-1: Generate skip bigrams from ngrams (-5 <= d <= 5) per 100 m. words

In [265]:
def generate_skip_bigrams(nGrams):
    """
    input  nGrams     : (nGram, count)
    output skipBigrams: (skipBigram, position, count)
    """
    skipBigrams = defaultdict(lambda: defaultdict(int))
    
    for nGram, count in nGrams.items():
        terms = nGram.split()
        
        start = terms[0]
        end = terms[-1]
        n = len(terms)
        
        key1 = '%s %s' % (start, end)
        key2 = '%s %s' % (end, start)
        
        skipBigrams[key1][n-1] += count
        skipBigrams[key2][-n+1] += count
        
    return skipBigrams

In [266]:
skipBigrams = generate_skip_bigrams(nGrams)

### Step 2-2: Generate distance counts

In [267]:
def generate_distance_counts(skipBigrams):
    dcSkipBigrams = defaultdict(lambda: defaultdict(lambda: defaultdict(int))) 
    
    for nGram, distanceCounts in skipBigrams.items():
        terms = nGram.split()
        start = terms[0]
        end = terms[1]
        for distance, count in distanceCounts.items():
            dcSkipBigrams[start][end][distance] += count
            
    return dcSkipBigrams

In [268]:
dcSkipBigrams = generate_distance_counts(skipBigrams)

### Step 3: Testing

In [269]:
def collocation_extraction(word):
    global nGrams
    global dcSkipBigrams
    
    C1 = defaultdict(lambda: defaultdict(int))
    final_skipBigrams = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    final_nGrams = []

    sum_freqi = 0
    N = len(dcSkipBigrams[word].keys())
    targetItem = dcSkipBigrams[word].items()
    for end, distanceCounts in targetItem:
        freqi = sum(distanceCounts.values())
        sum_freqi += freqi
        
    avg_freq = sum_freqi / N
    std_freq = 10e-6
        
    # Calculate standard deviation
    for end, distanceCounts in targetItem:
        freqi = sum(distanceCounts.values())
        std_freq += math.sqrt((freqi - avg_freq) ** 2) / N

    strength = 0.0
        
    # Condition 1
    for end, distanceCounts in targetItem:
        freqi = sum(distanceCounts.values())
        strength = (freqi - avg_freq) / std_freq
        if strength > 1:
            C1[end] = distanceCounts

    # Condition 2, 3
    for end, distanceCounts in C1.items():
        avg_pi = sum(distanceCounts.values()) / 10
        Vi = 0.0
        
        # Condition 2
        for distance, count in distanceCounts.items():
            Vi += math.sqrt((count - avg_pi) ** 2) / 10
            
        if Vi <= 10:
            continue
        
        # Condition 3
        best_distance = 0
        best_count = 0
        for distance, count in distanceCounts.items():
            threshold = avg_pi + math.sqrt(Vi)
            if count > threshold:
                if count > best_count:
                    best_distance, best_count = distance, count
            
        final_skipBigrams[(end, best_distance)] = best_count
        
    # Sort in count
    final_skipBigrams = sorted(final_skipBigrams.items(), key=operator.itemgetter(1), reverse=True)    
                
    # Get correponding nGram through skipBigrams
    for distance, count in final_skipBigrams:
        filter_nGrams = defaultdict(lambda: defaultdict(int))
        collocation, length = distance
        for k, v in nGrams.items():
            terms = k.split()
            start = terms[0]
            end = terms[-1]
            
            if length > 0:
                if start == word and end == collocation and len(terms) == length + 1:
                    filter_nGrams[k] = v
            else:
                if start == collocation and end == word and len(terms) == 1 - length:
                    filter_nGrams[k] = v
        
        final_nGrams.append(sorted(filter_nGrams.items(), key=operator.itemgetter(1), reverse=True)[0])
                
    return final_skipBigrams, final_nGrams

In [270]:
def main(word):
    final_skipBigrams, final_nGrams = collocation_extraction(word)
    print('Skip-Bigrams', 'Ngrams')
    for a, b in zip(final_skipBigrams, final_nGrams):
        print(a, b)

In [271]:
main('role-n')

Skip-Bigrams Ngrams
(('in-prep', 1), 62342) ('role-n in-prep', 62342)
(('play-v', -3), 41994) ('play-v an-det important-adj role-n', 15863)
(('important-adj', -1), 23675) ('important-adj role-n', 23675)
(('for-prep', 1), 8790) ('role-n for-prep', 8790)
(('key-adj', -1), 5595) ('key-adj role-n', 5595)
(('crucial-adj', -1), 4615) ('crucial-adj role-n', 4615)
(('central-adj', -1), 4597) ('central-adj role-n', 4597)
(('we-pron', -3), 3690) ('we-pron investigate-v the-det role-n', 892)
(('critical-adj', -1), 3658) ('critical-adj role-n', 3658)
(('major-adj', -1), 3257) ('major-adj role-n', 3257)
(('on-prep', -2), 3242) ('on-prep the-det role-n', 3067)
(('significant-adj', -1), 2994) ('significant-adj role-n', 2994)
(('investigate-v', -2), 2683) ('investigate-v the-det role-n', 2612)
(('have-v', -3), 2624) ('have-v an-det important-adj role-n', 686)
(('by-prep', 2), 2400) ('role-n play-v by-prep', 2380)
(('examine-v', -2), 2330) ('examine-v the-det role-n', 2301)
(('as-prep', 1), 2296) ('rol

In [272]:
%save lab06_collocation_from_ngram.py 261-271

The following commands were written to file `lab06_collocation_from_ngram.py`:
from collections import defaultdict, Counter 
from operator import itemgetter
from pprint import pprint
from akl import akl
import operator
import math
def read_ngrams():
    nGrams = defaultdict(int) 
    for line in open('citeseerx.ngms','r'):
        ngram, count = line.split('\t')
        count = int(count)
        nGrams[ngram] += count
                
    return nGrams
nGrams = read_ngrams()
print(nGrams['play-v a-det role-n'])
print(nGrams['play-v a-det important-adj role-n'])
def generate_skip_bigrams(nGrams):
    """
    input  nGrams     : (nGram, count)
    output skipBigrams: (skipBigram, position, count)
    """
    skipBigrams = defaultdict(lambda: defaultdict(int))
    
    for nGram, count in nGrams.items():
        terms = nGram.split()
        
        start = terms[0]
        end = terms[-1]
        n = len(terms)
        
        key1 = '%s %s' % (start, end)
        key2 = '%s %s' % (en