## From BNC to Ngram 

### BNC Data:  
https://drive.google.com/file/d/1mKX1DLHDIqKph4e4k1MnYOV3iWtvT7-E/view?usp=sharing

### 1. Extract lines containing id, title, classcode, keywords, sentences from each BNC parts

grep (global search regular RE)
grep是很常見也很常用的命令，它的主要功能是進行字符串數據的比較，然後符合用戶需求的字符串打印出來，但是注意，grep在數據中查找一個字符串時，是以“整行”爲單位進行數據篩選的。

egrep (extended RE)

Reference
https://www.twblogs.net/a/5d26d705bd9eee1e5c84509d

In [None]:
! time ! egrep -o -h \
'(<idno type="bnc">.*?</idno>|<title>.*?</title>|<classCode.*?</classCode>|<keywords>.*?</keywords>|<s n=".*?">|<w c5=".*?" hw=".*?" pos=".*?">.*?</w>|<c c5=".*?">.*?</c>|</s>|<p>|</p>)' \
BNC/Texts/*/*/G*.xml > BNC.G.txt


####        Repeat Step 1 for all sections A, B, C, D, E, F, G, H, J, and K 

 ### 2. Convert sentences to bigram (for all sections A to K, no I)
 ### 2.1 Convert line to word tokens

In [1]:
import re
from pprint import pprint

def line_to_token(line):
    if line.startswith('<s'):
        return ('<s> ', '<s>', '<s>') 
    elif line.startswith('</s'):
        return ('</s>', '</s>', '</s>') 
    elif line.startswith('<w'):
        # <w c5="VVN" hw="discount" pos="VERB">discounted </w>
        match = re.findall('<w c5="(.*?)" hw="(.*?)" pos=".*?">(.*?)</w>', line)
        return (match[0][2].strip(), match[0][0].upper(), match[0][1]) # lemma, tag, word
    elif line.startswith('<c'):
        match = re.findall('<c c5="PUN">(.*?)</c>', line)
        if not match:
            return '??? line'
        return (match[0], match[0], match[0])

def tokens_to_bigram(tokens):
    result = []
    for i in range(len(tokens)-1):
        if i == 1:
            word2tag2lemma2 = [tokens[i][j].lower()+' '+tokens[i+1][j] for j in range(3)]
        else:
            word2tag2lemma2 = [tokens[i][j]+' '+tokens[i+1][j] for j in range(3)]
        if word2tag2lemma2[0][0].isalpha() or word2tag2lemma2[0][0] == '<': 
            result = result + [ '\t'.join(word2tag2lemma2) ]
    return result

### 2.2 Convert token stream to bigram stream

In [None]:
def word_to_bigram(wordfile, bigramfile):
    
    def Batch_to_ngram(batch, fileout):        
        with open(wordfile.format(batch)) as filein:
            lines = filein.readlines()
            for i, line in enumerate(lines):
                if line.startswith('<s'):
                    sent_start = i
                elif line.startswith('</s'):
                    sentence = [line.strip() for line in lines[sent_start:i+1]]
                    tokens = [line_to_token(line) for line in sentence ]
                    #pprint (tokens)
                    bigram = tokens_to_bigram(tokens)
                    print('\n'.join(bigram), file=fileout)
    
    with open(bigramfile, 'w') as fileout:
        for batch in 'ABCDEFGHJK':
            Batch_to_ngram(batch, fileout)
                
word_to_bigram('BNC.{0}.txt', 'BNC.2w.txt')

### 3 Sort and count bigram (word1 word2 \<tab\> count) 

In [5]:
#1 BNC.2w.txt ==> BNC.2w.c.txt
! time sort BNC.2w.txt | uniq -c | \
awk '{ gsub(/^[ ]*/, ""); print }' | awk '{print substr($0, index($0, " ")+1) "\t" $1}' > BNC.2w.c.txt

sort BNC.2w.txt  354.56s user 890.55s system 69% cpu 29:40.28 total
uniq -c  16.02s user 0.63s system 0% cpu 29:40.28 total
awk '{ gsub(/^[ ]*/, ""); print }'  28.34s user 0.15s system 1% cpu 29:40.28 total
awk '{print substr($0, index($0, " ")+1) "\t" $1}' > BNC.2w.c.txt  19.06s user 0.63s system 1% cpu 29:40.28 total


In [2]:
import os
import re
import string

In [3]:
def tokenize(text):
    """
    Input:
    "This is an example.'

    Sample output: 
    ['this', 'is', 'an', 'example', '.']
    """  
    #### [ TODO ] transform text to lower case
    text = text.lower()
    #### [ TODO ] seperate the words by white space
    tokens = text.translate(str.maketrans('','',string.punctuation)).split(' ')
    return tokens
    
from collections import Counter

def calculate_frequency(tokens):
    """
    Input:
    ['this', 'is', 'an', 'example', ...]

    Sample output: 
    {
        'the': 79809, 
        'project': 288,
        ...
    }
    """
    frequency = Counter(tokens)
    return frequency
    #### [ TODO ] 
   


def get_ngram(tokens, n=2):
    """
    Input:
    ['this', 'is', 'an', 'example', ...]

    Sample output: 
    ['this is', 'is an', 'an example', ...]
    """
    #### [TODO] 
    return [' '.join(tokens[i:i+n]) for i in range(len(tokens)-1)]

In [19]:
# Read lang-8 Data
file_path = os.path.join('data','clang8.txt')
lang_bigram = []
lang_accident_bigram = []

#### [ TODO ] generate lang8 unigrams and calculate document frequency of unigram in lang8
with open(file_path, 'r', encoding='UTF-8') as f:
    for line in f:
        tokens = tokenize(line)
        bigram = get_ngram(tokens)
        for word in bigram:
                if ' accident' in word:
                        lang_bigram.append(word)


lang_bigram_counter = calculate_frequency(lang_bigram)

In [21]:
lang_bigram_Rank = {}

#### [ TODO ] Rank unigrams for lang

sorted_lang_bigram = sorted(lang_bigram_counter.items(), key=lambda word: word[1],reverse=True)
j = 1
for i in sorted_lang_bigram:
    lang_bigram_Rank[i[0]] = j
    j = j+1

In [65]:
! egrep -o 'AJ0 NN.*accident[^AZaz].*' BNC.2w.c.txt > BNC.accident.txt

In [24]:
file_path = os.path.join('BNC.accident.txt')

BNC_bigram_counter = {}

with open(file_path, 'r', encoding='UTF-8') as f:
    for line in f:
        tokens =  line.split()
        if tokens[2] + ' ' + tokens[3] in BNC_bigram_counter.keys():
            BNC_bigram_counter[tokens[2] + ' ' + tokens[3]] = int(BNC_bigram_counter[tokens[2] + ' ' + tokens[3]])+ int(tokens[4])
        else:
            BNC_bigram_counter[tokens[2] + ' ' + tokens[3]] = int(tokens[4])


In [25]:
BNC_bigram_Rank = {}

#### [ TODO ] Rank unigrams for BNC

sorted_BNC_bigram = sorted(BNC_bigram_counter.items(), key=lambda word: word[1],reverse=True)
j = 1
for i in sorted_BNC_bigram:
    BNC_bigram_Rank[i[0]] = j
    j = j+1

In [85]:
BNC_bigram_Rank

{'fatal accident': 1,
 'general accident': 2,
 'serious accident': 3,
 'personal accident': 4,
 'nuclear accident': 5,
 'major accident': 6,
 'industrial accident': 7,
 'tragic accident': 8,
 'terrible accident': 9,
 'nasty accident': 10,
 'freak accident': 11,
 'historical accident': 12,
 'minor accident': 13,
 'unfortunate accident': 14,
 'bad accident': 15,
 'similar accident': 16,
 'horrific accident': 17,
 'little accident': 18,
 'recent accident': 19,
 'actual accident': 20,
 'happy accident': 21,
 'pure accident': 22,
 'medical accident': 23,
 'mean accident': 24,
 'hit-and-run accident': 25,
 'particular accident': 26,
 'possible accident': 27,
 'appalling accident': 28,
 'cerebrovascular accident': 29,
 'high accident': 30,
 'inevitable accident': 31,
 'marine accident': 32,
 'other accident': 33,
 'reduce accident': 34,
 'severe accident': 35,
 'catastrophic accident': 36,
 'complete accident': 37,
 'domestic accident': 38,
 'dreadful accident': 39,
 'flying accident': 40,
 '

In [86]:
import pandas as pd
df = pd.DataFrame(columns=['Phrases','Overuse rank/rank', 'BNC rank', 'Lang-8 rank'])
BNC_bigram_Rank_result = {}
lang_bigram_Rank_result= {}

### 找前30個 BNC 及 lang8 裡面都存在的adj. accident
j = 0
for bigram in BNC_bigram_Rank:
    if bigram in lang_bigram_Rank.keys():
        BNC_bigram_Rank_result[bigram] = j+1
        lang_bigram_Rank_result[bigram] = lang_bigram_Rank[bigram]
        j = j+1
        if j == 30:
            break
### 排序 lang8 前30個 adj. accident ranking
for i, element in enumerate(sorted(lang_bigram_Rank_result.items(), key=lambda word: word[1])):
    lang_bigram_Rank_result[element[0]] = i+1

for bigram in BNC_bigram_Rank_result:
    rank_ratio = BNC_bigram_Rank_result[bigram] / lang_bigram_Rank_result[bigram]
    df.loc[len(df)] = [bigram, round(rank_ratio, 2), BNC_bigram_Rank_result[bigram], lang_bigram_Rank_result[bigram]]


In [87]:
df

Unnamed: 0,Phrases,Overuse rank/rank,BNC rank,Lang-8 rank
0,fatal accident,0.12,1,8
1,serious accident,0.4,2,5
2,nuclear accident,3.0,3,1
3,major accident,0.15,4,27
4,industrial accident,0.19,5,26
5,tragic accident,0.6,6,10
6,terrible accident,2.33,7,3
7,historical accident,0.32,8,25
8,minor accident,0.45,9,20
9,unfortunate accident,0.45,10,22


Target output:  
https://drive.google.com/file/d/1xM46aaDIeu4Z0FkikGOcmDoq7u2O47tY/view?usp=sharing