In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import time
import numpy as np
import math
'''
import sys
sys.path.append('Classes')
from arpy import *
'''
import arpa
from tqdm import tqdm

import torch
import torch.nn as nn



# Prepare CUDA device

In [2]:
# Pick the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# get_device_name() raises on a machine without CUDA, so only query it
# when a GPU is actually available; the availability flag is always printed.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())


GeForce GTX 1060 6GB
True


# Load n-gram counts into a dictionary

In [3]:
file = '../../rsc/dev_set/dev_counts.txt'
#file = '../../rsc/13_14_counts.txt'

# Count the lines up front so tqdm can display a total; the handle used for
# counting is closed immediately instead of being left open.
with open(file, 'r') as counting_handle:
    num_lines = sum(1 for _ in counting_handle)

# Build a lookup from n-gram string to its integer count. Each line is
# tab-separated: the first field is used as the key and the last field is
# parsed as the count.
ngram_dict = {}
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines, position=0, leave=True):
        if not x.strip():
            # A blank (e.g. trailing) line carries no count; int() would fail on it.
            continue
        line = x.split('\t')
        r = int(line[-1])
        ngram_dict[line[0]] = r


100%|██████████| 15718800/15718800 [00:15<00:00, 1043511.07it/s]



# Define the network and load the trained weights

In [4]:
class Net(nn.Module):
    """Small fully connected network: 5 inputs -> 5 -> 5 -> 1 output.

    A sigmoid is applied after every linear layer, so the single output
    value lies strictly between 0 and 1.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is so that saved
        # state dicts (keys fc1.*, fc2.*, fc3.*) continue to load.
        self.fc1 = nn.Linear(5, 5)
        self.fc2 = nn.Linear(5, 5)
        self.fc3 = nn.Linear(5, 1)

    def forward(self, x):
        """Run ``x`` through the three sigmoid-activated linear layers."""
        activation = x
        for layer in (self.fc1, self.fc2, self.fc3):
            activation = torch.sigmoid(layer(activation))
        return activation
    
# Place the model on the device selected earlier (GPU if available, else CPU)
# rather than unconditionally calling .cuda(), which fails without a GPU.
net = Net().to(device)
# map_location lets a checkpoint saved from a GPU be loaded on a CPU-only machine.
net.load_state_dict(torch.load('NN saves/50-50_5inputs', map_location=device))
# Switch to evaluation mode; as the last expression this also displays the model.
net.eval()

Net(
  (fc1): Linear(in_features=5, out_features=5, bias=True)
  (fc2): Linear(in_features=5, out_features=5, bias=True)
  (fc3): Linear(in_features=5, out_features=1, bias=True)
)

In [7]:
file = '../../rsc/dev_set/gt3dc_dev.txt'
#file = '../../rsc/dc_factors.txt'

# dcf[r] holds the discount factor for count r. Entries 0 and 1 stay at zero;
# the file supplies the factors starting at index 2.
dcf = np.zeros(8)

with open(file, 'r') as first_read:
    # The first three lines of the file are skipped before the factors start.
    for _ in range(3):
        first_read.readline()

    i = 2
    for x in tqdm(first_read, total=6, position=0, leave=True):
        if not x.strip():
            # Ignore blank lines (e.g. a trailing newline at the end of the file).
            continue
        # Lines look like "discount <r> <factor>"; the third field is the factor.
        line = x.split(' ')
        print(line)
        dcf[i] = float(line[2])
        i += 1

print(dcf)

100%|██████████| 6/6 [00:00<00:00, 7943.76it/s]

['discount', '2', '0.496574\n']
['discount', '3', '0.643644\n']
['discount', '4', '0.669976\n']
['discount', '5', '0.773884\n']
['discount', '6', '0.797639\n']
['discount', '7', '0.842846\n']
[0.       0.       0.496574 0.643644 0.669976 0.773884 0.797639 0.842846]





# Load and Write ARPA
## NN with 5 inputs

In [9]:
file = '../../rsc/dev_set/smoothedLM_dev.arpa'
#file = '../../rsc/smoothedLM.arpa'

# Count the lines up front so tqdm can display a total; the counting handle
# is closed immediately.
with open(file, 'r') as counting_handle:
    num_lines = sum(1 for _ in counting_handle)

current_ngram_len = 0   # ARPA section currently being read (0 = header, -1 = after \end\)
TH_exceeded = 0         # number of trigrams whose NN estimate was capped at the MLE
count = 0               # number of trigrams rescored by the network

# Both files are managed by the with-statement so they are closed even if a
# lookup fails part-way through. torch.no_grad() is used because this is pure
# inference: no autograd graph needs to be recorded per trigram.
with open(file, 'r') as first_read, \
        open("../../rsc/temp.arpa", "w+") as new_file, \
        torch.no_grad():
    for x in tqdm(first_read, total=num_lines, position=0, leave=True):
        if x == '\\end\\\n':
            current_ngram_len = -1
            new_file.write(x)
        elif x == '\n':
            new_file.write(x)
        elif current_ngram_len < 3:
            # Header, unigram and bigram sections are copied through unchanged.
            new_file.write(x)
        elif current_ngram_len == 3:
            # Trigram entry: "<log10 prob>\t<w1 w2 w3>\n"
            line = x.split('\t')
            # rstrip only removes a newline that is really there, so a final
            # line without a trailing newline does not lose a character.
            trigram = line[1].rstrip('\n')
            r = ngram_dict[trigram]
            if r == 1:
                print('oops')

            if r > 1 and r < 8:  # only smooth values for r < 8
                ngram = trigram.split(' ')
                count += 1

                ###### setup nn input #######
                # Build the whole input row in one go instead of writing five
                # single elements into a device tensor.
                counts = torch.tensor(
                    [[
                        ngram_dict[ngram[0] + ' ' + ngram[1]],  # prefix count
                        ngram_dict[trigram],                    # trigram count
                        ngram_dict[ngram[1] + ' ' + ngram[2]],  # backoff bigram count
                        ngram_dict[ngram[2]],                   # unigram count
                        ngram_dict[ngram[0]],                   # pre-prefix count
                    ]],
                    dtype=torch.float,
                    device=device,
                )

                # The network is fed the reciprocals of the counts.
                nn_input = 1 / counts

                # (1/prefix) / (1/trigram) == trigram count / prefix count,
                # which is used as an upper bound on the NN estimate.
                MLE = nn_input[0][0] / nn_input[0][1]
                nn_prob = net(nn_input)  # get NN value

                if nn_prob > MLE:  # check threshold
                    TH_exceeded += 1
                    nn_prob = MLE

                # multiply with GT dc factor
                nn_prob = nn_prob * dcf[r]

                # Convert the single-element tensor to a Python float explicitly
                # before taking the base-10 logarithm.
                logbase = math.log(nn_prob.item(), 10)
                new_file.write('{:.7f}\t{}\n'.format(logbase, trigram))
            else:
                new_file.write(x)

        # Section markers switch the state for the lines that follow them.
        if x == '\\1-grams:\n':
            current_ngram_len = 1
        if x == '\\2-grams:\n':
            current_ngram_len = 2
        if x == '\\3-grams:\n':
            current_ngram_len = 3

# Avoid a ZeroDivisionError when no trigram fell into the rescored range.
if count:
    print('MLE estimates exceeded: {:.2f}%'.format((TH_exceeded/count)*100))
else:
    print('No trigrams with 1 < r < 8 were rescored')

100%|██████████| 7123624/7123624 [24:03<00:00, 4934.60it/s]   

MLE estimates exceeded: 7.73%





In [9]:
# Show how many trigrams were rescored by the network in the cell above.
print(count)

4829103


# Copy bigrams from the smoothed LM into the output ARPA file

In [7]:
file1, file2 = '../../rsc/temp.arpa', '../../rsc/smoothedLM.arpa'

# Count the lines of the first file so tqdm can display a total; the counting
# handle is closed immediately.
with open(file1, 'r') as counting_handle:
    num_lines = sum(1 for _ in counting_handle)

# Merge the two ARPA files line by line: bigram entries are taken from file2,
# everything else (headers, unigrams, trigrams) from file1. This relies on
# both files listing the same entries in the same order.
with open(file1, 'r') as first_read, \
        open(file2, 'r') as second_read, \
        open('../../rsc/output_LM.arpa', "w+") as new_file:
    for raw1 in tqdm(first_read, total=num_lines, position=0, leave=True):
        raw2 = second_read.readline()
        line1 = raw1.split('\t')

        # Section markers start with a backslash, "ngram N=..." header lines
        # start with 'n', and separators are bare newlines. Anything else that
        # contains a tab is treated as an n-gram entry.
        is_entry = (
            line1[0][0] != '\\'
            and line1[0] != '\n'
            and line1[0][0] != 'n'
            and len(line1) > 1
        )

        if is_entry and len(line1[1].split(' ')) == 2:
            # Bigram: write the smoothed value from the second file.
            if raw2 == '':
                # Without this check a short second file would silently
                # produce empty bigram lines in the output.
                raise ValueError(
                    '{} ended before {}; the files are not aligned'.format(file2, file1)
                )
            new_file.write(raw2)
        else:
            # Unigrams, trigrams and all non-entry lines are copied verbatim.
            # Writing the raw line avoids the stray trailing tab that
            # re-joining the fields produced on a final line without a newline.
            new_file.write(raw1)

100%|██████████| 12421255/12421255 [00:36<00:00, 341903.72it/s]
