In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import time
import numpy as np
'''
import sys
sys.path.append('Classes')
from arpy import *
'''
import arpa
from tqdm import tqdm
import pickle
import torch

# Input:
### prefix count
### ngram count
### n-1gram count
### unigram count

# Target 
### Pgt


## Read counts into dictionary first

In [12]:
# Count file: each line is split on tabs; the first field is the n-gram
# string and the last field is its integer count.
file = '../../rsc/train_counts.txt'

# First pass: count the lines so tqdm can display a total.
# The context manager closes the handle (the original leaked it).
with open(file, 'r') as line_counter:
    num_lines = sum(1 for _ in line_counter)

# Second pass: map every n-gram string to its count.
ngram_dict = {}
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines):
        line = x.split('\t')
        # int() tolerates the trailing newline on the last field.
        r = int(line[-1])
        ngram_dict[line[0]] = r


100%|██████████| 45918515/45918515 [00:46<00:00, 979553.46it/s] 


## check cuda


In [None]:
# Probe CUDA once and reuse the answer: pick the first GPU when available,
# otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")

# get_device_name raises on machines without CUDA, so only query it when a
# GPU is actually present (the original called it unconditionally).
if cuda_available:
    print(torch.cuda.get_device_name(0))
print(cuda_available)

if device.type == 'cuda':
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**2,1), 'MB')
    # memory_reserved replaces the deprecated memory_cached.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**2,1), 'MB')

## Count trigrams to size the tensor
### i.e. the number of trigrams to be smoothed

In [7]:
# Count how many lines of the count file pass the trigram filter below.
# The result is printed and stored in num_trigrams.
file = '../../rsc/train_counts.txt'

# First pass: line total for the progress bar (handle closed by `with`).
with open(file, 'r') as line_counter:
    num_lines = sum(1 for _ in line_counter)

num_trigrams = 0
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines):
        line = x.split('\t')
        r = int(line[-1])
        ngram = line[0].split(' ')
        tuple_size = len(ngram)

        # Keep 3-token n-grams whose count is below 8 and not exactly 1.
        if tuple_size == 3 and r < 8 and r != 1:
            num_trigrams += 1

print(num_trigrams)

100%|██████████| 45918515/45918515 [00:43<00:00, 1062493.39it/s]

8813319





## instantiate input tensors on gpu

In [8]:
# Allocate the input tensor directly on the selected device, zero-filled:
# 4 rows by num_trigrams columns, one column per selected trigram.
input_shape = (4, num_trigrams)
inputs = torch.zeros(input_shape, dtype=torch.float, device=device)
print(inputs)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [9]:
#print(ngram_dict["\' A P."])

## fill input tensors

In [None]:
# Fill the 4 x num_trigrams input tensor: for every trigram that passes the
# filter, write its four count features into the next free column.
file = '../../rsc/train_counts.txt'

# First pass: line total for the progress bar (handle closed by `with`).
with open(file, 'r') as line_counter:
    num_lines = sum(1 for _ in line_counter)

count = 0  # column index of the next trigram to write
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines):
        line = x.split('\t')
        r = int(line[-1])
        ngram = line[0].split(' ')
        tuple_size = len(ngram)

        # Same filter as the counting cell: 3 tokens, count below 8 and not 1.
        if tuple_size == 3 and r < 8 and r != 1:
            inputs[0][count] = ngram_dict[ngram[0] + ' ' + ngram[1]]  # prefix count
            inputs[1][count] = ngram_dict[line[0]]                    # trigram count
            inputs[2][count] = ngram_dict[ngram[1] + ' ' + ngram[2]]  # backoff bigram count
            inputs[3][count] = ngram_dict[ngram[2]]                   # unigram count

            # Advance to the next column. In the original this increment was
            # swallowed by a triple-quoted debug string, so every trigram
            # overwrote column 0 and the rest of the tensor stayed zero.
            count += 1

# Every column must have been written exactly once.
assert count == inputs.shape[1], (
    'filled %d columns but the tensor has %d' % (count, inputs.shape[1])
)

## saving input tensors using pytorch

In [None]:
# Persist the input tensor; the trigram count is appended to the file name.
input_path = '../../pickles/train_input_' + str(num_trigrams)
torch.save(inputs, input_path)

# !!! Restart the kernel here to prevent an out-of-memory crash !!!

## Count trigrams to size the tensor again (after the kernel restart)
### i.e. the number of trigrams to be smoothed

In [2]:
# Recount the lines that pass the trigram filter.
# The result is printed and stored in num_trigrams.
file = '../../rsc/train_counts.txt'

# First pass: line total for the progress bar (handle closed by `with`).
with open(file, 'r') as line_counter:
    num_lines = sum(1 for _ in line_counter)

num_trigrams = 0
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines, position=0, leave=True):
        line = x.split('\t')
        r = int(line[-1])
        ngram = line[0].split(' ')
        tuple_size = len(ngram)

        # Keep 3-token n-grams whose count is below 8 and not exactly 1.
        if tuple_size == 3 and r < 8 and r != 1:
            num_trigrams += 1

print(num_trigrams)

100%|██████████| 45918515/45918515 [00:42<00:00, 1080963.18it/s]

8813319





In [3]:
# Probe CUDA once and reuse the answer: pick the first GPU when available,
# otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")

# get_device_name raises on machines without CUDA, so only query it when a
# GPU is actually present (the original called it unconditionally).
if cuda_available:
    print(torch.cuda.get_device_name(0))
print(cuda_available)

# Target tensor: 1 row by num_trigrams columns, zero-filled on the device.
outputs = torch.zeros(1, num_trigrams, dtype=torch.float, device=device)
print(outputs)

if device.type == 'cuda':
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**2,1), 'MB')
    # memory_reserved replaces the deprecated memory_cached.
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**2,1), 'MB')

GeForce GTX 1060 6GB
True
tensor([[0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
Memory Usage:
Allocated: 34.0 MB
Cached:    36.0 MB


## fill output tensors
### load in smoothed arpa

In [4]:
# Load the smoothed language model from its ARPA file; later cells query
# it through lm[0].
arpa_path = '../../rsc/smoothedLM.arpa'
lm = arpa.loadf(arpa_path)
# NOTE(review): "ABOUT CERTAIN AREAS" looks like a sample trigram to query — confirm.

In [8]:
# Spot checks: probabilities the loaded model assigns to two trigrams.
smoothed_model = lm[0]

p_benni = smoothed_model.p("<s> AND BENNI")
print('Pgt(BENNI | <s>, AND) = ' + str(p_benni))

p_ho = smoothed_model.p("<s> GUNG HO")
print('Pgt(\'HO\' | \'<s>\', \'GUNG\') = ' + str(p_ho))

Pgt(BENNI | <s>, AND) = 1.228550463195926e-05
Pgt('HO' | '<s>', 'GUNG') = 0.6563238707493194


## Get smoothed probs for output

In [13]:
# Fill the 1 x num_trigrams target tensor: for every trigram that passes the
# filter, store the probability returned by the loaded model.
file = '../../rsc/train_counts.txt'

# First pass: line total for the progress bar (handle closed by `with`).
with open(file, 'r') as line_counter:
    num_lines = sum(1 for _ in line_counter)

count = 0  # column index of the next trigram to write
with open(file, 'r') as first_read:
    for x in tqdm(first_read, total=num_lines, position=0, leave=True):
        line = x.split('\t')
        r = int(line[-1])
        ngram = line[0].split(' ')
        tuple_size = len(ngram)

        # Same filter as the counting cell: 3 tokens, count below 8 and not 1.
        if tuple_size == 3 and r < 8 and r != 1:
            # Pgt: model probability of the full trigram string.
            outputs[0][count] = lm[0].p(line[0])
            count += 1

100%|██████████| 45918515/45918515 [07:43<00:00, 99167.15it/s] 


In [14]:
# Show the filled target tensor (torch abbreviates long tensors when printing).
print(outputs)

tensor([[0.2648, 0.2461, 0.2461,  ..., 0.2883, 0.0623, 0.1158]],
       device='cuda:0')


## saving output tensors using pytorch

In [15]:
# Persist the target tensor; the trigram count is appended to the file name.
output_path = '../../pickles/train_output_' + str(num_trigrams)
torch.save(outputs, output_path)

# Moving on to next Notebook:
## Training NN on counts