In [24]:
import pymongo
import string
import nltk
import numpy as np
from collections import defaultdict

client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['vatican']
dataset = db['wordsDataset']

In [25]:
def preprocess(s):
    out = ""
    for c in s.lower():
        if c in CHAR_INDEX.keys():
            out += c
        else:
            out += '#'
    return "".join(out)

def string_to_matrix(s, n=2):
    z = preprocess(s)
    C = np.zeros((len(CHAR_INDEX), len(CHAR_INDEX)))
    for a, b in nltk.ngrams(z, n=n):
        C[CHAR_INDEX[a], CHAR_INDEX[b]] += 1
    C /= C.max()
    return C



In [27]:
word = list(dataset.find())

In [28]:
wList = []
wCorrectList = []
for w in word: 
    if w['word'] not in wList: 
        wList.append(w['word'])
    if w['word'] not in wCorrectList:
        wCorrectList.append(w['word'])
    for o in w['occurrences']:
        if o not in wList: 
            wList.append(o)

In [31]:
print(len(wCorrectList))
print(len(wList))

14417
16939


In [32]:
chars = ''

for w in wList: 
    for c in w: 
        if chars.count(c) == 0: 
            chars = chars+(c)

In [38]:
CHAR_INDEX = dict((c, i) for i, c in enumerate(chars))
print(chars)

solenitàvj'd\mczfruhbxwqykgp#8192!


In [34]:
training_data, testing_data = [], []

for w in word:
    training_list, testing_list = [], []
    
    wrd = w['word']
    for i, sw in enumerate(w['occurrences']):
        if(wrd == sw): 
            training_list.append(sw)
        else:
            if(i <= 2):
                training_list.append(sw)
            else: 
                testing_list.append(sw)
    
    training_data.append((training_list, wrd))
    testing_data.append((testing_list, wrd))

In [35]:


print('TRAINING DATA: ', sum([len(x[0]) for x in training_data]))

training_stats = defaultdict(lambda: 0)

for t in training_data: 
    training_stats[len(t[0])] += 1

print(sorted(training_stats.items()))

print('TESTING DATA: ', sum([len(x[0]) for x in testing_data]))

testing_stats = defaultdict(lambda: 0)

for t in testing_data:
    testing_stats[len(t[0])] += 1

print(sorted(testing_stats.items()))

TRAINING DATA:  16029
[(1, 13429), (2, 805), (3, 330)]
TESTING DATA:  946
[(0, 14399), (1, 54), (2, 27), (3, 18), (4, 12), (5, 7), (6, 5), (7, 6), (8, 6), (9, 6), (10, 4), (11, 2), (12, 2), (13, 3), (14, 1), (15, 1), (16, 1), (17, 1), (23, 1), (27, 1), (28, 1), (37, 1), (38, 1), (39, 1), (41, 1), (43, 1), (64, 1)]


## Network Setup

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(42)

<torch._C.Generator at 0x2b3baed2df0>

In [39]:
N_LABELS = len(wCorrectList)
V = len(CHAR_INDEX) * len(CHAR_INDEX)
LABEL_INDEX = dict((l, i) for i, l in enumerate(wCorrectList))

In [40]:
class Simple2Gram(nn.Module):
    def __init__(self, num_labels, size):
        super(Simple2Gram, self).__init__()
        self.linear = nn.Linear(size, num_labels)
    
    def forward(self, vec):
        return F.log_softmax(self.linear(vec), dim=1)

In [41]:
def vector(s, n=2):
    vec = torch.tensor(string_to_matrix(s, n=n)).float()
    return vec.view(1, -1)

def target(label):
    return torch.LongTensor([LABEL_INDEX[label]])

In [42]:
model = Simple2Gram(N_LABELS, V)

### Model Parameters

In [43]:
for param in model.parameters(): 
    print(param)

Parameter containing:
tensor([[ 0.0218,  0.0237, -0.0067,  ...,  0.0085, -0.0187,  0.0139],
        [-0.0244,  0.0246,  0.0277,  ...,  0.0193, -0.0028, -0.0009],
        [-0.0021,  0.0238,  0.0162,  ...,  0.0167, -0.0174,  0.0055],
        ...,
        [-0.0094, -0.0076,  0.0134,  ..., -0.0046, -0.0160, -0.0158],
        [ 0.0189, -0.0012,  0.0063,  ...,  0.0042,  0.0211, -0.0120],
        [-0.0124, -0.0231,  0.0199,  ..., -0.0205, -0.0120, -0.0228]],
       requires_grad=True)
Parameter containing:
tensor([-0.0044, -0.0169, -0.0175,  ...,  0.0158,  0.0195, -0.0231],
       requires_grad=True)


In [44]:
with torch.no_grad():
    sample = training_data[0]
    vec = vector(sample[0][0], n = 2)
    log_probs = model(vec)

In [45]:
np.shape(log_probs)

torch.Size([1, 14417])

### Training

In [46]:
loss = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [55]:
from tqdm.notebook import tqdm

for epoch in tqdm(range(5)): #50
    for samples, label in tqdm(training_data):
        for instance in samples:
            model.zero_grad()
            vec = vector(instance)
            tar = target(label)
            log_probs = model(vec)
            L = loss(log_probs, tar)
            L.backward()
            optimizer.step()

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14564.0), HTML(value='')))

  C /= C.max()






KeyboardInterrupt: 

In [48]:
with torch.no_grad():
    for test, label in testing_data:
        for word in test: 
            vec = vector(word, n=2)
            log_probs = model(vec)
            print('Input String', word)
            
            prediction = np.argmax(log_probs.numpy())
            print(prediction)
            print('Guess:', wCorrectList[prediction], '\n')
        

Input String solennita'̀
8757
Guess: fuggevole 

Input String solennitat
7460
Guess: quemadmodum 

Input String derlla
274
Guess: parecchi 

Input String delld
6000
Guess: vulnerata 

Input String dellua
6465
Guess: porremo 

Input String dehlla
4744
Guess: prodigiosa 

Input String dellaa
6769
Guess: legittimo 

Input String deula
12137
Guess: succedutesi 

Input String detlla
4114
Guess: parecchie 

Input String zella
12458
Guess: accreditazione 

Input String delbla
45
Guess: suo 

Input String dellea
6000
Guess: vulnerata 

Input String deltla
4736
Guess: convivio 

Input String fella
11404
Guess: profusione 

Input String dellx
10948
Guess: occupazione 

Input String cdella
6769
Guess: legittimo 

Input String bdella
4114
Guess: parecchie 

Input String dellza
269
Guess: impedisce 

Input String dewlla
4114
Guess: parecchie 

Input String dellda
703
Guess: applicate 

Input String dqlla
1702
Guess: quibus 

Input String delly
6769
Guess: legittimo 

Input String delma
4450
Guess: 

Input String re
6403
Guess: all\'inizio 

Input String lp
6463
Guess: ribera 

Input String ce
12456
Guess: porti 

Input String lce
9959
Guess: viareggio 

Input String ble
2006
Guess: praecinit 

Input String lye
2724
Guess: entusiasti 

Input String ls
10546
Guess: praefecti 

Input String lj
5196
Guess: privato 

Input String ne
4475
Guess: fenomeni 

Input String nle
13031
Guess: analoghi 

Input String ltz
13063
Guess: maniera 

Input String ld
2969
Guess: #2# 

Input String lr
76
Guess: magnifiche 

Input String lle
12458
Guess: accreditazione 

Input String te
12488
Guess: coadiuvati 

Input String qucsto
9440
Guess: patrizio 

Input String quessto
6906
Guess: drizzare 

Input String quusto
802
Guess: paterno 

Input String quepsto
14109
Guess: tutelata 

Input String quewstb
5638
Guess: ricercarne 

Input String lal
13031
Guess: analoghi 

Input String axl
9284
Guess: favorita 

Input String apl
13675
Guess: reazioni 

Input String aw
13319
Guess: l\'intenzione 

Input String 

Input String cwhe
9041
Guess: factus 

Input String cjhe
7685
Guess: disponente 

Input String chx
13127
Guess: priva 

Input String chfe
5074
Guess: consumato 

Input String chxe
11623
Guess: rendi 

Input String xhe
4647
Guess: prese 

Input String ache
7035
Guess: definisce 

Input String chbe
1093
Guess: xiii 

Input String cce
10597
Guess: concordiae 

Input String cyhe
12926
Guess: reciteremo 

Input String chle
8927
Guess: cosparse 

Input String chae
2144
Guess: coronam 

Input String kche
6224
Guess: farci 

Input String hche
14289
Guess: studiosi 

Input String czhe
1850
Guess: iter 

Input String dhe
8114
Guess: rappresentano 

Input String khe
7859
Guess: vuoi 

Input String cht
6876
Guess: attaccò 

Input String chce
9959
Guess: viareggio 

Input String chh
14380
Guess: atleti 

Input String jhe
7859
Guess: vuoi 

Input String uhe
630
Guess: distanza 

Input String fche
11623
Guess: rendi 

Input String cson
1413
Guess: alcune 

Input String can
13378
Guess: '' 

Input St

Input String terhra
2334
Guess: santificatrice 

Input String ug
14380
Guess: atleti 

Input String wun
13080
Guess: of 

Input String on
498
Guess: prelati 

Input String uyn
14302
Guess: dell\xe2\x8#\x99enciclica 

Input String sun
7154
Guess: dimore 

Input String uxn
70
Guess: con 

Input String uc
5196
Guess: privato 

Input String tn
9232
Guess: minacciarne 

Input String cn
7154
Guess: dimore 

Input String ut
8833
Guess: traduzioni 

Input String um
2301
Guess: nonostante 

Input String yn
7852
Guess: scalvinoni 

Input String kun
12047
Guess: vanno 

Input String xun
8329
Guess: tenersi 

Input String tutpto
10412
Guess: antistitibus 

Input String ututto
5029
Guess: accanimento 

Input String tbutto
5029
Guess: accanimento 

Input String tutte
2751
Guess: turchia 

Input String allia
12886
Guess: finestre 

Input String abla
10521
Guess: negotium 

Input String clla
11553
Guess: accoglimento 

Input String acla
7179
Guess: caso 

Input String alyla
9021
Guess: concipiant 

In

Input String dq
2105
Guess: pater 

Input String matrth
9055
Guess: iucundissimus 

Input String mattg
14089
Guess: attingono 

Input String cristsiano
9584
Guess: animas 

Input String abbizamo
13157
Guess: vada 

Input String serumo
498
Guess: prelati 

Input String kiii
5349
Guess: inginocchiati 

Input String iimi
784
Guess: compatte 

Input String tiii
5349
Guess: inginocchiati 

Input String miii
12458
Guess: accreditazione 

Input String itii
1287
Guess: diffusa 

Input String figloiuoli
5072
Guess: usciamo 

Input String amex
712
Guess: venezia 

Input String ammey
3194
Guess: prosegue 

Input String namen
1127
Guess: cambiano 

Input String cubm
13296
Guess: generalità 

Input String cwum
2053
Guess: cfr 

Input String cfy
10570
Guess: lasciando 

Input String ifr
11605
Guess: all\'onore 

Input String ccfr
3468
Guess: pecorella 

Input String hcfr
587
Guess: ameremmo 

Input String zfr
13435
Guess: dimostri 

Input String ffr
11605
Guess: all\'onore 

Input String cfn
3248
G

In [20]:
import time

ts = time.time()

print(ts)

1625271490.7319815
