In [1]:
import pymongo
import string
import nltk
import numpy as np
from collections import defaultdict

client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['vatican']
dataset = db['wordsDataset']

In [2]:
def preprocess(s):
    out = ""
    for c in s.lower():
        if c in CHAR_INDEX.keys():
            out += c
        else:
            out += '#'
    return "".join(out)

def string_to_matrix(s, n=2):
    z = preprocess(s)
    C = np.zeros((len(CHAR_INDEX), len(CHAR_INDEX)))
    for a, b in nltk.ngrams(z, n=n):
        C[CHAR_INDEX[a], CHAR_INDEX[b]] += 1
    C /= C.max()
    return C



In [3]:
word = list(dataset.find())

In [4]:
wList = []

for w in word: 
    if w['word'] not in wList: 
        wList.append(w['word'])
    for o in w['occurrences']:
        if o not in wList: 
            wList.append(o)

In [5]:
chars = ''

for w in wList: 
    for c in w: 
        if chars.count(c) == 0: 
            chars = chars+(c)

In [6]:
CHAR_INDEX = dict((c, i) for i, c in enumerate(chars))
wordList = [x['word'] for x in word]

In [7]:
training_data, testing_data = [], []

for w in word:
    training_list, testing_list = [], []
    
    wrd = w['word']
    for i, sw in enumerate(w['occurrences']):
        if(wrd == sw): 
            training_list.append(sw)
        else:
            if(i <= 2):
                training_list.append(sw)
            else: 
                testing_list.append(sw)
    
    training_data.append((training_list, wrd))
    testing_data.append((testing_list, wrd))

In [8]:


print('TRAINING DATA: ', sum([len(x[0]) for x in training_data]))

training_stats = defaultdict(lambda: 0)

for t in training_data: 
    training_stats[len(t[0])] += 1

print(sorted(training_stats.items()))

print('TESTING DATA: ', sum([len(x[0]) for x in testing_data]))

testing_stats = defaultdict(lambda: 0)

for t in testing_data:
    testing_stats[len(t[0])] += 1

print(sorted(testing_stats.items()))

TRAINING DATA:  16414
[(1, 13457), (2, 832), (3, 431)]
TESTING DATA:  2562
[(0, 14469), (1, 60), (2, 26), (3, 22), (4, 16), (5, 11), (6, 11), (7, 10), (8, 3), (9, 16), (10, 4), (11, 8), (12, 3), (13, 3), (14, 5), (15, 3), (18, 6), (19, 7), (20, 1), (25, 6), (26, 1), (30, 5), (34, 5), (35, 5), (45, 5), (47, 1), (51, 1), (54, 5), (59, 2)]


## Network Setup

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(42)

<torch._C.Generator at 0x212b1eb7c10>

In [10]:
N_LABELS = len(wordList)
V = len(CHAR_INDEX) * len(CHAR_INDEX)
LABEL_INDEX = dict((l, i) for i, l in enumerate(wordList))

In [11]:
class Simple2Gram(nn.Module):
    def __init__(self, num_labels, size):
        super(Simple2Gram, self).__init__()
        self.linear = nn.Linear(size, num_labels)
    
    def forward(self, vec):
        return F.log_softmax(self.linear(vec), dim=1)

In [12]:
def vector(s, n=2):
    vec = torch.tensor(string_to_matrix(s, n=n)).float()
    return vec.view(1, -1)

def target(label):
    return torch.LongTensor([LABEL_INDEX[label]])

In [13]:
model = Simple2Gram(N_LABELS, V)

### Model Parameters

In [14]:
for param in model.parameters(): 
    print(param)

Parameter containing:
tensor([[ 0.0207,  0.0224, -0.0063,  ..., -0.0177, -0.0194,  0.0227],
        [ 0.0230,  0.0017,  0.0254,  ..., -0.0230,  0.0270,  0.0233],
        [ 0.0168, -0.0027,  0.0096,  ..., -0.0033, -0.0224, -0.0217],
        ...,
        [-0.0019,  0.0177, -0.0243,  ...,  0.0079,  0.0207,  0.0134],
        [-0.0095,  0.0049, -0.0020,  ...,  0.0074,  0.0213, -0.0092],
        [-0.0021, -0.0089,  0.0027,  ..., -0.0056, -0.0214,  0.0031]],
       requires_grad=True)
Parameter containing:
tensor([-0.0047, -0.0245, -0.0240,  ...,  0.0182,  0.0130,  0.0003],
       requires_grad=True)


In [15]:
with torch.no_grad():
    sample = training_data[0]
    vec = vector(sample[0][0], n = 2)
    log_probs = model(vec)

In [16]:
np.shape(log_probs)

torch.Size([1, 14720])

### Training

In [17]:
loss = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [None]:
for epoch in range(50):
    for samples, label in training_data:
        for instance in samples:
            model.zero_grad()
            vec = vector(instance)
            tar = target(label)
            log_probs = model(vec)
            L = loss(log_probs, tar)
            L.backward()
            optimizer.step()

  C /= C.max()


In [None]:
with torch.no_grad():
    for test, label in testing_data:
        for word in test: 
            vec = vector(word, n=2)
            log_probs = model(vec)
            print('Input String', word)
            prediction = np.argmax(log_probs.numpy())
            print('Guess:', wordList[prediction], '\n')

In [None]:
import time

ts = time.time()

print(ts)