# Network

## Preprocessing data

In [1]:
import pymongo
import string
import nltk
import numpy as np
from collections import defaultdict, Counter
from tqdm.notebook import tqdm

In [2]:
# Connect to the MongoDB server on localhost and select the database and
# collection the notebook reads its documents from.
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client['vatican']
dataset = db['perturbedDataset']

In [3]:
# Materialise every document returned by the collection query into a list.
docs = list(dataset.find())

The first task is to build the list of characters and words used in the documents.

In [4]:
def preprocess(s):
    """Normalise a string to the alphabet described by ``CHAR_INDEX``.

    The input is lower-cased and every character that is not a key of the
    module-level ``CHAR_INDEX`` mapping is replaced by the placeholder ``'#'``.
    A result that is exactly one character long is collapsed to ``'#'``.

    Args:
        s: the raw string to normalise.

    Returns:
        The normalised string; an empty input yields an empty string.
    """
    cleaned = "".join(ch if ch in CHAR_INDEX else '#' for ch in s.lower())
    # A lone character is always reported as the placeholder.
    return "#" if len(cleaned) == 1 else cleaned

In [5]:
# The alphabet could also be collected from the dataset itself:
#
#     chars = ''
#     for i, d in tqdm(enumerate(docs)):
#         for s in d['sentences']:
#             for w in s[0]:
#                 for c in w:
#                     if chars.find(c) == -1:
#                         chars += c
#
# A fixed, provisional alphabet is used instead because it saves time while
# testing; it could arguably be kept as the definitive one.
chars = string.ascii_lowercase + string.whitespace + "#àùèéò'"

# Maps every character of the alphabet to its position in `chars`.
CHAR_INDEX = {c: i for i, c in enumerate(chars)}

# A repeated character would silently collapse two positions into one index.
assert len(CHAR_INDEX) == len(chars), "duplicate character in the alphabet"

chars

"abcdefghijklmnopqrstuvwxyz \t\n\r\x0b\x0c#àùèéò'"

Split the dataset into a training set and a test set.

In [6]:
# Percentage of the documents assigned to the training set; the rest is the
# test set.
TRAINING_SET_SIZE = 80
TEST_SET_SIZE = 100 - TRAINING_SET_SIZE

# Index of the first test document. Computed with integer arithmetic so the
# split point cannot be shifted by floating-point rounding.
TRSH = len(docs) * TRAINING_SET_SIZE // 100

In [7]:
# Documents before the threshold index are used for training, the remaining
# ones for testing.
training_set, test_set = docs[:TRSH], docs[TRSH:]

In [8]:
# For every correct word, collect all of its occurrences (spelled correctly
# or not). In each sentence `s`, `s[0][k]` is the observed word and `s[1][k]`
# the correct word it is filed under.
wordDataset = dict()
# tqdm wraps the list itself (not `enumerate`) so the bar knows its total.
for d in tqdm(training_set):
    for s in d['sentences']:
        for k, w in enumerate(s[0]):
            wordDataset.setdefault(s[1][k], []).append(w)

# Pair every correct word with the list of its unique misspellings, in order
# of first appearance. Words that never appear misspelled are skipped.
training_data = []
for wk, occurrences in tqdm(wordDataset.items()):
    # dict.fromkeys de-duplicates in linear time while preserving order.
    item = [w for w in dict.fromkeys(occurrences) if w != wk]

    if item:
        training_data.append((item, wk))

# Per-word occurrence counts and the list of labels.
# NOTE(review): the original author asked whether `wordOccurrences` could be
# removed; `words` is needed to build the label index.
wordOccurrences = []
words = []
for wk, occurrences in tqdm(wordDataset.items()):
    wordOccurrences.append({'word': wk, 'occurrences': dict(Counter(occurrences))})
    words.append(wk)

print(len(wordDataset), len(training_data))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48376.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=48376.0), HTML(value='')))


48376 6298


## Network Setup 

In [9]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd

# Seed PyTorch's global random number generator so that runs are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x19496480d30>

In [10]:
# Number of output classes: one per entry of `words`.
N_LABELS = len(words)

# Length of a flattened character-pair matrix (alphabet size squared).
V = len(CHAR_INDEX) ** 2

# Maps every label to its class index, i.e. its position in `words`.
LABEL_INDEX = {label: position for position, label in enumerate(words)}

In [11]:
def string_to_matrix(s, n=2):
    """Encode a string as a scaled character-bigram count matrix.

    The string is normalised with ``preprocess``; then every pair of adjacent
    characters ``(a, b)`` increments cell ``[CHAR_INDEX[a], CHAR_INDEX[b]]``.
    Finally the counts are divided by ``max_count + 1``.

    Args:
        s: the string to encode.
        n: n-gram order. Only ``2`` is supported, because the result is a
            two-dimensional (first character x second character) matrix.

    Returns:
        A ``numpy`` float array of shape ``(len(CHAR_INDEX), len(CHAR_INDEX))``.
        Strings with fewer than two characters yield an all-zero matrix.

    Raises:
        ValueError: if ``n`` is not 2.
    """
    if n != 2:
        # Previously this surfaced as an obscure tuple-unpacking error.
        raise ValueError(
            "string_to_matrix only supports bigrams (n=2), got n=%r" % (n,)
        )

    z = preprocess(s)
    size = len(CHAR_INDEX)
    C = np.zeros((size, size))

    # Adjacent character pairs are exactly the bigrams of the string.
    for a, b in zip(z, z[1:]):
        C[CHAR_INDEX[a], CHAR_INDEX[b]] += 1

    # The +1 keeps the division defined when no bigram was counted.
    C /= (C.max() + 1)

    return C

In [12]:
class Simple2Gram(nn.Module):
    """A single linear layer followed by a log-softmax over the labels."""

    def __init__(self, num_labels, size):
        """Create the layer mapping ``size`` input features to ``num_labels`` scores."""
        super().__init__()
        self.linear = nn.Linear(in_features=size, out_features=num_labels)

    def forward(self, vec):
        """Return log-probabilities, normalised along dimension 1, for ``vec``."""
        scores = self.linear(vec)
        return F.log_softmax(scores, dim=1)

In [13]:
def vector(s, n=2):
    """Encode a string as a single-row float tensor.

    The matrix returned by ``string_to_matrix(s, n=n)`` is converted to a
    32-bit float tensor and flattened into shape ``(1, number_of_cells)``.
    """
    matrix = string_to_matrix(s, n=n)
    flat = torch.tensor(matrix, dtype=torch.float32)
    return flat.reshape(1, -1)

def target(label):
    """Return the class index of ``label`` as a one-element 64-bit integer tensor.

    The index is looked up in ``LABEL_INDEX``; an unknown label raises
    ``KeyError``.
    """
    class_index = LABEL_INDEX[label]
    return torch.tensor([class_index], dtype=torch.long)

In [14]:
# Instantiate the classifier: V input features, N_LABELS output classes.
model = Simple2Gram(N_LABELS, V)

In [15]:
# Show every parameter tensor of the model, one per line.
print(*model.parameters(), sep="\n")

Parameter containing:
tensor([[ 0.0196,  0.0213, -0.0060,  ...,  0.0195, -0.0047,  0.0169],
        [ 0.0059,  0.0154,  0.0250,  ..., -0.0103, -0.0037, -0.0015],
        [-0.0146,  0.0088, -0.0249,  ..., -0.0193,  0.0127,  0.0069],
        ...,
        [-0.0010,  0.0210, -0.0248,  ..., -0.0228,  0.0020, -0.0154],
        [ 0.0201,  0.0225,  0.0043,  ...,  0.0119,  0.0051,  0.0001],
        [-0.0029, -0.0092,  0.0198,  ..., -0.0253,  0.0118,  0.0166]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0033, -0.0017, -0.0244,  ..., -0.0143,  0.0100, -0.0091],
       requires_grad=True)


In [16]:
# Sanity check: run one forward pass on the first misspelling of the first
# training sample, with gradient tracking disabled.
# NOTE(review): the original author asked whether this cell should be removed.
with torch.no_grad():
    sample = training_data[0]
    vec = vector(sample[0][0], n = 2)
    log_probs = model(vec)

### Training

In [17]:
# Negative log-likelihood loss, applied to the model output in the training loop.
loss = nn.NLLLoss()
# Stochastic gradient descent over all model parameters.
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [18]:
# Number of full passes over the training data.
N_EPOCHS = 50

# One SGD step per (misspelling, correct word) pair.
for epoch in tqdm(range(N_EPOCHS), desc="epochs"):
    # leave=False removes the per-epoch bar once finished instead of stacking
    # one bar per epoch in the cell output.
    for samples, label in tqdm(training_data, desc="words", leave=False):
        # The target depends only on the label, so build it once per word
        # rather than once per misspelling.
        tar = target(label)
        for instance in samples:
            model.zero_grad()
            vec = vector(instance)
            log_probs = model(vec)
            L = loss(log_probs, tar)
            L.backward()
            optimizer.step()

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=6298.0), HTML(value='')))





KeyboardInterrupt: 

### Testing

In [None]:
# For every correct word, collect all of its occurrences (spelled correctly
# or not) in the test documents. In each sentence `s`, `s[0][k]` is the
# observed word and `s[1][k]` the correct word it is filed under.
wordDatasetTest = dict()
# tqdm wraps the list itself (not `enumerate`) so the bar knows its total.
for d in tqdm(test_set):
    for s in d['sentences']:
        for k, w in enumerate(s[0]):
            wordDatasetTest.setdefault(s[1][k], []).append(w)

# Pair every correct word with the list of its unique misspellings, in order
# of first appearance. Words without misspellings are kept with an empty list.
test_data = []
for wk, occurrences in tqdm(wordDatasetTest.items()):
    # dict.fromkeys de-duplicates in linear time while preserving order.
    item = [w for w in dict.fromkeys(occurrences) if w != wk]
    test_data.append((item, wk))

In [None]:
import pandas as pd
from torch.distributions import Categorical 

# Model output (log-probabilities) and true label for every misspelled test word.
predictions, y_true = [], []

with torch.no_grad():
    for test, label in test_data:
        for word in test:
            vec = vector(word, n=2)
            log_probs = model(vec)
            # Index of the most likely class.
            prediction = int(log_probs.argmax())

            print('Input String', word)
            print(prediction)
            print('Guess:', words[prediction], '(', label, ')\n')

            # Reuse the output computed above instead of running the model a
            # second time on the same input.
            predictions.append(log_probs)
            y_true.append(label)

y_true = np.array(y_true)

In [None]:
# Turn every stored model output into the word with the highest score.
y_pred = [words[int(scores.argmax())] for scores in predictions]

In [None]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix

In [None]:
# Text report of the classification metrics for the predicted vs. true labels.
print(classification_report(y_true, y_pred))

In [None]:
# Confusion matrices computed from the predicted vs. true labels.
print(multilabel_confusion_matrix(y_true, y_pred))

In [None]:
# Number of unique misspellings attached to each training word.
lenTD = [len(entry[0]) for entry in training_data]