# Training

In this notebook I'm going to preprocess the training dataset and then train the network.

In [1]:
import string
from collections import defaultdict
from pathlib import Path

import nltk
import numpy as np
import pymongo
from tqdm.notebook import tqdm

In [None]:
# Connect to the local MongoDB instance and grab the collection holding the
# word records used throughout the notebook.
client = pymongo.MongoClient('mongodb://localhost:27017')
db = client.get_database('vatican')
dataset = db.get_collection('wordsDataset')

In [2]:
def preprocess(s):
    """Lower-case ``s`` and mask every character missing from ``CHAR_INDEX``.

    Each character of ``s.lower()`` is kept when it is a key of the global
    ``CHAR_INDEX`` and replaced by ``'#'`` otherwise. Inputs of length one
    are always mapped to the single placeholder ``'#'``.
    """
    masked = "".join(ch if ch in CHAR_INDEX else '#' for ch in s.lower())

    # One-character inputs collapse to the placeholder regardless of content.
    return '#' if len(s) == 1 else masked

def string_to_matrix(s, n=2):
    """Encode a string as a max-normalised character-bigram count matrix.

    The string is cleaned with ``preprocess``; every pair produced by
    ``nltk.ngrams(z, n=n)`` then increments the cell addressed by the two
    characters' positions in the global ``CHAR_INDEX``. Counts are finally
    divided by the largest count, so the biggest entry becomes 1.

    Parameters
    ----------
    s : str
        Raw input string.
    n : int, default 2
        n-gram order forwarded to ``nltk.ngrams``. The loop unpacks each
        n-gram into two characters, so only ``n=2`` fits this 2-D encoding.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(CHAR_INDEX), len(CHAR_INDEX))``. It is all
        zeros when the cleaned string yields no n-grams.
    """
    z = preprocess(s)
    size = len(CHAR_INDEX)
    C = np.zeros((size, size))
    for a, b in nltk.ngrams(z, n=n):
        C[CHAR_INDEX[a], CHAR_INDEX[b]] += 1

    # A string shorter than n characters (preprocess returns '#' for any
    # one-character input) yields no n-grams, so the maximum is 0. Dividing
    # by it would fill the matrix with NaN; skip normalisation in that case.
    peak = C.max()
    if peak > 0:
        C /= peak

    return C



In [3]:
# Collection.count() is deprecated and was removed in PyMongo 4;
# count_documents({}) is the supported way to count every document.
print(dataset.count_documents({}))
# Materialise the whole collection once; later cells iterate over `word`.
word = list(dataset.find())

14564


  print(dataset.count())


In [4]:
# wList: every distinct string seen (each record's 'word' and all of its
# 'occurrences'), in first-seen order.
# wCorrectList: every distinct record 'word', in first-seen order.
wList = []
wCorrectList = []

# Membership is tracked in sets so each check is O(1) instead of scanning the
# growing lists. The lists themselves still record first-seen order.
# Assumes the stored words are hashable (strings) - TODO confirm.
seen_forms, seen_correct = set(), set()

for w in tqdm(word):
    correct = w['word']
    if correct not in seen_correct:
        seen_correct.add(correct)
        wCorrectList.append(correct)
    # The record's own word goes first, then its occurrences, matching the
    # order in which they are appended to wList.
    for form in (correct, *w['occurrences']):
        if form not in seen_forms:
            seen_forms.add(form)
            wList.append(form)

HBox(children=(FloatProgress(value=0.0, max=14564.0), HTML(value='')))




In [5]:
# Print every position at which the label '19##' appears in wCorrectList.
matching_positions = [pos for pos, label in enumerate(wCorrectList) if label == '19##']
for pos in matching_positions:
    print(pos)

5122


In [6]:
# Number of distinct labels, then number of distinct strings overall.
print(len(wCorrectList), len(wList), sep='\n')

14417
16838


In [7]:
# Alphabet of the corpus: each distinct character of wList, kept in the order
# of its first appearance (dict keys preserve insertion order).
chars = ''.join(dict.fromkeys(ch for form in wList for ch in form))

In [8]:
# Map every character of the alphabet to its position in `chars`.
CHAR_INDEX = {symbol: position for position, symbol in enumerate(chars)}
print(len(chars))

35


In [9]:
# One (samples, label) pair per record in each split. An occurrence that
# differs from the record's word goes to the training split when it sits at
# position 0-4 of the 'occurrences' list, and to the testing split otherwise.
training_data, testing_data = [], []

for record in word:
    wrd = record['word']
    # Keep the original position so the split rule still refers to the index
    # in the unfiltered occurrences list.
    variants = [
        (position, form)
        for position, form in enumerate(record['occurrences'])
        if form != wrd
    ]
    training_data.append(([form for position, form in variants if position < 5], wrd))
    testing_data.append(([form for position, form in variants if position >= 5], wrd))

In [10]:


# For each split: total number of samples, followed by a histogram of
# (samples per word, number of words with that many samples).
print('TRAINING DATA: ', sum(len(samples) for samples, _label in training_data))

training_stats = defaultdict(int)
for samples, _label in training_data:
    training_stats[len(samples)] += 1

print(sorted(training_stats.items()))

print('TESTING DATA: ', sum(len(samples) for samples, _label in testing_data))

testing_stats = defaultdict(int)
for samples, _label in testing_data:
    testing_stats[len(samples)] += 1

print(sorted(testing_stats.items()))

TRAINING DATA:  1915
[(0, 13253), (1, 996), (2, 145), (3, 51), (4, 119)]
TESTING DATA:  628
[(0, 14479), (1, 17), (2, 12), (3, 8), (4, 11), (5, 9), (6, 3), (7, 2), (8, 5), (9, 2), (10, 1), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (19, 1), (20, 1), (27, 1), (29, 1), (37, 1), (41, 1), (42, 1), (67, 1)]


## Network Setup

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's random number generator so that runs are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x22544ad4ed0>

In [12]:
# Output size: one class per distinct label.
N_LABELS = len(wCorrectList)
# Input size: the flattened character-by-character bigram matrix.
V = len(CHAR_INDEX) ** 2
# Label -> class id, following the order of wCorrectList.
LABEL_INDEX = {label: class_id for class_id, label in enumerate(wCorrectList)}

In [15]:
class Simple2Gram(nn.Module):
    """Linear classifier that returns log-probabilities over the labels.

    Parameters
    ----------
    num_labels : int
        Number of output classes.
    size : int
        Number of input features per sample.
    """

    def __init__(self, num_labels, size):
        super().__init__()
        # Single affine layer: `size` features in, `num_labels` scores out.
        self.linear = nn.Linear(in_features=size, out_features=num_labels)

    def forward(self, vec):
        """Return the log-softmax of the linear scores, taken over dim 1."""
        scores = self.linear(vec)
        return F.log_softmax(scores, dim=1)

In [16]:
def vector(s, n=2):
    """Turn a string into a float tensor of shape (1, number_of_features).

    The matrix returned by ``string_to_matrix(s, n=n)`` is converted to a
    float tensor and flattened into a single row.
    """
    matrix = string_to_matrix(s, n=n)
    as_float = torch.tensor(matrix).float()
    return as_float.view(1, -1)

def target(label):
    """Return the class id of ``label`` as a one-element LongTensor.

    The id is looked up in the global ``LABEL_INDEX``; an unknown label
    raises ``KeyError``.
    """
    class_id = LABEL_INDEX[label]
    return torch.LongTensor([class_id])

In [17]:
# One output per label, one input per cell of the flattened bigram matrix.
model = Simple2Gram(num_labels=N_LABELS, size=V)

### Model Parameters

In [18]:
# Print each parameter tensor the model exposes.
for tensor in model.parameters():
    print(tensor)

Parameter containing:
tensor([[ 0.0218,  0.0237, -0.0067,  ...,  0.0085, -0.0187,  0.0139],
        [-0.0244,  0.0246,  0.0277,  ...,  0.0193, -0.0028, -0.0009],
        [-0.0021,  0.0238,  0.0162,  ...,  0.0167, -0.0174,  0.0055],
        ...,
        [-0.0094, -0.0076,  0.0134,  ..., -0.0046, -0.0160, -0.0158],
        [ 0.0189, -0.0012,  0.0063,  ...,  0.0042,  0.0211, -0.0120],
        [-0.0124, -0.0231,  0.0199,  ..., -0.0205, -0.0120, -0.0228]],
       requires_grad=True)
Parameter containing:
tensor([-0.0044, -0.0169, -0.0175,  ...,  0.0158,  0.0195, -0.0231],
       requires_grad=True)


In [19]:
# Smoke test: push one training string through the untrained model.
with torch.no_grad():
    # Most entries in training_data carry an empty sample list, so indexing
    # training_data[0][0][0] directly raises IndexError whenever the first
    # entry happens to be one of them. Use the first entry that has a sample.
    sample = next(entry for entry in training_data if entry[0])
    vec = vector(sample[0][0], n = 2)
    log_probs = model(vec)

In [20]:
# Expect one row holding one log-probability per label.
log_probs.shape

torch.Size([1, 14417])

### Training

In [21]:
# Negative log-likelihood loss: it consumes the log-probabilities that
# Simple2Gram.forward produces with log_softmax.
loss = nn.NLLLoss()
# Plain SGD over all model parameters with a fixed learning rate of 0.1.
optimizer = optim.SGD(model.parameters(), lr = 0.1)

In [22]:
# Per-sample SGD: every training string is its own mini-batch of size one.
for epoch in tqdm(range(1)):  # author's note: 50 (presumably the intended epoch count - confirm)
    for samples, label in tqdm(training_data):
        for instance in samples:
            # Clear gradients left over from the previous step.
            model.zero_grad()
            features = vector(instance)
            expected = target(label)
            step_loss = loss(model(features), expected)
            # Back-propagate and apply one optimizer update.
            step_loss.backward()
            optimizer.step()

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=14564.0), HTML(value='')))





In [23]:
# Build the path with pathlib rather than a backslash string literal:
# '\m' is an invalid escape sequence in a normal string, and backslash
# separators only work on Windows. Path resolves to ./models/model.pt on
# every platform.
model_dir = Path('models')
# Create the target folder if needed so the save does not fail on a fresh checkout.
model_dir.mkdir(parents=True, exist_ok=True)
# NOTE: this pickles the whole module object, so loading it later requires
# the Simple2Gram class definition to be available.
torch.save(model, model_dir / 'model.pt')

In [None]:
import pandas as pd
from torch.distributions import Categorical 

# Run every held-out string through the trained model, printing the guess
# next to the expected label and collecting both for the metrics below.
predictions, y_true = [], []

with torch.no_grad():
    for test, label in testing_data:
        # The loop variable is deliberately not called `word`: that name holds
        # the dataset loaded from MongoDB, and overwriting it here would break
        # the earlier cells when they are re-run.
        for variant in test:
            vec = vector(variant, n=2)
            log_probs = model(vec)
            print('Input String', variant)
            prediction = np.argmax(log_probs.numpy())
            print('Guess:', wCorrectList[prediction], '(',label ,')' '\n')

            # Reuse the forward pass computed above instead of running the
            # model a second time on the same input.
            predictions.append(log_probs)
            y_true.append(label)

y_true = np.array(y_true)

In [None]:
#entropy = lambda x: Categorical(probs = x).entropy()
#E = sum([entropy(p) for p in predictions])

In [None]:
# Translate each stored log-probability row into the label with the highest score.
y_pred = []
for scores in predictions:
    best_class = np.argmax(scores.numpy())
    y_pred.append(wCorrectList[best_class])

In [None]:
from sklearn.metrics import classification_report, multilabel_confusion_matrix

In [None]:
# Per-label text report comparing the predicted words with the true labels.
print(classification_report(y_true, y_pred))

In [None]:
# Confusion matrices computed per label by sklearn's multilabel_confusion_matrix.
print(multilabel_confusion_matrix(y_true, y_pred))