## Passos para utilizar o Colab

### Ajustar o ambiente de execução para utilizar a GPU T4
### Subir a pasta zipada (`document_classification_class.zip`) para a raiz do Google Drive (MyDrive)


In [None]:
# Mount Google Drive at /content/drive so the files stored there are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Extract the zipped project folder from Google Drive into /content.
# -n never overwrites files that already exist, so re-running this cell neither stops at
#    unzip's interactive "replace?" prompt nor clobbers local edits (delete the folder
#    first if a fresh copy of the archive is wanted).
# -d pins the destination, so the result does not depend on the current working directory.
!unzip -n /content/drive/MyDrive/document_classification_class.zip -d /content

Archive:  /content/drive/MyDrive/document_classification_class.zip
   creating: document_classification_class/
  inflating: document_classification_class/test.py  
   creating: document_classification_class/checkpoints/
   creating: document_classification_class/data/
  inflating: document_classification_class/data/clean_news_with_splits.csv  
  inflating: document_classification_class/data/glove.6B.100d.txt  
  inflating: document_classification_class/deploy.py  
   creating: document_classification_class/libs/
  inflating: document_classification_class/libs/model.py  
  inflating: document_classification_class/libs/nlpclasses.py  
  inflating: document_classification_class/libs/utils.py  
  inflating: document_classification_class/libs/__init__.py  
  inflating: document_classification_class/looking_at_the_dataset.py  
   creating: document_classification_class/plots/
  inflating: document_classification_class/supervisedTraining.py  
  inflating: document_classification_class/testing

In [None]:
# Change into the extracted project directory.
# The absolute path keeps this cell re-runnable: a relative `%cd` fails once the
# working directory has already been moved into the project folder.
%cd /content/document_classification_class

/content/document_classification_class


In [None]:
# List the project files (explicit line magic rather than relying on automagic)
%ls

[0m[01;34mcheckpoints[0m/  [01;34mlibs[0m/                      supervisedTraining.py
[01;34mdata[0m/         looking_at_the_dataset.py  testing_implementation.py
deploy.py     [01;34mplots[0m/                     test.py


In [None]:
from libs.utils import make_embedding_matrix
from libs.utils import generate_batches
from libs.utils import monitor_training
from libs.nlpclasses import NewsDataset
from libs.model import NewsClassifier

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

import numpy as np

import argparse

def compute_accuracy(y_pred, y_target):
	"""Return the percentage (0-100) of samples whose highest-scoring class equals the target.

	Args:
		y_pred: tensor of per-class scores, size (batch_size, n_classes).
		y_target: tensor of target class indices, size (batch_size,).

	Returns:
		The batch accuracy as a float percentage; 0.0 when the batch is empty.
	"""
	n_samples = len(y_pred)
	if n_samples == 0:
		# an empty batch would otherwise raise ZeroDivisionError below
		return 0.0

	# index of the largest score along the class dimension
	_, y_pred_indices = y_pred.max(dim=1)
	n_correct = torch.eq(y_pred_indices, y_target).sum().item()
	return n_correct / n_samples * 100

In [None]:
# Output artifacts of this run (training curves and best checkpoint)
path_loss = "/content/document_classification_class/plots/try13_loss.png"
path_acc = "/content/document_classification_class/plots/try13_acc.png"
save_dir = "/content/document_classification_class/checkpoints/model_13.tar"

# Input data
news_csv = "/content/document_classification_class/data/clean_news_with_splits.csv"
glove_filepath = "/content/document_classification_class/data/glove.6B.100d.txt"

# Hyperparameters
use_glove = True
embedding_size = 100
hidden_dim = 200
dropout = 0.5
batch_size = 128
num_epochs = 50
learning_rate = 0.0001
l2_regularization = 0.001

# Per-epoch metric history, appended to by the training loop
train_state = dict(
	epoch_index=0,
	train_loss=[],
	train_acc=[],
	val_loss=[],
	val_acc=[],
)

# Prefer the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[INFO] using device {}".format(device))

# Dataset and its vectorizer
dataset = NewsDataset.load_dataset_and_make_vectorizer(news_csv)
vectorizer = dataset.get_vectorizer()

# Embedding initialization: GloVe vectors when requested, otherwise None
if not use_glove:
	embeddings = None
	print("[INFO] using randomly initialized embeddings")
else:
	words = vectorizer.title_vocab._token_to_idx.keys()
	embeddings = make_embedding_matrix(glove_filepath=glove_filepath, words=words)
	print("[INFO] using pre-trained embeddings. Shape:")
	print(embeddings.shape)

# Build the classifier and move it to the selected device
classifier = NewsClassifier(
	embedding_size=embedding_size,
	num_embeddings=len(vectorizer.title_vocab),
	hidden_dim=hidden_dim,
	num_classes=len(vectorizer.category_vocab),
	dropout_p=dropout,
	pretrained_embeddings=embeddings,
	padding_idx=0,
).to(device)

# Loss and optimizer. Adam with weight_decay is the configuration in use;
# plain SGD and Adam without weight decay were the alternatives tried before.
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(
	classifier.parameters(),
	lr=learning_rate,
	weight_decay=l2_regularization,
)

[INFO] using device cuda
[INFO] word not in glove embeddings, initialized randomly: <MASK>
[INFO] word not in glove embeddings, initialized randomly: <UNK>
[INFO] word not in glove embeddings, initialized randomly: <BEGIN>
[INFO] word not in glove embeddings, initialized randomly: <END>
[INFO] word not in glove embeddings, initialized randomly: newsfactor
[INFO] word not in glove embeddings, initialized randomly: thedeal
[INFO] word not in glove embeddings, initialized randomly: washingtonpost
[INFO] word not in glove embeddings, initialized randomly: siliconvalley
[INFO] word not in glove embeddings, initialized randomly: maccentral
[INFO] word not in glove embeddings, initialized randomly: techweb
[INFO] using pre-trained embeddings. Shape:
(3297, 100)


In [None]:
# Lowest validation loss seen so far; +inf guarantees that the first epoch always
# produces a checkpoint, whatever the scale of the loss.
best_val_loss = float("inf")

for epoch_index in range(num_epochs):

	train_state['epoch_index'] = epoch_index

	# ---------------- training pass ----------------
	dataset.set_split('train')
	batch_generator = generate_batches(dataset, batch_size=batch_size, device=device)

	running_loss = 0.0
	running_acc = 0.0

	# put the model in training mode
	classifier.train()

	for batch_index, batch_dict in enumerate(batch_generator):

		# zero the gradients
		optimizer.zero_grad()

		# forward pass
		y_pred = classifier(x_in=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

		# compute the loss
		loss = loss_func(y_pred, batch_dict['y_target'])

		# The weight_decay term of the optimizer sets the strength of the L2 regularization:
		# the larger the value, the stronger the penalty on the weights. It is usually a
		# small value, e.g. 0.001. Because it is handled by the optimizer, no explicit
		# L2 penalty is added to the loss here.

		# backpropagation
		loss.backward()

		# take a step with optimizer
		optimizer.step()

		# accumulate the per-batch loss and accuracy
		running_loss += loss.item()
		running_acc += compute_accuracy(y_pred, batch_dict['y_target'])

	# epoch value = mean of the per-batch values (batch_index is the last 0-based index)
	train_state['train_loss'].append(running_loss/(batch_index+1))
	train_state['train_acc'].append(running_acc/(batch_index+1))

	# ---------------- validation pass ----------------
	dataset.set_split('val')
	batch_generator = generate_batches(dataset, batch_size=batch_size, device=device)

	running_loss = 0.0
	running_acc = 0.0

	# put the model in evaluation mode
	classifier.eval()

	# No parameter update happens during validation, so autograd bookkeeping is
	# disabled: the reported values are the same, without building unused graphs.
	with torch.no_grad():
		for batch_index, batch_dict in enumerate(batch_generator):

			# forward pass
			y_pred = classifier(x_in=batch_dict['x_data'], x_lengths=batch_dict['x_length'])

			# compute the loss
			loss = loss_func(y_pred, batch_dict['y_target'])

			running_loss += loss.item()
			running_acc += compute_accuracy(y_pred, batch_dict['y_target'])

	train_state['val_loss'].append(running_loss/(batch_index+1))
	train_state['val_acc'].append(running_acc/(batch_index+1))

	print("[INFO] epoch {}, train loss {}, val loss {}".format(epoch_index, train_state['train_loss'][-1],
		train_state['val_loss'][-1]))
	print("[INFO] train_acc {}, val acc {}".format(train_state['train_acc'][-1], train_state['val_acc'][-1]))

	# refresh the loss/accuracy plots on disk
	monitor_training(train_state, path_loss, path_acc)

	# keep only the checkpoint with the lowest validation loss
	if train_state['val_loss'][-1] < best_val_loss:
		best_val_loss = train_state['val_loss'][-1]

		state = {
			'epoch': epoch_index,
			'model_state': classifier.state_dict(),
			'optimizer_state': optimizer.state_dict(),
			'metrics': train_state
		}

		torch.save(state, save_dir)

		print("[INFO] best validation loss updated and checkpoint saved")

[INFO] epoch 0, train loss 0.9591195252610416, val loss 0.7158102005720138
[INFO] train_acc 62.819169207317074, val acc 73.14732142857143
[INFO] best validation loss updated and checkpoint saved
[INFO] epoch 1, train loss 0.6839826374155719, val loss 0.6709842954363142
[INFO] train_acc 74.72370426829268, val acc 75.07254464285714
[INFO] best validation loss updated and checkpoint saved
[INFO] epoch 2, train loss 0.6551010456935662, val loss 0.6502328004155841
[INFO] train_acc 75.45374428353658, val acc 75.75892857142857
[INFO] best validation loss updated and checkpoint saved
[INFO] epoch 3, train loss 0.6442235819086796, val loss 0.6423164680600166
[INFO] train_acc 75.8253144054878, val acc 75.90401785714286
[INFO] best validation loss updated and checkpoint saved
[INFO] epoch 4, train loss 0.6332947399194647, val loss 0.6408410249011857
[INFO] train_acc 76.37076028963415, val acc 76.03794642857143
[INFO] best validation loss updated and checkpoint saved
[INFO] epoch 5, train loss 0.6

# Teste

In [None]:
# Optional: bring a checkpoint stored on Google Drive into the project's checkpoints folder.
# Note that `mv` removes the file from Drive; use `cp` instead to keep the Drive copy.
# !mv /content/drive/MyDrive/model_1.tar /content/document_classification_class/checkpoints/model_1.tar

In [None]:
from libs.utils import make_embedding_matrix
from libs.utils import generate_batches
from libs.utils import monitor_training
from libs.nlpclasses import NewsDataset
from libs.model import NewsClassifier

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

import pandas as pd
import numpy as np

from argparse import Namespace

import re

def compute_accuracy(y_pred, y_target):
	"""Return the percentage (0-100) of samples whose highest-scoring class equals the target.

	Args:
		y_pred: tensor of per-class scores; the class with the largest score along
			dim 1 is taken as the prediction.
		y_target: tensor of target class indices, compared element-wise with the predictions.

	Returns:
		The batch accuracy as a float percentage; 0.0 when the batch is empty.
	"""
	n_samples = len(y_pred)
	if n_samples == 0:
		# an empty batch would otherwise raise ZeroDivisionError below
		return 0.0

	_, y_pred_indices = y_pred.max(dim=1)
	n_correct = torch.eq(y_pred_indices, y_target).sum().item()
	return n_correct / n_samples * 100


# change this line to the model you want to load
save_dir = '/content/document_classification_class/checkpoints/model_13.tar'

news_csv = "/content/document_classification_class/data/clean_news_with_splits.csv"
glove_filepath = "/content/document_classification_class/data/glove.6B.100d.txt"

# Model configuration: must match the one used to train the checkpoint being loaded,
# otherwise load_state_dict() will not find matching parameter shapes.
use_glove = True
embedding_size = 100
hidden_dim = 200
batch_size = 128
dropout = 0.5
# learning_rate and num_epochs are not used by the evaluation below; they are kept so
# this cell mirrors the training configuration. l2_regularization is only needed by
# the optimizer, which is not built here.
learning_rate = 0.0001
num_epochs = 50
#l2_regularization = 0.001


if not torch.cuda.is_available():
	device = torch.device("cpu")
else:
	device = torch.device("cuda")
print("[INFO] using device: {}".format(device))

# dataset and vectorizer
dataset = NewsDataset.load_dataset_and_make_vectorizer(news_csv)
vectorizer = dataset.get_vectorizer()

# use glove or randomly initialized embeddings
if use_glove:
	words = vectorizer.title_vocab._token_to_idx.keys()
	embeddings = make_embedding_matrix(glove_filepath=glove_filepath,
		words=words)
	print("[INFO] using pre-trained embeddings. Shape")
	print(embeddings.shape)
else:
	print("[INFO] using randomly initialized embeddings")
	embeddings = None

# model
classifier = NewsClassifier(embedding_size = embedding_size,
				num_embeddings = len(vectorizer.title_vocab),
				hidden_dim = hidden_dim,
				num_classes=len(vectorizer.category_vocab),
				dropout_p = dropout,
				pretrained_embeddings=embeddings,
				padding_idx=0)

classifier = classifier.to(device)

# loss function (no optimizer: nothing is trained in this cell)
loss_func = nn.CrossEntropyLoss()

print("[INFO] loading model...")
# map_location remaps the stored tensors onto the selected device, so a checkpoint
# saved on a GPU runtime can still be restored when only the CPU is available.
state = torch.load(save_dir, map_location=device)
classifier.load_state_dict(state["model_state"])

# iterate over test dataset
dataset.set_split('test')

batch_generator = generate_batches(dataset,
	batch_size=batch_size, device=device)

running_loss = 0.0
running_acc = 0.0

# put the model in evaluation mode
classifier.eval()

# Evaluation needs no gradients: disabling autograd yields the same metrics without
# building computation graphs that would never be used.
with torch.no_grad():
	for batch_index, batch_dict in enumerate(batch_generator):

		# compute the output
		y_pred = classifier(x_in=batch_dict['x_data'],
			x_lengths=batch_dict['x_length'])

		# compute the loss
		loss = loss_func(y_pred, batch_dict['y_target'])

		loss_batch = loss.item()

		# incremental mean: after batch k, running_loss is the mean of the first k+1 batch losses
		running_loss += (loss_batch - running_loss) / (batch_index + 1)

		# compute accuracy (same incremental-mean update)
		acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
		running_acc += (acc_batch - running_acc) / (batch_index + 1)

print("Test loss: {}".format(running_loss))
print("Test Accuracy: {}".format(running_acc))

[INFO] using device: cuda
[INFO] word not in glove embeddings, initialized randomly: <MASK>
[INFO] word not in glove embeddings, initialized randomly: <UNK>
[INFO] word not in glove embeddings, initialized randomly: <BEGIN>
[INFO] word not in glove embeddings, initialized randomly: <END>
[INFO] word not in glove embeddings, initialized randomly: newsfactor
[INFO] word not in glove embeddings, initialized randomly: thedeal
[INFO] word not in glove embeddings, initialized randomly: washingtonpost
[INFO] word not in glove embeddings, initialized randomly: siliconvalley
[INFO] word not in glove embeddings, initialized randomly: maccentral
[INFO] word not in glove embeddings, initialized randomly: techweb
[INFO] using pre-trained embeddings. Shape
(3297, 100)
[INFO] loading model...
Test loss: 0.48753383862120775
Test Accuracy: 82.26562500000001
