# TP 5 - Réseaux convolutifs 1D

In [1]:
import array
import csv
import gzip
import re
import shutil
import subprocess
import sys
from collections import namedtuple
from pathlib import Path
from tqdm import tqdm
from matplotlib import pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import numpy as np

import sentencepiece as spm
import torch

from datamaestro import Dataset, prepare_dataset



In [2]:
BATCH_SIZE = 64
NB_EPOCH = 5

## Pre-processing et exploration des données

In [3]:
Batch = namedtuple("Batch", ["text", "labels"])

class TextDataset(torch.utils.data.Dataset):
    
    def __init__(self, text: torch.LongTensor, sizes: torch.LongTensor, labels: torch.LongTensor):
        self.text = text
        self.sizes = sizes
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index: int):
        return self.text[self.sizes[index]:self.sizes[index+1]], self.labels[index].item()

    @staticmethod
    def collate(batch):
        data = [item[0] for item in batch]
        labels = [item[1] for item in batch]
        return Batch(torch.nn.utils.rnn.pad_sequence(data, batch_first=True), torch.LongTensor(labels))


def read(mode: str):
    """Process the dataset
    """
    datapath = Path(mode)
    if datapath.is_file():
        with gzip.open(datapath, "rb") as fp:
            return torch.load(fp)


In [4]:
s = spm.SentencePieceProcessor()
s.Load('wp1000.model')

True

In [5]:
train = read('train-1000.pth')
test = read('test-1000.pth')

In [6]:
for i in range(10):
    print(s.DecodeIds(train[i][0].tolist()))
    print(s.EncodeAsPieces(s.DecodeIds(train[i][0].tolist())))

Awww, that s a bummer You shoulda got David Carr of Third Day to do it ;D
['▁A', 'www', ',', '▁that', '▁s', '▁a', '▁bu', 'mm', 'er', '▁You', '▁should', 'a', '▁got', '▁D', 'av', 'id', '▁C', 'ar', 'r', '▁of', '▁Th', 'ir', 'd', '▁Day', '▁to', '▁do', '▁it', '▁;', 'D']
is upset that he can t update his Facebook by texting it and might cry as a result School today also Blah
['▁is', '▁up', 's', 'et', '▁that', '▁he', '▁can', '▁t', '▁update', '▁his', '▁F', 'a', 'ce', 'b', 'ook', '▁by', '▁text', 'ing', '▁it', '▁and', '▁might', '▁cry', '▁as', '▁a', '▁re', 's', 'ult', '▁S', 'ch', 'o', 'ol', '▁today', '▁also', '▁B', 'la', 'h']
I dived many times for the ball Managed to save 50 The rest go out of bounds
['▁I', '▁di', 'v', 'ed', '▁many', '▁time', 's', '▁for', '▁the', '▁b', 'all', '▁M', 'an', 'age', 'd', '▁to', '▁s', 'a', 've', '▁5', '0', '▁The', '▁rest', '▁go', '▁out', '▁of', '▁b', 'ound', 's']
my whole body feels itchy and like its on fire
['▁my', '▁who', 'le', '▁bo', 'dy', '▁feel', 's', '▁it', 'ch'

In [7]:
data_loader = torch.utils.data.DataLoader(train, shuffle=True, collate_fn=train.collate, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test, shuffle=True, collate_fn=train.collate, batch_size=BATCH_SIZE)

In [8]:
# Baseline test (majority class)
print('Test success: %0.3f' % (np.mean([float(torch.sum(1 == b)) / len(b) for a, b in test_loader])))

Test success: 0.511


## Définition du modèle

In [9]:
class my_max(torch.nn.Module):
    def __init__(self, dim):
        super(my_max, self).__init__()
        self.dim = dim

    def forward(self, x):
        return torch.max(x, self.dim)[0]

In [10]:
class my_transpose(torch.nn.Module):
    def __init__(self, dim1, dim2):
        super(my_transpose, self).__init__()
        self.dim1 = dim1
        self.dim2 = dim2

    def forward(self, x):
        return torch.transpose(x, self.dim1, self.dim2)

In [31]:
model_simple = torch.nn.Sequential(
        torch.nn.Embedding(1000, 50),
        my_transpose(1, 2),
        torch.nn.Conv1d(50, 50, 5),
        torch.nn.MaxPool1d(5, 2),
        torch.nn.Conv1d(50, 50, 5),
        my_max(2),
        torch.nn.Linear(50, 2),
        torch.nn.Sigmoid()
)

In [31]:
model_complex = torch.nn.Sequential(
        torch.nn.Embedding(1000, 50),
        my_transpose(1, 2),
        torch.nn.Conv1d(50, 50, 5),
        torch.nn.MaxPool1d(5, 2),
        torch.nn.Conv1d(50, 50, 5),
        my_max(2),
        torch.nn.Linear(50, 2),
        torch.nn.Sigmoid()
)

In [32]:
# Test shapes
for x, y in data_loader:
    print(x.shape)
    for layer in model:
        x = layer(x)
        print(x.shape)
    break

torch.Size([64, 48])
torch.Size([64, 48, 50])
torch.Size([64, 50, 48])
torch.Size([64, 50, 44])
torch.Size([64, 50, 20])
torch.Size([64, 50, 16])
torch.Size([64, 50])
torch.Size([64, 2])
torch.Size([64, 2])


## Entraînement du modèle

In [33]:
optim = torch.optim.Adam(params=model.parameters())
loss = torch.nn.CrossEntropyLoss()

In [34]:
writer = SummaryWriter()
losses = []
for i in range(NB_EPOCH):
    print('Beginning epoch %d' %(i+1))
    j = 0
    for x, y in data_loader:
        if j % 1000 == 0:
            print('Percentage of seen data: %0.2f' %(j*BATCH_SIZE / train.__len__()))
            print('Test success: %0.2f' % (np.mean([float(torch.sum(torch.max(model(a), 1)[1] == b)) / len(b) for a, b in test_loader])))
            writer.add_scalar('Loss/test', l, j)
    # each time: we see batch * length data
        optim.zero_grad()
        l = loss(model(x), y)
        l.backward()
        optim.step()
        writer.add_scalar('Loss/train', l, j)
        losses.append(l)
        j += 1

Beginning epoch 1
Percentage of seen data: 0.00
Test success: 0.50
Percentage of seen data: 0.04
Test success: 0.69
Percentage of seen data: 0.08
Test success: 0.69
Percentage of seen data: 0.12
Test success: 0.69
Percentage of seen data: 0.16
Test success: 0.72
Percentage of seen data: 0.20
Test success: 0.73
Percentage of seen data: 0.24
Test success: 0.71
Percentage of seen data: 0.28
Test success: 0.71
Percentage of seen data: 0.32
Test success: 0.71
Percentage of seen data: 0.36
Test success: 0.73
Percentage of seen data: 0.40
Test success: 0.75
Percentage of seen data: 0.44
Test success: 0.72
Percentage of seen data: 0.48
Test success: 0.73
Percentage of seen data: 0.52
Test success: 0.71
Percentage of seen data: 0.56
Test success: 0.72
Percentage of seen data: 0.60
Test success: 0.73
Percentage of seen data: 0.64
Test success: 0.73
Percentage of seen data: 0.68
Test success: 0.72
Percentage of seen data: 0.72
Test success: 0.74
Percentage of seen data: 0.76
Test success: 0.74
Pe

KeyboardInterrupt: 