# Imports and gpu init

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn
seaborn.set_context(context="talk")
num_device = 0
device = torch.device("cuda:" + str(num_device) if torch.cuda.is_available() else "cpu")
torch.cuda.set_device(num_device)
%matplotlib inline

# Preprocessings and iterators on train/test dataset for batches

In [2]:
batch_size = 32

In [3]:

import re
from itertools import groupby
def clean_line(line):
    hashtag = re.compile('#[^ \t\n\r\f\v]+')
    username = re.compile('@[^ \t\n\r\f\v]+')
    url = re.compile('https?[^ \t\n\r\f\v]*')
    junk = re.compile('["¤#%&()*+-/;<=>@[\]^_`{|}~\\;\(\)\'\"\*\`\´\‘\’…\\\/\{\}\|\+><~\[\]\“\”%=\$§]')
    ponctu = re.compile('[.!?,:]')
    number = re.compile('(^[0-9]+)|([0-9]+)')
    rep = re.compile(r'(.)\1{2,}')
    emo = re.compile('[\u233a-\U0001f9ff]')

    if line.startswith('"') :
        line = line[1:]
    if line.endswith('"') :
        line = line[:-1]
        
    line = re.sub(url,' url ', line) # replace every url with ' url '
    
    def subfct1(matchobj):
        return ' ' + matchobj.group(0) + ' '
    line = re.sub(ponctu,subfct1, line) # separate the punctuation from the words
    
    def subfct2(matchobj):
        return matchobj.group(0)[:2]
    line = re.sub(rep, subfct2,line) # keep maximum 2 consecutive identical character
    
    line = re.sub(hashtag,' hastag ', line) # replace every hastag with ' hastag '
    line = re.sub(username,' username ', line) # replace every reference to a username with ' username '
    line = re.sub(junk,' ', line) #throw away junk character
    line = re.sub(number,' number ',line) # replace every number with ' number '
    line = re.sub(emo, ' ',line) #suppr strange emoticon ( to modify ?)

    line_split = [k for k,v in groupby(line.split())] #suprr repeated word:
    line_split = line_split[:40] #trunc if too long
    return line_split

def custom_tokenizer_text(text): # create a tokenizer function
    return clean_line(text)

def custom_preprocess_label(label):
    label = int(label)
    if label == 4:
        label = 1
    return str(label)

import pickle
store_model_path = '/stockage/Research_Team_Ressources/Adrien/VAE_text/'
try:
    with open(store_model_path +'Field_def.pickle', 'rb') as my_pickle:
        TEXT = pickle.load(my_pickle)
except IOError:
    pass

LABEL = data.Field(sequential=False, preprocessing=custom_preprocess_label, use_vocab=False)

In [4]:
data_path = '/stockage/Research_Team_Ressources/Sentiment140/training.1600000.processed.noemoticon.utf8.csv'
from torchtext import data
from torch.utils.data import Dataset
dataset = data.TabularDataset(
        path= data_path, format='csv',
        fields=[('Num', None),('Label', LABEL), ('id', None), ('date',None),
                ('flag', None),('user', None),('Text', TEXT)],
        skip_header = True)

In [6]:
nb_train = 1000000
ratio_train = nb_train / len(dataset)
nb_test = 500000
ratio_test = nb_test / len(dataset)
ratio_other = 1 - ratio_train - ratio_test

In [8]:
train_dataset, other_dataset, test_dataset = dataset.split(split_ratio=[ratio_train,ratio_test,ratio_other])

In [10]:
batch_size = 32
from torchtext.data import Iterator, BucketIterator
train_iter, test_iter = BucketIterator.splits(
 (train_dataset, test_dataset), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(batch_size, batch_size),
 device=num_device, # if you want to use the GPU, specify the GPU number here
 sort_key=lambda x: len(x.Text), # the BucketIterator needs to be told what function it should use to group the data.
 sort_within_batch=False,
 repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [11]:
class Batch:
    "Object for holding a batch of data with mask during training."
    def __init__(self, src, trg, pad=0):
        self.src = src
        self.src_mask = (src != pad).unsqueeze(-2)
        self.trg = trg

In [12]:
def data_gen_class(data_iter, nbatches = None):
    iterator = iter(data_iter)
    len_max = len(data_iter)
    if nbatches == None:
        nbatches = len_max
    elif nbatches > len_max:
        nbatches = len_max
    for i in range(nbatches):
        bb = next(iterator)
        yield Batch(bb.Text[0].permute(1,0), bb.Label, TEXT.vocab.stoi['<pad>'])

# Fully convolutional NN (with global pooling for managing different length)

In [443]:
# Convolutional neural network
class ConvNet(nn.Module):
    def __init__(self, input_size, embed_size, num_classes=10):
        super(ConvNet, self).__init__()
        
        self.input_size = input_size
        self.embed_size = embed_size
        
        self.embed = nn.Embedding(input_size, embed_size)        
    
        self.conv1dBlock1 = nn.Sequential(
            nn.Conv1d(embed_size, 20, 3, padding=1),
            nn.Dropout(0.5),
            nn.BatchNorm1d(20),
            nn.ReLU())
        
        self.conv1dBlock2 = nn.Sequential(
            nn.Conv1d(20, 2, 3, padding=1),
            nn.Dropout(0.5),
            nn.BatchNorm1d(2),
            nn.ReLU())
        
#         self.fc = nn.Linear(5, num_classes)
        
    def forward(self, x):
#         print(x.size())
        x = self.embed(x)
#         print(x.size())
        x = x.transpose(1,2)
#         print(x.size())
               
        x = self.conv1dBlock1(x)
#         print(x.size())
        x = self.conv1dBlock2(x)
#         print(x.size())
        #Global Mean Pooling
        out = torch.mean(x, dim=2)
#         print(x.size())

#         out = self.fc(x)
        return out

## Instantiate the model

In [444]:
n_vocab = len(TEXT.vocab)
model = ConvNet(n_vocab, embed_size=300, num_classes = 2).to(device)
# This was important from their code. 
# Initialize parameters with Glorot / fan_avg.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [445]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion.to(device)

CrossEntropyLoss()

In [446]:
generator = data_gen_class(train_iter)
bat = next(generator)
outputs = model.forward(bat.src)
print(outputs.shape)

torch.Size([32, 2])


In [163]:
# m = nn.Conv1d(200, 10, 2) # in-channels = 200, out-channels = 10
# input = Variable(torch.randn(10, 200, 5)) # 200 = embedding dim, 5 = seq length
# feature_maps = m(input)
# print(feature_maps.size()) # feature_maps size = 10,10,4 

## Fit the model and test it

In [447]:
def run_epoch(data_iter, model, criterion, opt=None):
    nb_batches = len(data_iter)
    generator = data_gen_class(data_iter)
    nb_item = 0
    total_loss = 0
    acc_y_sum = 0
    
    temp_nb_item = 0
    temp_total_loss = 0
    temp_acc_y_sum = 0
    
    for i, batch in enumerate(generator):
        #Forward pass
        outputs = model.forward(batch.src)
        loss = criterion(outputs, batch.trg)
        
        total_loss += loss.item()
        temp_total_loss += loss.item()
        acc_y = torch.argmax(outputs,dim=1) == batch.trg
        acc_y = acc_y.float().sum().item()
        acc_y_sum += acc_y
        temp_acc_y_sum += acc_y
        nb_item  += batch.src.size(0)
        temp_nb_item += batch.src.size(0)
        
        if opt is not None:
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 1000 == 0:
                print ('Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}' 
                       .format(i+1, nb_batches, temp_total_loss/temp_nb_item, temp_acc_y_sum/temp_nb_item))
                temp_acc_y_sum = 0
                temp_nb_item = 0
                temp_total_loss = 0
    
    print(nb_item)
    return total_loss/nb_item, acc_y_sum/nb_item
            

In [448]:
%%time
num_epoch = 100
for epoch in range(num_epoch):
    print("epoch : ", epoch)
    model.train()
    print(run_epoch(train_iter, model,criterion, optimizer))
    model.eval()
    print(run_epoch(test_iter, model,criterion, None))

epoch :  0
Step [1000/31250], Loss: 0.0198, Accuracy: 0.68
Step [2000/31250], Loss: 0.0178, Accuracy: 0.75
Step [3000/31250], Loss: 0.0169, Accuracy: 0.76
Step [4000/31250], Loss: 0.0164, Accuracy: 0.76
Step [5000/31250], Loss: 0.0161, Accuracy: 0.76
Step [6000/31250], Loss: 0.0158, Accuracy: 0.77
Step [7000/31250], Loss: 0.0156, Accuracy: 0.77
Step [8000/31250], Loss: 0.0156, Accuracy: 0.77
Step [9000/31250], Loss: 0.0154, Accuracy: 0.78
Step [10000/31250], Loss: 0.0154, Accuracy: 0.78
Step [11000/31250], Loss: 0.0154, Accuracy: 0.77
Step [12000/31250], Loss: 0.0152, Accuracy: 0.78
Step [13000/31250], Loss: 0.0152, Accuracy: 0.78
Step [14000/31250], Loss: 0.0152, Accuracy: 0.78
Step [15000/31250], Loss: 0.0151, Accuracy: 0.78
Step [16000/31250], Loss: 0.0151, Accuracy: 0.78
Step [17000/31250], Loss: 0.0151, Accuracy: 0.78
Step [18000/31250], Loss: 0.0151, Accuracy: 0.78
Step [19000/31250], Loss: 0.0150, Accuracy: 0.78
Step [20000/31250], Loss: 0.0149, Accuracy: 0.78
Step [21000/31250]

  return Variable(arr, volatile=not train)
  return Variable(arr, volatile=not train), lengths


500000
(0.01569111578733474, 0.808364)
epoch :  1
Step [1000/31250], Loss: 0.0144, Accuracy: 0.80
Step [2000/31250], Loss: 0.0145, Accuracy: 0.79
Step [3000/31250], Loss: 0.0143, Accuracy: 0.80
Step [4000/31250], Loss: 0.0143, Accuracy: 0.80
Step [5000/31250], Loss: 0.0144, Accuracy: 0.79
Step [6000/31250], Loss: 0.0145, Accuracy: 0.79
Step [7000/31250], Loss: 0.0143, Accuracy: 0.80
Step [8000/31250], Loss: 0.0143, Accuracy: 0.79
Step [9000/31250], Loss: 0.0143, Accuracy: 0.79
Step [10000/31250], Loss: 0.0142, Accuracy: 0.80
Step [11000/31250], Loss: 0.0143, Accuracy: 0.79
Step [12000/31250], Loss: 0.0144, Accuracy: 0.79
Step [13000/31250], Loss: 0.0145, Accuracy: 0.79
Step [14000/31250], Loss: 0.0145, Accuracy: 0.79
Step [15000/31250], Loss: 0.0144, Accuracy: 0.80
Step [16000/31250], Loss: 0.0144, Accuracy: 0.79
Step [17000/31250], Loss: 0.0143, Accuracy: 0.79
Step [18000/31250], Loss: 0.0144, Accuracy: 0.79
Step [19000/31250], Loss: 0.0145, Accuracy: 0.79
Step [20000/31250], Loss: 0.

KeyboardInterrupt: 

In [205]:
model.eval()
print(run_epoch(test_iter, model,criterion, None))

  return Variable(arr, volatile=not train), lengths
  return Variable(arr, volatile=not train)


500000
(0.014900214263267816, 0.795854)


In [206]:
word_index = {key : value for key, value in TEXT.vocab.stoi.items()}

In [207]:
id_vocab = {value : key for key, value in word_index.items()}

In [261]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
from keras.preprocessing.text import Tokenizer

def sentence_to_vec(sent_vect):
    tokenizer = Tokenizer(oov_token=TEXT.unk_token)
    tokenizer.fit_on_texts("")
    tokenizer.word_index = word_index
    text_to_decode = [sent_vect]
    sequences = [[word_index["§"]] + tokenizer.texts_to_sequences(text_to_decode)[0]]
    return sequences

In [349]:
sent = torch.from_numpy(np.asarray(sentence_to_vec("python"))).to(device)
print(sent)
model.eval()
tmp = model.forward(sent)
print("Prediction : ",F.softmax(tmp, dim = 1).to(torch.device("cpu")).detach().numpy())

tensor([[    2,  6724]], device='cuda:0')
Prediction :  [[0.5565129  0.44348708]]
