In [1]:
from glob import glob
from nltk.corpus import stopwords
from itertools import repeat

from collections import namedtuple
from os import path, remove
import io
import re
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import networkx as nx

from collections import Counter
import scipy.sparse as sp

import numpy as np

replace_patterns = [
    ('<[^>]*>', ''),                                    # remove HTML tags
    ('(\D)\d\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D\D)\d\d\d\D\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedZipcodePlusFour \\2'),
    ('(\D)\d(\D)', '\\1ParsedOneDigit\\2'),
    ('(\D)\d\d(\D)', '\\1ParsedTwoDigits\\2'),
    ('(\D)\d\d\d(\D)', '\\1ParsedThreeDigits\\2'),
    ('(\D)\d\d\d\d(\D)', '\\1ParsedFourDigits\\2'),
    ('(\D)\d\d\d\d\d(\D)', '\\1ParsedFiveDigits\\2'),
    ('(\D)\d\d\d\d\d\d(\D)', '\\1ParsedSixDigits\\2'),
    ('\d+', 'ParsedDigits')
]

compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]

def generate_preprocessor(replace_patterns):
    compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]
    def preprocessor(text):
        # For each pattern, replace it with the appropriate string
        for pattern, replace in compiled_replace_patterns:
            text = re.sub(pattern, replace, text)
        text = text.lower()
        return text
    return preprocessor

generated_patters=generate_preprocessor(replace_patterns)

def preprocessor(text):
    # For each pattern, replace it with the appropriate string
    for pattern, replace in compiled_replace_patterns:
        text = re.sub(pattern, replace, text)
    text = text.lower()
    return text


In [2]:
class GraphsizePretrained(BaseEstimator, TransformerMixin):
    def __init__(self, w=2, pretrained_vec='glove.6B.100d', verbose=False):
        self.w = w
        self.pretrained_vec = pretrained_vec
        self.embeddings_dict = {}
        
        if not verbose:
            self.progress_bar = lambda x: x
        else:
            from tqdm import tqdm
            self.progress_bar = tqdm
            
        with open(self.pretrained_vec, 'r') as f:
            for line in self.progress_bar(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                self.ndim = len(vector)
                self.embeddings_dict[word] = vector
        self.vocab = { word: i for (i,word) in enumerate( self.embeddings_dict.keys() ) }
        
        self.analyzer = TfidfVectorizer(preprocessor=preprocessor)
        
    def fit(self, X, y=None):
        self.N = len(X)
        return self
   
    def transform(self, text):
        docs = list(map(self.analyzer.build_analyzer(), self.progress_bar(text)))
        result = list(map(self._build_graph_, self.progress_bar(docs)))
        return result
    
    def _build_graph_(self, doc):
        terms    = list(filter( lambda x: x in self.embeddings_dict, doc))
        sorted_terms = sorted(list(set(terms)))

        cooccur_count = Counter()
        for i,idt in enumerate(terms):
            terms_to_add = terms[ max(i-self.w, 0):i ]
            terms_to_add = list(zip(terms_to_add, repeat(idt)))
            terms_to_add = list(map(sorted,terms_to_add))
            terms_to_add = list(map(tuple,terms_to_add))
            cooccur_count.update( terms_to_add )
        
        G = nx.Graph()
        G.add_nodes_from( sorted_terms )
        w_edges = [ (s,t,w) for ((s,t),w) in cooccur_count.items() ]
        G.add_weighted_edges_from( w_edges, weight='freq' )
        
        return G, np.array([ self.embeddings_dict[term] for term in sorted_terms ])

In [3]:
from utils import Dataset
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
from time import time
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [4]:
webkb = Dataset('/home/mangaravite/Documents/datasets/topics/webkb/')
reut  = Dataset('/home/mangaravite/Documents/datasets/topics/reut/')
acm   = Dataset('/home/mangaravite/Documents/datasets/topics/acm/')
_20ng   = Dataset('/home/mangaravite/Documents/datasets/topics/20ng/')

dataset = acm

In [5]:
fold = next(dataset.get_fold_instances(10))
fold._fields

('X_train', 'y_train', 'X_test', 'y_test', 'X_val', 'y_val')

In [6]:
%%time
graph_builder = GraphsizePretrained(w=5, verbose=True, pretrained_vec='/home/mangaravite/Documents/pretrained_vectors/glove.6B.300d.txt')
Gs_train = graph_builder.fit_transform(fold.X_train)
Gs_val   = graph_builder.transform(fold.X_val)

400000it [00:20, 19409.48it/s]
100%|██████████| 19907/19907 [00:05<00:00, 3673.38it/s]
100%|██████████| 19907/19907 [00:12<00:00, 1546.86it/s]
100%|██████████| 2495/2495 [00:00<00:00, 3902.91it/s]
100%|██████████| 2495/2495 [00:01<00:00, 1818.86it/s]

CPU times: user 40.1 s, sys: 1.58 s, total: 41.7 s
Wall time: 41 s





In [7]:
import torch
import dgl
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv
from dgl.nn.pytorch.glob import GlobalAttentionPooling

import torch.optim as optim
from torch.utils.data import DataLoader

In [8]:
class ClassifierGCN(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, drop=.5):
        super(ClassifierGCN, self).__init__()

        self.encoder = nn.Linear(in_dim, hidden_dim).to(torch.device('cuda:0'))
        
        self.layers = nn.ModuleList([
            GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))
        ])
        
        self.lin = nn.Linear(2*hidden_dim, 1).to(torch.device('cuda:0'))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device('cuda:0'))
        
        self.norm = nn.BatchNorm1d( 2*hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear(2*hidden_dim, n_classes).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        he = self.encoder(h)
        h = he
        for conv in self.layers:
            h = conv(G, h)
        
        # CONCAT he E hg
        hg = torch.cat((h,he), 1)
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        
        pred = self.classify( hg )
        return pred
class ClassifierGAT(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, n_heads=16, drop=.5, attn_drop=.5, device='cuda:0'):
        super(ClassifierGAT, self).__init__()

        self.encoder = nn.Linear(in_dim, hidden_dim).to(torch.device(device))
        
        self.layers = nn.ModuleList([
            GATConv(hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=attn_drop).to(torch.device(device)),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.leaky_relu,
                    feat_drop=drop, attn_drop=attn_drop).to(torch.device(device))
        ])
        
        self.lin = nn.Linear(n_heads*hidden_dim + hidden_dim, 1).to(torch.device(device))
        self.pooling = GlobalAttentionPooling( self.lin ).to(torch.device(device))
        
        self.norm = nn.BatchNorm1d( n_heads*hidden_dim + hidden_dim )
        self.drop = nn.Dropout(drop)
        
        self.classify = nn.Linear( n_heads*hidden_dim + hidden_dim, n_classes).to(torch.device(device))

    def forward(self, G):
        h = G.ndata['f']
        he = self.encoder(h)
        h = he
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        
        # CONCAT he E hg
        hg = torch.cat((h,he), 1)
        hg = self.norm( hg )
        hg = self.drop( hg )
        hg = self.pooling(G, hg)
        
        pred = self.classify( hg )
        return pred

In [9]:
def collate(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    for g, f in Gs_Fs:
        g_dgl = dgl.DGLGraph()
        g_dgl.from_networkx(g)
        g_dgl.ndata['f'] = torch.FloatTensor(f).to(torch.device('cuda:0'))
        g_dgl.to(torch.device('cuda:0'))
        graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

class FocalLoss(nn.Module):
    # https://github.com/mbsariyildiz/focal-loss.pytorch
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha, (float, int)): self.alpha = torch.Tensor([alpha, 1 - alpha])
        if isinstance(alpha, list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0), input.size(1), -1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1, 2)                         # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1, input.size(2))    # N,H*W,C => N*H*W,C
        target = target.view(-1, 1)

        logpt = F.log_softmax(input, dim=1)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = logpt.exp()

        if self.alpha is not None:
            if self.alpha.type() != input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0, target.data.view(-1))
            logpt = logpt * at

        loss = -1 * (1 - pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [15]:
PATH = 'best_param_pretrained_wekb.pth'
n_heads=4
n_epochs = 100
patience = 25
hidden_dim = 300
train_batch_size = 32
test_val_batch_size = 256

#model = SimpleClassifierGCN(len(graph_builder.vocab), hidden_dim, dataset.nclass, drop=.5).to(torch.device('cuda:0'))
model = ClassifierGAT(graph_builder.ndim, hidden_dim, dataset.nclass, n_heads=n_heads, drop=.5, attn_drop=.3).to(torch.device('cuda:0'))
#model = ClassifierGCN(graph_builder.ndim, hidden_dim, dataset.nclass, drop=.5).to(torch.device('cuda:0'))

In [16]:
#loss_func = FocalLoss().to(torch.device('cuda:0'))
loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))

optimizer = optim.Adam( model.parameters(), lr=1e-3, weight_decay=1e-3)
#optimizer = optim.AdamW( model.parameters(), lr=1e-2, weight_decay=1e-3)

#optimizer = optim.RMSprop( model.parameters(), lr=1e-2, weight_decay=1e-4)
#optimizer = optim.RMSprop( model.parameters(), lr=0.0001 )

model.train()
torch.cuda.synchronize()
epoch_losses = []

In [17]:
data_loader_val  = DataLoader(list(zip(Gs_val,  fold.y_val )), batch_size=test_val_batch_size,
                              shuffle=True, collate_fn=collate)

In [18]:
best_score = None
n_iters = 0

for epoch in range(n_epochs):
    data_loader = DataLoader(list(zip(Gs_train, fold.y_train)), batch_size=train_batch_size,
                             shuffle=True, collate_fn=collate)
    epoch_loss = 0
    with tqdm(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        total = 0
        correct = 0
        model.train()
        for i, (bg, label) in enumerate(data_loader):
            outputs = model(bg)
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)
            
            # Train eval phase
            total += label.size(0)
            correct += (sampled_Y == label).sum().item()
            
            # NN backprop phase
            loss = loss_func(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            
            del loss, outputs, bg, probs_Y, sampled_Y
            pbar.update( len(label) )
            pbar.set_description_str('iter {}, train acc {:.3f} train loss {:.2f}'.format(epoch, (correct/total), epoch_loss / (epoch + 1)))
        
        score_train = correct/total
    with tqdm(total=len(data_loader_val.dataset), smoothing=0.) as pbar:
        model.eval()
        total = 0
        correct = 0
        epoch_loss = 0.
        for bg, label in data_loader_val:
            with torch.no_grad():
                outputs = model(bg)
            
            probs_Y = torch.softmax(outputs, 1)
            sampled_Y = torch.argmax(probs_Y, 1).reshape(-1)

            # Validation eval phase
            total += label.size(0)
            correct += (sampled_Y == label).sum().item()
            
            #break
            
            del probs_Y, outputs, bg, sampled_Y
            pbar.update( label.size(0) )
            score_val = correct/total

            pbar.set_description_str('iter {}, val   acc {:.3f} ( over: {:.3} )'.format(epoch, score_val, score_val/score_train))
            
        #break
        pbar.set_description_str('iter {}, val  acc {:.3f} ( over: {:.3}/{} )'.format(epoch, score_val, score_val/score_train, n_iters))
        score = correct/total
        if best_score is None or score > best_score:
            torch.save(model, PATH)
            best_score = score
            n_iters = 0
        else:
            n_iters += 1
            if n_iters >= patience:
                print()
                print('BEST val acc {:.3f}'.format(best_score), end='\r')
                break
        pbar.set_description_str('iter {}, val   acc {:.3f} ( over: {:.3}/{} )'.format(epoch, score_val, score_val/score_train, n_iters))
        epoch_loss /= (epoch + 1)
        epoch_losses.append(epoch_loss)

iter 0, train acc 0.584 train loss 840.94: 100%|██████████| 19907/19907 [01:05<00:00, 305.10it/s]
iter 0, val   acc 0.657 ( over: 1.12/0 ): 100%|██████████| 2495/2495 [00:06<00:00, 380.70it/s]
iter 1, train acc 0.654 train loss 343.60: 100%|██████████| 19907/19907 [01:05<00:00, 304.25it/s]
iter 1, val   acc 0.675 ( over: 1.03/0 ): 100%|██████████| 2495/2495 [00:06<00:00, 383.00it/s]
iter 2, train acc 0.674 train loss 214.53: 100%|██████████| 19907/19907 [01:05<00:00, 306.00it/s]
iter 2, val   acc 0.678 ( over: 1.01/0 ): 100%|██████████| 2495/2495 [00:06<00:00, 385.53it/s]
iter 3, train acc 0.688 train loss 153.24: 100%|██████████| 19907/19907 [01:04<00:00, 307.96it/s]
iter 3, val   acc 0.688 ( over: 1.0/0 ): 100%|██████████| 2495/2495 [00:06<00:00, 360.45it/s]
iter 4, train acc 0.697 train loss 120.40: 100%|██████████| 19907/19907 [01:04<00:00, 307.21it/s]
iter 4, val   acc 0.693 ( over: 0.995/0 ): 100%|██████████| 2495/2495 [00:06<00:00, 361.85it/s]
iter 5, train acc 0.696 train loss 


BEST val acc 0.721




In [None]:
###################################### acm ######################################
###################____________________ GAT ____________________###################
# BEST val acc 0.721 10folds
#       CrossEntropyLoss, Adam
#       n_heads=4, drop=.5, attn_drop=.3
#       lr=1e-3, w=5
#       weight_decay=1e-3,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = _
#       hidden_dim = 300
#       train_batch_size = 32
#       test_val_batch_size = 256
###################____________________ GAT ____________________###################
# BEST val acc 0.708 iter=20 10folds
#       CrossEntropyLoss, Adam
#       n_heads=8, drop=.3, attn_drop=.2
#       lr=1e-3, w=5
#       weight_decay=1e-3,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = _
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256

In [None]:
###################################### reut ######################################
###################____________________ GAT ____________________###################
# BEST val acc 0.753(/0.747) iter=24 10folds (aparente underfitting) 
#       CrossEntropyLoss, Adam
#       n_heads=4, drop=.5, attn_drop=.3
#       lr=1e-3, w=5
#       weight_decay=1e-3,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = _
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256

In [None]:
###################################### webkb ######################################
###################____________________ GAT ____________________###################
# BEST val acc 0.820 iter=15 10folds
#       CrossEntropyLoss, Adam
#       n_heads=4, drop=.5, attn_drop=.3
#       lr=1e-3, w=5
#       weight_decay=1e-3,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = 25
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256
###################____________________ GAT ____________________###################
# BEST val acc 0.815 10folds (aparente underfitting)
#       CrossEntropyLoss, Adam
#       n_heads=4, drop=.5, attn_drop=.3
#       lr=1e-3, w=5
#       weight_decay=1e-3,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = 12
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256

In [None]:
###################################### 20ng ######################################
###################____________________ GAT ____________________###################
# BEST val acc 0.764 iter=27 10folds
#       FocalLoss, Adam
#       n_heads=16, drop=.2, attn_drop=.1
#       lr=1e-3, w=5
#       weight_decay=1e-4,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = _
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256
###################____________________ GAT ____________________###################
# BEST val acc 0.767 iter=31 10folds
#       CrossEntropyLoss, Adam
#       n_heads=8, drop=.2, attn_drop=.1
#       lr=1e-3, w=5
#       weight_decay=1e-4,
#       PATH = 'best_param_pretrained_wekb.pth'
#       n_epochs = 100
#       patience = _
#       hidden_dim = 300
#       train_batch_size = 16
#       test_val_batch_size = 256