In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from itertools import repeat
from sklearn.model_selection import StratifiedKFold

from nltk.corpus import stopwords
import networkx as nx
import scipy.stats as st
import io
from glob import glob
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
import matplotlib.pyplot as plt
import dgl
import numpy as np
import networkx as nx
from os import path
import torch
from multiprocessing import Pool
import torch.optim as optim
from torch.utils.data import DataLoader
import re
from collections import namedtuple 
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer

# show zumbies nvidia executions: sudo fuser -v /dev/nvidia*

In [2]:
replace_patterns = [
    ('<[^>]*>', ''),                                    # remove HTML tags
    ('(\D)\d\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d:\d\d(\D)', '\\1 ParsedTime \\2'),
    ('(\D)\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D\D)\d\d\d\D\D\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedPhoneNum \\2'),
    ('(\D)\d\d\d\d\d\-\d\d\d\d(\D)', '\\1 ParsedZipcodePlusFour \\2'),
    ('(\D)\d(\D)', '\\1ParsedOneDigit\\2'),
    ('(\D)\d\d(\D)', '\\1ParsedTwoDigits\\2'),
    ('(\D)\d\d\d(\D)', '\\1ParsedThreeDigits\\2'),
    ('(\D)\d\d\d\d(\D)', '\\1ParsedFourDigits\\2'),
    ('(\D)\d\d\d\d\d(\D)', '\\1ParsedFiveDigits\\2'),
    ('(\D)\d\d\d\d\d\d(\D)', '\\1ParsedSixDigits\\2'),
    ('\d+', 'ParsedDigits')
]

compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]

def generate_preprocessor(replace_patterns):
    compiled_replace_patterns = [(re.compile(p[0]), p[1]) for p in replace_patterns]
    def preprocessor(text):
        # For each pattern, replace it with the appropriate string
        for pattern, replace in compiled_replace_patterns:
            text = re.sub(pattern, replace, text)
        text = text.lower()
        return text
    return preprocessor

generated_patters=generate_preprocessor(replace_patterns)

def preprocessor(text):
    # For each pattern, replace it with the appropriate string
    for pattern, replace in compiled_replace_patterns:
        text = re.sub(pattern, replace, text)
    text = text.lower()
    return text

In [3]:
class Graphsize(BaseEstimator, TransformerMixin):
    def __init__(self, lang='english', w=2, min_df=2, max_feat=999999999, stem=True, analyzer=None, verbose=False):
        self.lang = lang
        self.w = w
        self.min_df = min_df
        self.max_feat = max_feat
        if not verbose:
            self.progress_bar = lambda x: x
        else:
            from tqdm import tqdm
            self.progress_bar = tqdm
        
        self.stopwords = set(stopwords.words('english'))
        
        self._stem_ = lambda x: x
        if stem:
            from nltk.stem.snowball import SnowballStemmer
            self._stem_ = SnowballStemmer(lang).stem
            
        self.analyzer = analyzer
        if self.analyzer is None:
            self.analyzer = TfidfVectorizer(preprocessor=preprocessor)
        
        self.vocab = dict()
        self.df = Counter()
        
    def fit(self, X, y=None):
        self.N = len(X)
        list(map(self._build_df_, self.progress_bar(X)))
        self._filter_()
        self._build_vocab_()
        return self
        
    def _build_df_(self, text):
        terms = list( filter( lambda x: x not in self.stopwords,
                             self.analyzer.build_analyzer()(text) ))
        terms = list( map( self._stem_, terms ))
        self.df.update( set(terms) )
    
    def _filter_(self):
        self.df = self.df.most_common(self.max_feat)
        self.df = dict( list(filter( lambda x: x[1] >= self.min_df, self.df)) )
    
    def _build_vocab_(self):
        self.id2term = sorted(list(self.df.keys()))
        self.vocab = dict( [ (k,i) for (i,k) in enumerate(self.id2term) ] )
        
        self.vocab['<UNK>'] = len(self.id2term)
        self.id2term.append( '<UNK>' )
    
    
    def transform(self, text):
        docs = list(map(self.analyzer.build_analyzer(), self.progress_bar(text)))
        result = list(map(self._build_graph_, self.progress_bar(docs)))
        result = list(map(self._build_features_, self.progress_bar(result)))
        
        return result
    def _build_features_(self, param):
        G, tfidf, pg = param
        if len(G) == 0:
            G.add_node( self.vocab['<UNK>'] )
            return G, sp.csr_matrix(np.random.normal(size=len(self.vocab)))
        
        sorted_nodes = sorted( G.nodes )
        full_weight = np.array([tfidf])*np.array([pg]).T    
        
        # Using the Adj matrix
        A = nx.to_scipy_sparse_matrix( G, weight='freq', nodelist=sorted_nodes )
        A = A.multiply(full_weight)
        row,col = A.nonzero()
        col2 = [ sorted_nodes[c] for c in col ]
        nA = sp.csr_matrix( (A.data, (row, col2)), shape=(A.shape[0], len(self.vocab)) )
        

        return G, nA
    
    def _build_graph_(self, doc):
        terms = list(filter( lambda x: x in self.vocab, doc))
        terms = list(map( lambda x: self.vocab[x], terms ))
        sorted_terms = sorted(list(set(terms)))
        
        tf = Counter(terms)
        tfidf = dict( [ (k, v*np.log2((self.N+1)/self.df[self.id2term[k]])) for (k,v) in tf.items() ] )

        cooccur_count = Counter()
        for i,idt in enumerate(terms):
            terms_to_add = terms[ max(i-self.w, 0):i ]
            terms_to_add = list(zip(terms_to_add, repeat(idt)))
            terms_to_add = list(map(sorted,terms_to_add))
            terms_to_add = list(map(tuple,terms_to_add))
            cooccur_count.update( terms_to_add )
        
        G = nx.Graph()
        G.add_nodes_from( [ (k,{'tfidf': tfidf[k], 'tf': tf[k] }) for k in set(terms) ] )
        w_edges = [ (s,t,w) for ((s,t),w) in cooccur_count.items() ]
        G.add_weighted_edges_from( w_edges, weight='freq' )
        
        
        tfidf = [ tfidf[term] for term in sorted_terms ]
        tf = [ tf[term] for term in sorted_terms ]
        #Add self-loops
        G.add_weighted_edges_from( zip(sorted_terms, sorted_terms, tf), weight='freq' )
        pg = nx.pagerank( G )
        pg = [ pg[term] for term in sorted_terms ]
        
        return G, tfidf, pg

In [4]:
class Dataset(object):
    def __init__(self, dataset_path, random_state=42):
        self.random_state = random_state
        self.dataset_path = dataset_path
        self.dname = path.basename(path.abspath(dataset_path))
        self._load_dataset_()
        self._identify_splits_()
        self.nclass = len(set(self.y))
        self.split = {}
    @property
    def ndocs(self):
        return len(self.y)

    @staticmethod
    def read_lines(filename):
        with io.open(filename, newline='\n') as filin:
            return filin.readlines()
        
    @staticmethod
    def get_array(X, idxs):
        return [ X[idx] for idx in idxs ]
    
    @staticmethod
    def _load_splits_(folddir):
        splits = []
        with open(folddir, encoding='utf8', errors='ignore') as fileout:
            for line in fileout.readlines():
                fold = []
                for idx_part in line.split(';'):
                    index = list(map(int, idx_part.split()))
                    fold.append( index )
                splits.append( tuple(fold) )
        return splits
    
    def get_split(self, nfold, force_create=True, save=True, with_val=True):
        nfold = str(nfold)
        if nfold in self.split:
            folds = self.split[nfold]
            if not with_val: # extends train_index with val_index
                folds = self._split_without_val_(folds)
            return folds
        
        if nfold not in self.available_splits:
            if not force_create:
                raise Exception(f"[ERROR] The {nfold}-fold split doen't exists. Use force_create=True to create.")
            folds = self._create_splits_( int(nfold) )
            self.available_splits.add( nfold )
        else:
            split_file = path.join(self.dataset_path, 'splits', f'split_{nfold}.csv')
            folds = Dataset._load_splits_( split_file )
            
        if any([len(f[0]) == 3 for f in folds]):
            print(f"[WARNING] The {nfold}-split doen't have validation. Creating...", end=' ')
            folds = self._create_val_(folds)
            print("Done!")
            
        self.split[nfold] = folds
        
        if save:
            self.save_split(nfold)
        
        if not with_val: # extends train_index with val_index
            folds = self._split_without_val_(folds)
        
        return folds
    
    def get_fold_instances(self, nfold, force_create=True, save=True, with_val=True):
        splits = self.get_split( nfold, force_create=force_create, save=save, with_val=with_val)
        for s in splits:
            yield self._get_fold_instance_(s)
    
    def save_split(self, split, force_create=True):
        splits = self.get_split( split, force_create=force_create )
        split_file = path.join(self.dataset_path, 'splits', f'split_{split}.csv')
        with open(split_file, 'w', encoding='utf8', errors='ignore') as fileout:
            for train_index, val_index, test_index in splits:
                train_str = ' '.join(list(map(str, train_index)))
                val_str   = ' '.join(list(map(str, val_index)))
                test_str  = ' '.join(list(map(str, test_index)))
                line = train_str + ';' + val_str + ';' + test_str + '\n'
                fileout.write(line)
    
    def _create_splits_(self, k):
        skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=self.random_state)
        kf  = list(skf.split(self.texts, self.y))
        return kf
                
    def _split_without_val_(self, splits):
        nfolds = []
        for train_index, val_index, test_index in splits:
            train_index.extend(val_index)
            nfolds.append( (train_index, test_index) )
        return nfolds
    
    def _get_fold_instance_(self, s):
        if len(s) == 2:
            train_idx, test_idx = s
            train = ( Dataset.get_array( self.texts, train_idx ), Dataset.get_array( self.y, train_idx ) )
            test = ( Dataset.get_array( self.texts, test_idx ), Dataset.get_array( self.y, test_idx ) )
            return train, test
        elif len(s) == 3:
            train_idx, val_idx, test_idx = s
            train = ( Dataset.get_array( self.texts, train_idx ), Dataset.get_array( self.y, train_idx ) )
            val = ( Dataset.get_array( self.texts, val_idx ), Dataset.get_array( self.y, val_idx ) )
            test = ( Dataset.get_array( self.texts, test_idx ), Dataset.get_array( self.y, test_idx ) )
            return train, val, test
    
    def _create_val_(self, split):
        aux_split = []
        print(len(split[0]))
        for (train_ids, test_ids) in split:
            train_idx_atual, val_idx_atual = train_test_split(train_ids,
                                            test_size=len(test_ids),
                                            stratify=Dataset.get_array(self.y, train_ids))
            aux_split.append( (train_idx_atual, val_idx_atual, test_ids) )
        return aux_split

    def _load_dataset_(self):
        self.texts = Dataset.read_lines(path.join(self.dataset_path, 'texts.txt'))
        self.y = Dataset.read_lines(path.join(self.dataset_path, 'score.txt'))
        self.y = list(map(int, self.y))
        
    def _identify_splits_(self):
        splits_files = glob( path.join(self.dataset_path, 'splits', 'split_*.csv') )
        self.available_splits = set(map(lambda x: path.basename(x).replace('split_', '').replace('.csv', ''), splits_files ))
    

In [5]:
import json
import re
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
import gzip
import io
import pickle
from os import path, mkdir

import numpy as np

class NumpyEncoder(json.JSONEncoder):
    """ Special json encoder for numpy types """
    def default(self, obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
            np.int16, np.int32, np.int64, np.uint8,
            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, 
            np.float64)):
            return float(obj)
        elif isinstance(obj,(np.ndarray,)): #### This is the fix
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

def save_vectorize(vectorize, namefile):
	vecfile = open(namefile, 'wb')
	pickle.dump(vectorize, vecfile)
	vecfile.close()

def create_splits(X, y, k, seed):
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    return list(kf.split(X, y))

def load_json(path_json_file):
    if path_json_file is None:
        return []
    with open(path_json_file) as file_in:
        json_obj = json.load(file_in)
    return json_obj

def save_json(path_json_file, data):
    with open(path_json_file, 'w') as file_out:
        json.dump(data, file_out, cls=NumpyEncoder)
        
def is_jsonable(x):
    try:
        json.dumps(x, cls=NumpyEncoder)
        return True
    except (TypeError, OverflowError):
        return False


def create_path(path_to_create):
    path_to_create = path.abspath(path_to_create)
    paths = path_to_create.split(path.sep)
    complete_path = '/'
    for p in paths[1:]:
        complete_path = path.join(complete_path, p)
        if not path.exists(complete_path):
            mkdir( complete_path )

def read_texts(filename):
    with io.open(filename, newline='\n') as filin:
        return list(map(str.rstrip, filin.readlines()))

def read_dataset(pathname):
    texts = read_texts(path.join(pathname, 'texts.txt'))
    scores = read_texts(path.join(pathname, 'score.txt'))
    scores = list(map(int, scores))
    return texts,scores

def get_array(X, idxs):
    return [ X[idx] for idx in idxs ]
    
def save_splits_ids(splits, folddir):
    with open(folddir, 'w', encoding='utf8', errors='ignore') as fileout:
        for train_index, test_index in splits:
            line = ' '.join(list(map(str, train_index))) + ';' + ' '.join(list(map(str, test_index))) + '\n'
            fileout.write(line)

def load_splits_ids(folddir, with_val=False):
    splits = []
    with open(folddir, encoding='utf8', errors='ignore') as fileout:
        for line in fileout.readlines():
            parts = line.split(';')
            if len(parts) == 2:
                train_index, test_index = parts
                train_index = list(map(int, train_index.split()))
                test_index = list(map(int, test_index.split()))
                splits.append( (train_index, test_index) )
            elif len(parts) == 3:
                train_index, val_index, test_index = parts
                test_index = list(map(int, test_index.split()))
                val_index = list(map(int, val_index.split()))
                train_index = list(map(int, train_index.split()))
                if not with_val:
                    train_index.extend(val_index)
                    val_index = []
                splits.append( (train_index, val_index, test_index) )
            else:
                raise Exception("")
    return splits

def dump_svmlight_file_gz(X,y,filename):
    with gzip.open(filename, 'w') as filout:
        dump_svmlight_file(X, y, filout, zero_based=False)

def load_svmlight_fold(filename_train, filename_test):
    X_train, y_train = load_svmlight_file(filename_train, zero_based=False)
    X_test, y_test = load_svmlight_file(filename_test, n_features=X_train.shape[1], zero_based=False)
    
    return X_train, y_train, X_test, y_test

In [6]:
texts,scores = read_dataset('/home/mangaravite/Documents/datasets/topics/webkb/')
n_class = len(set(scores))
splits = load_splits_ids('/home/mangaravite/Documents/datasets/topics/webkb/split_10.csv')
train_idx_zero, test_idx_zero = splits[0]

#train_idx_zero, val_idx_zero = train_test_split(train_idx_zero,
#                                                test_size=len(test_idx_zero),
#                                                stratify=get_array(scores, train_idx_zero))

X_train, y_train = get_array(texts, train_idx_zero), get_array(scores, train_idx_zero)
#X_val, y_val = get_array(texts, val_idx_zero), get_array(scores, val_idx_zero)
X_test, y_test = get_array(texts, test_idx_zero), get_array(scores, test_idx_zero)

print(len(y_train), len(y_test))
#print(y_train[0], y_val[0])

7376 823


In [7]:
graph_builder = Graphsize(verbose=True)
Gs = graph_builder.fit_transform( X_train[:2] )
print(graph_builder.df, graph_builder.vocab)
print(Gs[0][0].nodes(data=True), Gs[0][0].edges(data=True))
print(Gs[0][1].A)

100%|██████████| 2/2 [00:00<00:00, 299.87it/s]
100%|██████████| 2/2 [00:00<00:00, 2199.43it/s]
100%|██████████| 2/2 [00:00<00:00, 553.63it/s]
100%|██████████| 2/2 [00:00<00:00, 787.29it/s]

{'world': 2, 'page': 2, 'back': 2, 'img': 2, 'mail': 2, 'work': 2, 'time': 2, 'web': 2} {'back': 0, 'img': 1, 'mail': 2, 'page': 3, 'time': 4, 'web': 5, 'work': 6, 'world': 7, '<UNK>': 8}
[(0, {'tfidf': 0.5849625007211562, 'tf': 1}), (1, {'tfidf': 0.5849625007211562, 'tf': 1}), (2, {'tfidf': 0.5849625007211562, 'tf': 1}), (3, {'tfidf': 1.7548875021634687, 'tf': 3}), (4, {'tfidf': 1.1699250014423124, 'tf': 2}), (5, {'tfidf': 1.1699250014423124, 'tf': 2}), (7, {'tfidf': 1.7548875021634687, 'tf': 3})] [(0, 1, {'freq': 1}), (0, 3, {'freq': 1}), (0, 7, {'freq': 1}), (0, 4, {'freq': 1}), (0, 0, {'freq': 1}), (1, 7, {'freq': 1}), (1, 3, {'freq': 1}), (1, 1, {'freq': 1}), (2, 5, {'freq': 1}), (2, 3, {'freq': 1}), (2, 2, {'freq': 1}), (3, 7, {'freq': 2}), (3, 4, {'freq': 1}), (3, 5, {'freq': 3}), (3, 3, {'freq': 3}), (4, 7, {'freq': 3}), (4, 4, {'freq': 2}), (4, 5, {'freq': 1}), (5, 7, {'freq': 1}), (5, 5, {'freq': 2}), (7, 7, {'freq': 3})]
[[0.0826648  0.0826648  0.         0.24799441 0.165329




In [8]:
%%time
max_feats = 250000
graph_builder = Graphsize(w=5, verbose=True, max_feat=max_feats)
Gs_train = graph_builder.fit_transform(X_train)
Gs_test  = graph_builder.transform(X_test)

100%|██████████| 7376/7376 [00:22<00:00, 334.53it/s]
100%|██████████| 7376/7376 [00:06<00:00, 1118.99it/s]
100%|██████████| 7376/7376 [01:49<00:00, 67.32it/s] 
100%|██████████| 7376/7376 [00:09<00:00, 805.80it/s]
100%|██████████| 823/823 [00:00<00:00, 1231.44it/s]
100%|██████████| 823/823 [00:11<00:00, 74.27it/s]
100%|██████████| 823/823 [00:00<00:00, 924.65it/s]

CPU times: user 2min 39s, sys: 1.26 s, total: 2min 40s
Wall time: 2min 40s





In [9]:
import torch
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv

In [10]:
class Classifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_heads, n_classes):
        super(Classifier, self).__init__()

        self.layers = nn.ModuleList([
            #GraphConv(in_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0')),
            #GraphConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))])
            GATConv(in_dim, hidden_dim, num_heads=n_heads, activation=F.relu,
                    feat_drop=0.5, attn_drop=0.5).to(torch.device('cuda:0')),
            GATConv(n_heads*hidden_dim, hidden_dim, num_heads=n_heads, activation=F.relu,
                    feat_drop=0.5, attn_drop=0.5).to(torch.device('cuda:0'))
        ])
        self.classify = nn.Linear(n_heads*hidden_dim, n_classes).to(torch.device('cuda:0'))
        #self.lin = nn.Linear(hidden_dim, 1).to(torch.device('cuda:0'))
        self.lin = nn.Linear(n_heads*hidden_dim, 1).to(torch.device('cuda:0'))

    def forward(self, G):
        h = G.ndata['f']
        for conv in self.layers:
            h = conv(G, h)
            h = h.view(h.shape[0], -1)
        w = self.lin(h)
        G.ndata['h'] = h
        G.ndata['w'] = w
        hg = dgl.mean_nodes(G, 'h', weight='w')
        return self.classify(hg)
    def transform(self, G):
        h = G.ndata['f']
        print(h.shape)
        for conv in self.layers:
            h = conv(G, h)
            print(h.shape)
        w = self.lin(h)
        G.ndata['h'] = h
        G.ndata['w'] = w
        hg = dgl.mean_nodes(G, 'h', weight='w')
        return hg

In [11]:
def collate(samples):
    Gs_Fs, labels = map(list, zip(*samples))
    graphs = []
    with Pool(processes=4) as pool:
        for g, f in Gs_Fs:
            g_dgl = dgl.DGLGraph()
            g_dgl.from_networkx(g)
            g_dgl.ndata['f'] = torch.FloatTensor(f.A).to(torch.device('cuda:0'))
            g_dgl.to(torch.device('cuda:0'))
            graphs.append(g_dgl)
    batched_graph = dgl.batch(graphs)
    batched_graph.to(torch.device('cuda:0'))
    labels = torch.tensor(labels).to(torch.device('cuda:0'))
    return batched_graph, labels

In [12]:
model = Classifier(len(graph_builder.vocab), 300, 16, n_class).to(torch.device('cuda:0'))
loss_func = nn.CrossEntropyLoss().to(torch.device('cuda:0'))
optimizer = optim.Adam(model.parameters(), lr=0.00001)
model.train()
torch.cuda.synchronize()
epoch_losses = []
n_epochs = 10

In [13]:
data_loader_test = DataLoader(list(zip(Gs_test, y_test)), batch_size=128, shuffle=True, collate_fn=collate)

In [14]:
for epoch in range(n_epochs):
    data_loader = DataLoader(list(zip(Gs_train, y_train)), batch_size=16, shuffle=True, collate_fn=collate)
    epoch_loss = 0
    with tqdm_notebook(total=len(data_loader.dataset), smoothing=0.) as pbar:
        t0 = time()
        for bg, label in data_loader:
            outputs = model(bg)
            loss = loss_func(outputs, label)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.detach().item()
            del loss, outputs, bg
            pbar.update( len(label) )
            print('iter {}, loss {:.4f} ({:.2f})'.format(epoch, epoch_loss / (epoch + 1), (time()-t0)), end='\r')
    with tqdm_notebook(total=len(data_loader_test.dataset), smoothing=0.) as pbar:
        with torch.no_grad():
            total = 0
            correct = 0
            for bg, label in data_loader_test:
                outputs = model(bg)
                _, predicted = torch.max(outputs, 1)
                total += label.size(0)
                correct += (predicted == label).sum().item()
                del predicted, outputs, bg
                pbar.update( label.size(0) )
                print('acc test {:.4f}'.format(correct/total), end='\r')
        epoch_loss /= (epoch + 1)
        epoch_losses.append(epoch_loss)
    print()

Widget Javascript not detected.  It may not be installed or enabled properly.


iter 0, loss 9107.8693 (229.66)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4520



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 1, loss 930.7006 (228.83)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4569



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 2, loss 404.3160 (231.69)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4897



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 3, loss 285.1148 (231.44)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4714



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 4, loss 206.6796 (232.61)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4690



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 5, loss 13957.3755 (230.17)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4532



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 6, loss 519.3247 (234.79)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4557



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 7, loss 353.3630 (235.51)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4593



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 8, loss 238.9854 (231.81)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4605



Widget Javascript not detected.  It may not be installed or enabled properly.


iter 9, loss 142.0671 (233.82)


Widget Javascript not detected.  It may not be installed or enabled properly.


acc test 0.4775



In [15]:
import torch
import dgl.function as fn
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GraphConv, GATConv

In [16]:
n = len(graph_builder.vocab)
class_mtx = [ sp.lil_matrix((n,n)) for _ in range(n_class) ]  
for (G,feats), y in tqdm_notebook(zip(Gs_train, y_train), total=len(y_train)):
    print(len(G))
    nodes = sorted(G.nodes)
    A = nx.to_scipy_sparse_matrix(G, nodelist=nodes)

Widget Javascript not detected.  It may not be installed or enabled properly.


42
52
94
60
31
29
33
58
35
34
5
36
50
65
83
35
65
48
101
35
31
30
31
23
62
61
13
18
27
27
38
33
52
142
34
23
20
40
47
19
1
54
77
46
62
48
34
29
31
62
24
54
70
48
64
139
5
11
24
60
61
44
39
31
65
26
27
9
43
39
34
12
14
54
22
65
27
51
4
23
23
27
40
57
34
28
116
22
41
26
29
32
21
10
9
19
14
25
12
30
33
87
100
22
30
16
51
25
53
1
46
46
42
32
63
41
8
48
14
15
85
18
19
2
31
14
63
4
27
24
52
57
1
47
23
29
42
20
23
31
19
11
32
17
81
70
17
80
49
27
69
31
21
9
27
1
22
10
81
27
36
68
27
79
121
23
57
46
18
112
37
28
27
24
52
21
32
52
40
33
20
41
43
140
12
21
31
7
36
40
34
44
10
69
89
15
14
24
32
45
25
33
17
7
9
35
9
49
13
54
15
85
26
35
22
18
54
10
8
8
55
17
28
37
64
20
3
33
20
97
61
40
62
36
15
26
34
14
54
9
39
14
41
13
23
29
17
93
16
38
55
80
22
51
104
8
15
33
13
8
62
3
31
22
28
146
51
15
25
55
62
11
15
29
65
17
52
19
65
46
31
36
28
42
43
62
1
39
47
18
36
41
17
8
11
47
48
35
23
74
28
30
21
38
19
28
71
32
34
17
38
35
41
6
38
10
23
53
29
83
33
59
20
24
35
27
46
46
86
15
129
24
17
92
23
32
77
24
27

96
41
71
158
38
37
51
31
90
15
51
21
24
96
4
85
12
15
8
120
37
54
81
29
77
20
24
30
52
174
80
51
14
21
96
6
13
46
43
34
16
35
24
20
32
15
13
52
68
25
15
185
66
6
5
21
49
31
71
9
14
112
63
45
49
6
66
156
80
23
90
24
7
28
8
80
3
52
35
61
85
9
5
4
68
69
21
10
10
468
40
26
69
71
48
19
22
5
1
36
62
15
95
44
8
43
126
52
81
96
25
297
1
34
49
33
124
20
29
333
213
82
94
7
49
76
22
23
2
86
8
18
115
11
20
196
28
39
52
79
35
44
26
15
61
29
55
2
28
9
5
59
30
138
11
25
12
29
91
81
222
10
33
89
24
17
40
29
62
55
8
12
95
43
22
59
42
41
70
145
88
32
91
37
21
16
5
78
54
48
34
216
100
49
18
7
37
25
71
57
42
17
116
15
84
123
12
249
48
119
26
15
71
45
20
53
12
150
27
57
20
136
9
43
91
33
163
261
132
37
10
78
40
44
41
39
38
161
40
132
11
36
49
77
10
51
15
3
29
71
16
16
4
36
18
31
75
59
21
26
39
37
88
6
63
160
60
43
33
43
33
65
37
13
61
33
96
199
17
15
28
15
45
11
9
24
50
12
27
51
21
8
49
92
15
25
14
97
68
24
150
17
138
28
20
66
1
148
8
24
15
264
113
11
8
110
43
23
282
32
54
88
22
49
26
113
39
94
7
73
85
30


37
41
26
59
23
23
12
50
78
19
65
63
224
8
12
23
24
27
40
34
38
108
46
35
35
35
56
44
96
182
43
17
60
72
12
41
58
18
54
18
16
21
95
20
28
35
44
354
15
38
15
58
56
103
59
116
89
20
146
20
13
46
50
116
17
25
50
38
44
11
5
30
21
83
46
37
100
61
26
74
29
30
54
98
54
51
44
61
26
43
64
14
45
37
29
178
30
47
20
17
39
36
20
48
43
32
30
20
156
36
33
51
31
41
17
73
3
30
141
69
90
23
42
24
29
21
25
5
54
26
32
54
61
77
39
46
76
36
31
13
99
27
34
76
85
72
25
26
55
29
42
76
28
21
105
26
85
34
32
110
41
27
37
47
21
13
113
63
38
22
54
37
72
33
68
63
15
50
16
45
106
39
46
52
31
34
52
11
16
26
93
80
35
80
35
89
18
16
4
42
49
111
22
26
54
35
33
54
358
25
51
16
52
39
72
108
19
64
47
34
30
38
4
255
42
38
36
172
48
78
45
59
31
34
73
43
40
35
56
69
159
40
69
30
30
13
191
31
51
26
42
22
99
23
59
111
96
86
77
101
10
94
51
19
55
58
66
19
27
19
20
65
25
29
71
99
67
37
100
73
29
58
28
63
17
103
86
46
183
76
65
128
19
30
66
72
47
10
16
77
27
13
48
36
157
95
35
107
25
50
30
82
18
52
41
47
37
47
47
62
14
45
112
31
28

In [17]:
/home/mangaravite/Documents/TGAhidden_dim = 300
num_heads = 4
gcn1 = GraphConv(len(graph_builder.vocab), hidden_dim, activation=F.relu).to(torch.device('cuda:0'))
#gcn2 = GATConv(hidden_dim, hidden_dim, activation=F.relu).to(torch.device('cuda:0'))
gat = GATConv(hidden_dim, hidden_dim, num_heads, feat_drop=0.5, attn_drop=0.5, activation=F.relu).to(torch.device('cuda:0'))
lin = nn.Linear(num_heads*hidden_dim, 1).to(torch.device('cuda:0'))

In [18]:
del gcn1, gcn2, lin

NameError: name 'gcn2' is not defined

In [None]:
gcn1

In [None]:
torch.cuda.empty_cache()

In [None]:
%%time
for G, l in tqdm(data_loader):
    h = G.ndata['f']
    h = gcn1(G,h)
    h = gat(G,h)
    h = h.view(h.shape[0], -1)
    w = lin(h)
    G.ndata['h'] = h
    G.ndata['w'] = w
    hg = dgl.mean_nodes(G, 'h', weight='w')
    #del h, G, w
    break

In [None]:
hg.shape

In [None]:
print(h.shape)
h.view(h.shape[0], -1).shape

In [None]:
import gc
def pretty_size(size):
	"""Pretty prints a torch.Size object"""
	assert(isinstance(size, torch.Size))
	return " × ".join(map(str, size))

def dump_tensors(gpu_only=True):
	"""Prints a list of the Tensors being tracked by the garbage collector."""
	import gc
	total_size = 0
	for obj in gc.get_objects():
		try:
			if torch.is_tensor(obj):
				if not gpu_only or obj.is_cuda:
					print("%s:%s%s %s" % (type(obj).__name__, 
										  " GPU" if obj.is_cuda else "",
										  " pinned" if obj.is_pinned else "",
										  pretty_size(obj.size())))
					total_size += obj.numel()
					del obj
					gc.collect()
			elif hasattr(obj, "data") and torch.is_tensor(obj.data):
				if not gpu_only or obj.is_cuda:
					print("%s → %s:%s%s%s%s %s" % (type(obj).__name__, 
												   type(obj.data).__name__, 
												   " GPU" if obj.is_cuda else "",
												   " pinned" if obj.data.is_pinned else "",
												   " grad" if obj.requires_grad else "", 
												   " volatile" if obj.volatile else "",
												   pretty_size(obj.data.size())))
					total_size += obj.data.numel()
					del obj
					gc.collect()
		except Exception as e:
			pass        
	print("Total size:", total_size)
dump_tensors()
gc.collect()

In [None]:
def dump_tensors(gpu_only=True):
		torch.cuda.empty_cache()
		total_size = 0
		for obj in gc.get_objects():
			try:
				if torch.is_tensor(obj):
					if not gpu_only or obj.is_cuda:
						del obj
						gc.collect()
				elif hasattr(obj, "data") and torch.is_tensor(obj.data):
					if not gpu_only or obj.is_cuda:
						del obj
						gc.collect()
			except Exception as e:
				pass
dump_tensors()

In [None]:
sorted_list = sorted(nx.pagerank(G).items(), key=lambda x: x[0])
sizes = np.array(list(zip(*sorted_list))[1])
sizes = (sizes-sizes.min())/(sizes.max()-sizes.min())
sizes = np.power(np.sqrt(sizes*300), 2)
sizes

In [None]:
import matplotlib.pyplot as plt
print(nx.info(G))
nx.draw(G, with_labels=True, node_size=sizes,pos=nx.spring_layout(G))

In [None]:
torch.cuda.empty_cache()