In [8]:
import warnings
warnings.filterwarnings('ignore')

from octis.dataset.dataset import Dataset

#Import models:
from octis.models.GINOPIC import GINOPIC

#Import coherence metrics:
from octis.evaluation_metrics.coherence_metrics import *

#Import TD metrics:
from octis.evaluation_metrics.diversity_metrics import *

#Import classification metrics:
from octis.evaluation_metrics.classification_metrics import *

import random, torch, json

In [3]:
data_dir = '../preprocessed_datasets'

def get_dataset(dataset_name):
    data = Dataset()
    if dataset_name=='20NG':
        data.fetch_dataset("20NewsGroup")
    elif dataset_name=='SO':
        data.load_custom_dataset_from_folder(data_dir + "/SO")
    elif dataset_name=='BBC':
        data.fetch_dataset("BBC_News")
    elif dataset_name=='Bio':
        data.load_custom_dataset_from_folder(data_dir + "/Bio")
    elif dataset_name=='SearchSnippets':
        data.load_custom_dataset_from_folder(data_dir + "/SearchSnippets")
    else:
        raise Exception('Missing Dataset name...!!!')
    return data

In [5]:
import os
from random import randint
from IPython.display import clear_output

seeds = [randint(0, 9999) for _ in range(1)]

n_topics = {
    '20NG': [20, 50, 100],
    'BBC': [5, 20, 50, 100],
    'Bio': [20, 50, 100],
    'SO': [20, 50, 100],
    'SearchSnippets': [8, 20, 50, 100],
}

m = 'GINopic'
datasets = ['20NG', 'BBC', 'Bio', 'SO', 'SearchSnippets']

params = {
    '20NG': {
        'num_gin_layers': 2,
        'g_feat_size': 2048,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 200,
        'gin_output_dim': 768,
        'eps_simGraph': 0.4
    },
    'BBC': {
        'num_gin_layers': 3,
        'g_feat_size': 256,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 512,
        'eps_simGraph': 0.3
    },
    'Bio': {
        'num_gin_layers': 2,
        'g_feat_size': 1024,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 200,
        'gin_output_dim': 256,
        'eps_simGraph': 0.05
    },
    'SO': {
        'num_gin_layers': 2,
        'g_feat_size': 64,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 300,
        'gin_output_dim': 512,
        'eps_simGraph': 0.1
    },
    'SearchSnippets': {
        'num_gin_layers': 2,
        'g_feat_size': 1024,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 256,
        'eps_simGraph': 0.2
    }
}

results = {
    'Dataset': [],
    'K': [],
    'Seed': [],
    'Model':[],
    'NPMI': [],
    'CV': [],
    'Accuracy': []
}

irbo = InvertedRBO(topk=10, weight=.95)

partition = True
validation = False

In [6]:
for seed in seeds:
    for d in datasets:
        for k in n_topics[d]:
            data = get_dataset(d)

            print('Results:-\n', results)

            print("-"*100)
            print('Dataset:{},\t Model:{},\t K={},\t Seed={}'.format(d, m, k, seed))
            print("-"*100)

            random.seed(seed)
            torch.random.manual_seed(seed)

            model = GINOPIC(num_topics=k,
                 use_partitions=partition,
                 use_validation=validation,
                 num_epochs=50,
                 w2v_path='./w2v/{}_part{}_valid{}/'.format(d, partition, validation),
                 graph_path='./doc_graphs/{}_part{}_valid{}/'.format(d, partition, validation),
                 num_gin_layers=params[d]['num_gin_layers'],
                 g_feat_size=params[d]['g_feat_size'],
                 num_mlp_layers=params[d]['num_mlp_layers'],
                 gin_hidden_dim=params[d]['gin_hidden_dim'],
                 gin_output_dim=params[d]['gin_output_dim'],
                 eps_simGraph=params[d]['eps_simGraph']
                )

            output = model.train_model(dataset=data)

            del model
            torch.cuda.empty_cache()

            #Hyperparams:
            results['Dataset'].append(d)
            results['Model'].append(m)
            results['K'].append(k)
            results['Seed'].append(seed)
            #############

            #Coherence Scores:
            npmi = Coherence(texts=data.get_corpus(), topk=10, measure='c_npmi')
            results['NPMI'].append(npmi.score(output))
            del npmi

            cv = Coherence(texts=data.get_corpus(), topk=10, measure='c_v')
            results['CV'].append(cv.score(output))
            del cv

            #############
            if partition==True:
                #classification:
                try:
                    #Accuracy
                    accuracy = AccuracyScore(data)
                    results['Accuracy'].append(accuracy.score(output))
                except:
                    results['Accuracy'].append(0.0)
            else:
                results['Accuracy'].append(0.0)
            #############
            clear_output(wait=False)  

In [9]:
open('results.json', 'w+').write(json.dumps(results))

1633