In [22]:
import warnings
warnings.filterwarnings('ignore')

import sys, os
sys.path.insert(0, os.path.abspath('..'))

import pandas as pd
import numpy as np

from octis.dataset.dataset import Dataset
from octis.models.GINOPIC import GINOPIC
from octis.evaluation_metrics.coherence_metrics import *
from octis.evaluation_metrics.diversity_metrics import *
from octis.evaluation_metrics.classification_metrics import *
import random, torch, json
from random import randint
from IPython.display import clear_output

# Loading Functions

In [7]:
data_dir = '../preprocessed_datasets'

def get_dataset(dataset_name):
    data = Dataset()
    if dataset_name=='20NG':
        data.fetch_dataset("20NewsGroup")
    elif dataset_name=='SO':
        data.load_custom_dataset_from_folder(data_dir + "/SO")
    elif dataset_name=='BBC':
        data.fetch_dataset("BBC_News")
    elif dataset_name=='Bio':
        data.load_custom_dataset_from_folder(data_dir + "/Bio")
    elif dataset_name=='SearchSnippets':
        data.load_custom_dataset_from_folder(data_dir + "/SearchSnippets")
    elif dataset_name=='BRNews':
        data.load_custom_dataset_from_folder(data_dir + "/BRNews")
    elif dataset_name=='hotels_reviews':
        data.load_custom_dataset_from_folder(data_dir + "/hotels_reviews")
    else:
        raise Exception('Missing Dataset name...!!!')
    return data

# Hiper parametros

In [8]:
params = {
    '20NG': {
        'num_gin_layers': 2,
        'g_feat_size': 2048,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 200,
        'gin_output_dim': 768,
        'eps_simGraph': 0.4
    },
    'BBC': {
        'num_gin_layers': 3,
        'g_feat_size': 256,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 512,
        'eps_simGraph': 0.3
    },
    'Bio': {
        'num_gin_layers': 2,
        'g_feat_size': 1024,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 200,
        'gin_output_dim': 256,
        'eps_simGraph': 0.05
    },
    'SO': {
        'num_gin_layers': 2,
        'g_feat_size': 64,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 300,
        'gin_output_dim': 512,
        'eps_simGraph': 0.1
    },
    'SearchSnippets': {
        'num_gin_layers': 2,
        'g_feat_size': 1024,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 256,
        'eps_simGraph': 0.2
    },
    'BRNews': {
        'num_gin_layers': 2,
        'g_feat_size': 64,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 256,
        'eps_simGraph': 0.4
    },
    'hotels_reviews': {
        'num_gin_layers': 2,
        'g_feat_size': 64,
        'num_mlp_layers': 1,
        'gin_hidden_dim': 50,
        'gin_output_dim': 256,
        'eps_simGraph': 0.3
    }
}

results = {
    'Dataset': [],
    'K': [],
    'Seed': [],
    'Model':[],
    'NPMI': [],
    'CV': [],
    'Accuracy': []
}

partition = True
validation = False

m = 'GINopic'

n_topics = {
    '20NG': [20, 50],
    'BBC': [5, 20],
    'Bio': [20, 50],
    'SO': [20, 50],
    'SearchSnippets': [8, 20],
    'BRNews': [8, 20],
    'hotels_reviews': [8, 20]
}

# Funcao de execucao

In [9]:
def run_model(d, k, seed):        
    data = get_dataset(d)
    
    print("-"*100)
    print('Dataset:{},\t K={},\t Seed={}'.format(d, k, seed))
    print("-"*100)
    
    random.seed(seed)
    torch.random.manual_seed(seed)

    model = GINOPIC(num_topics=k,
         use_partitions=partition,
         use_validation=validation,
         num_epochs=50,
         w2v_path='./w2v/{}_part{}_valid{}/'.format(d, partition, validation),
         graph_path='./doc_graphs/{}_part{}_valid{}/'.format(d, partition, validation),
         num_gin_layers=params[d]['num_gin_layers'],
         g_feat_size=params[d]['g_feat_size'],
         num_mlp_layers=params[d]['num_mlp_layers'],
         gin_hidden_dim=params[d]['gin_hidden_dim'],
         gin_output_dim=params[d]['gin_output_dim'],
         eps_simGraph=params[d]['eps_simGraph']
        )
    
    output = model.train_model(dataset=data)
    
    del model
    torch.cuda.empty_cache()
    
    #Hyperparams:
    results['Dataset'].append(d)
    results['Model'].append(m)
    results['K'].append(k)
    results['Seed'].append(seed)
    #############
    
    #Coherence Scores:
    npmi = Coherence(texts=data.get_corpus(), topk=10, measure='c_npmi')
    results['NPMI'].append(npmi.score(output))
    del npmi
    
    cv = Coherence(texts=data.get_corpus(), topk=10, measure='c_v')
    results['CV'].append(cv.score(output))
    del cv
    
    #############
    if partition==True:
        #classification:
        try:
            #Accuracy
            accuracy = AccuracyScore(data)
            results['Accuracy'].append(accuracy.score(output))
        except:
            results['Accuracy'].append(0.0)
    else:
        results['Accuracy'].append(0.0)
    #############
    clear_output(wait=False) 

# Experimentos

In [24]:
datasets = ['20NG', 'BBC', 'Bio', 'SO', 'SearchSnippets', 'hotels_reviews', 'BRNews']
seed = 666

for d in datasets:
    for k in n_topics[d]:
        run_model(d, k, seed)

In [32]:
rdf = pd.DataFrame(results)[13:].reset_index(drop=True)
rdf.to_csv('results.csv', index=None)

In [44]:
print(rdf.drop(['Seed', 'Model'], axis=1).to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lrrrr}
\toprule
Dataset & K & NPMI & CV & Accuracy \\
\midrule
20NG & 20 & 0.12 & 0.65 & 0.40 \\
20NG & 50 & 0.12 & 0.63 & 0.40 \\
BBC & 5 & 0.13 & 0.73 & 0.88 \\
BBC & 20 & 0.13 & 0.69 & 0.90 \\
Bio & 20 & 0.16 & 0.63 & 0.49 \\
Bio & 50 & 0.12 & 0.57 & 0.52 \\
SO & 20 & 0.06 & 0.48 & 0.57 \\
SO & 50 & 0.05 & 0.46 & 0.68 \\
SearchSnippets & 8 & 0.06 & 0.54 & 0.72 \\
SearchSnippets & 20 & 0.04 & 0.50 & 0.77 \\
BRNews & 8 & 0.05 & 0.57 & 0.92 \\
BRNews & 20 & 0.01 & 0.57 & 0.93 \\
hotels_reviews & 8 & 0.06 & 0.66 & 0.44 \\
hotels_reviews & 20 & 0.06 & 0.66 & 0.47 \\
\bottomrule
\end{tabular}

