In [1]:
import os
import pickle
from topic_metrics.io_utils import split_corpus
from topic_metrics.counting import count_histogram, count_vocab, count_windows

test_dir = "tests/20ng/corpus"
dest_dir = "outputs/20ng"

corpus_dir = os.path.join(dest_dir, 'corpus')
# Single source of truth for the vocab-count pickle written by count_vocab
# and read back just below.
vocab_count_path = f"{dest_dir}/vocab_count.pkl"
os.makedirs(dest_dir, exist_ok=True)
os.makedirs(corpus_dir, exist_ok=True)

# Split documents into smaller batches for faster processing.
split_corpus(test_dir, corpus_dir, batch=1000, num_processes=1)
print(corpus_dir, len(os.listdir(corpus_dir)),'\n')

# Further process the corpus via sliding windows.
# Co-occurrence in sliding windows is calculated via convolutions and
# boolean operations (per the original author's note).
count_vocab(corpus_dir, vocab_count_path, num_processes=4)

# Sort the saved vocabulary so that word ids are assigned in alphabetical order.
# A context manager guarantees the pickle file handle is closed after loading.
with open(vocab_count_path, 'rb') as vocab_file:
    vocab = sorted(pickle.load(vocab_file))
vocab_index = {k:i for i,k in enumerate(vocab)}
count_histogram(corpus_dir, dest_dir, num_processes=4)
count_windows(corpus_dir, dest_dir, window_size=10, vocab2id=vocab_index, count_processes=2)

100%|██████████| 1/1 [00:00<00:00, 1468.59it/s]

outputs/20ng/corpus 17 




100%|██████████| 17/17 [00:00<00:00, 20879.40it/s]
100%|██████████| 17/17 [00:00<00:00, 3748.85it/s]

vocab saved: outputs/20ng/vocab_count.pkl



100%|██████████| 17/17 [00:00<00:00, 17264.69it/s]
100%|██████████| 17/17 [00:00<00:00, 39656.93it/s]


histogram saved: outputs/20ng/histogram.csv


100%|██████████| 17/17 [00:04<00:00,  3.72it/s]


counting completed, 7.38360333442688 seconds, dumping...
completed. 13.656367540359497 seconds
Single prior counts saved to: outputs/20ng/10/single.pkl
Joint co-occurrence counts saved to: outputs/20ng/10/joint


In [2]:
# Next, we can easily load the count graphs as probability graphs.
from topic_metrics.measuring import single_count_setup, load_full_joint_count_graph, create_scores_from_count_array, npmi
from topic_metrics.io_utils import reshape_shared_array
from time import time

dest_npmi = 'outputs/20ng/npmi_10'
os.makedirs(dest_npmi, exist_ok=True)

num_windows, single_prob = single_count_setup("outputs/20ng/histogram.csv",
                                              "outputs/20ng/10/single.pkl",
                                              window_size=10, min_freq=0)

start = time()
joint_count_arr = load_full_joint_count_graph("outputs/20ng/10/joint", len(vocab_index), num_processes=40)

# Using the count graph, we can create scored (NPMI) graphs.
joint_npmi_arr = create_scores_from_count_array(npmi,joint_count_arr,single_prob,num_windows,min_freq=0)
joint_npmi_arr = reshape_shared_array(joint_npmi_arr, len(vocab_index))
print('loaded in', time()-start)

# Save one dictionary per row: it is easier to load specific dicts later
# than to rebuild the entire graph.
for k1, row in enumerate(joint_npmi_arr):
    # One file per row id; the context manager flushes and closes each file
    # instead of leaving one dangling write handle per iteration.
    with open(os.path.join(dest_npmi, f"{k1}.pkl"), "wb") as row_file:
        pickle.dump({k2: v for k2, v in enumerate(row)}, row_file)
# Report how many files now sit in the output directory.
print(len(os.listdir(dest_npmi)))

100%|██████████| 1612/1612 [00:00<00:00, 359620.13it/s]
100%|██████████| 1612/1612 [00:00<00:00, 4095.34it/s]


loaded in 0.7906534671783447
1612


In [3]:
# Let's use our corpus statistics to evaluate a trained model.
# We convert the topics to vocab ids corresponding to our processed statistics.
import topic_metrics
from topic_metrics.measuring import calculate_scores_from_counts, direct_avg

# NOTE(review): machine-specific absolute path; point this at your own
# trained-model results before re-running.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
trained_results_path = "/home/jiapeng/project3/baselines/ctm_20NewsGroup_50/results_b0_mf50_orig.pkl"
with open(trained_results_path, 'rb') as results_file:
    opt_scores = pickle.load(results_file)
# The last element of each entry is used as that topic's list of words.
topics = [x[-1] for x  in opt_scores]

# We can lazily calculate directly from the counts.
# Using counts gives us flexibility to tweak the measures.
scores = calculate_scores_from_counts([[vocab_index[w] for w in t] for t in topics], 
                 "outputs/20ng/histogram.csv",
                 "outputs/20ng/10/single.pkl",
                 "outputs/20ng/10/joint", 
                score_func = npmi, window_size = 10, agg_func = direct_avg,
                smooth=True, min_freq=0, num_processes=10)


for topic, score in zip(topics, scores):
    print(f"{score:.4f} {' '.join(topic)}")

100%|██████████| 50/50 [00:00<00:00, 579.36it/s]

-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building




In [4]:
# An NPMI graph has already been written to disk, so the same topic scores
# can be read back from it instead of being recomputed from the raw counts.
# Caveat: pre-calculating the entire NPMI graph can be time-consuming for
# a larger corpus.
from topic_metrics.measuring import calculate_scored_graphs

# Translate every topic word into its vocabulary id before scoring.
topic_ids = [[vocab_index[word] for word in words] for words in topics]
scores = calculate_scored_graphs(
    topic_ids,
    "outputs/20ng/npmi_10",
    agg_func=direct_avg,
    num_processes=10,
)

# One line per topic: the score to four decimals, then the topic's words.
for words, value in zip(topics, scores):
    print("{:.4f} {}".format(value, ' '.join(words)))

100%|██████████| 50/50 [00:00<00:00, 33694.60it/s]


-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building

In [5]:
# Now we move on to bigger graphs -- Wikipedia.
# Randomly mining topics (cliques) with edge-weight > npmi-value 0.1.
import random
import pickle
import numpy as np
from time import time
from topic_metrics.measuring import single_count_setup, load_full_joint_count_graph, create_scores_from_count_array, npmi
from topic_metrics.io_utils import reshape_shared_array
from topic_metrics.mining import sample

# NOTE(review): machine-specific absolute path; adjust before re-running.
data_dir = "/data/jiapeng/wiki/final_2"
# Window size and minimum frequency shared by every call in this cell.
wsz, min_freq = 10, 100
# Word ids follow the alphabetical order of the saved vocabulary; the context
# manager closes the pickle file once it has been read.
with open(f"{data_dir}/vocab_count.pkl", 'rb') as vocab_file:
    vocab2id = {k:i for i,k in enumerate(sorted(pickle.load(vocab_file)))}
id2vocab = {v:k for k,v in vocab2id.items()}
num_windows, single_prob = single_count_setup(f"{data_dir}/histogram.csv",
                                              f"{data_dir}/{wsz}_single.pkl", wsz, min_freq)
start = time()
joint_count_arr = load_full_joint_count_graph(f"{data_dir}/{wsz}", len(vocab2id), num_processes=40)
# Use the same min_freq as single_count_setup above rather than a second
# hard-coded literal, so the two thresholds cannot drift apart.
joint_npmi_arr = create_scores_from_count_array(npmi,joint_count_arr,single_prob,
                                                 num_windows,min_freq=min_freq,num_processes=40)
joint_npmi_arr = reshape_shared_array(joint_npmi_arr, len(vocab2id))
print('loaded in', time()-start)


100%|██████████| 39961/39961 [01:06<00:00, 597.31it/s] 
100%|██████████| 39961/39961 [01:16<00:00, 520.91it/s] 


loaded in 195.60514903068542


In [6]:
from topic_metrics.mining import sample

# Pick 10 distinct start keys from the ids below 30000.
# NOTE(review): no seed is set in the visible cells, so the keys (and the
# mined samples) differ from run to run.
r = random.sample(range(30000), 10)

# Mine from every start key in turn and flatten the results into one list.
samples = [
    mined
    for key in r
    for mined in sample(
        key,
        joint_npmi_arr,
        clique_size=10,
        edge_condition=lambda v: v>.1,
        target=100,
    )
]

START: 23126 QTY: 93


100%|██████████| 93/93 [00:00<00:00, 24109.67it/s]
100%|██████████| 93/93 [00:00<00:00, 6987.63it/s]

23126	Pruned: 0.08805036544799805





23126	@0.8691244125366211s	 Sampled: 5
START: 4177 QTY: 276


100%|██████████| 276/276 [00:00<00:00, 9599.62it/s]
100%|██████████| 276/276 [00:00<00:00, 2323.44it/s]


4177	Pruned: 0.21376633644104004
4177	@10.895503282546997s	 Sampled: 92
START: 2446 QTY: 235


100%|██████████| 235/235 [00:00<00:00, 11707.44it/s]
100%|██████████| 235/235 [00:00<00:00, 2834.66it/s]

2446	Pruned: 0.1689894199371338





2446	@5.790056228637695s	 Sampled: 36
START: 27865 QTY: 375


100%|██████████| 375/375 [00:00<00:00, 7200.24it/s]
100%|██████████| 375/375 [00:00<00:00, 1776.49it/s]


27865	Pruned: 0.3300948143005371
27865	@12.679257869720459s	 Sampled: 100
START: 14720 QTY: 1


100%|██████████| 1/1 [00:00<00:00, 16131.94it/s]
100%|██████████| 1/1 [00:00<00:00, 16980.99it/s]


14720	Pruned: 0.0654139518737793
14720	@0.06556391716003418s	 Sampled: 0
START: 13482 QTY: 185


100%|██████████| 185/185 [00:00<00:00, 14609.90it/s]
100%|██████████| 185/185 [00:00<00:00, 3631.25it/s]

13482	Pruned: 0.12938833236694336





13482	@1.130042552947998s	 Sampled: 9
START: 27368 QTY: 171


100%|██████████| 171/171 [00:00<00:00, 16274.70it/s]
100%|██████████| 171/171 [00:00<00:00, 3888.14it/s]

27368	Pruned: 0.12093091011047363





27368	@1.2601909637451172s	 Sampled: 11
START: 28156 QTY: 341


100%|██████████| 341/341 [00:00<00:00, 8291.16it/s]
100%|██████████| 341/341 [00:00<00:00, 1956.60it/s]


28156	Pruned: 0.2844538688659668
28156	@6.054417371749878s	 Sampled: 45
START: 10217 QTY: 437


100%|██████████| 437/437 [00:00<00:00, 6462.97it/s]
100%|██████████| 437/437 [00:00<00:00, 1536.26it/s]


10217	Pruned: 0.4209134578704834
10217	@8.738419532775879s	 Sampled: 42
START: 7840 QTY: 245


100%|██████████| 245/245 [00:00<00:00, 11169.98it/s]
100%|██████████| 245/245 [00:00<00:00, 2650.49it/s]

7840	Pruned: 0.18047523498535156





7840	@6.242502927780151s	 Sampled: 30


In [7]:
from topic_metrics.measuring import calculate_scores_from_counts, direct_avg
# Finally, we calculate the final C_NPMI scores of the sampled topics.
# We have some predefined settings such as C_NPMI, C_P, C_V, C_UMass.
# Finer control over min_freq, window_size, score_func, and agg_func is also possible.
from topic_metrics.measuring import C_NPMI
# Read the histogram from the same data_dir as the single/joint counts (and
# as the earlier single_count_setup call) so all statistics come from one run.
scores = calculate_scores_from_counts(samples, f"{data_dir}/histogram.csv",
                 f"{data_dir}/{wsz}_single.pkl",
                 f"{data_dir}/{wsz}",
                     **C_NPMI(),
                     smooth=True, min_freq=min_freq, num_processes=10)

# The loop variable is deliberately not called `sample`: that name is the
# imported topic_metrics.mining.sample function and must not be overwritten.
for topic_ids, score in zip(samples, scores):
    print(f"{score:.4f} {' '.join([id2vocab[w] for w in topic_ids])}")

100%|██████████| 370/370 [00:06<00:00, 54.91it/s]


0.2312 cabinet conservative constituency councillor leader liberal member mep msp politician
0.2359 candidates constituency council democrat democrats elected elects incumbent plurality seat
0.2190 chp coalition elected election elections leader leadership minister parliamentary resigned
0.2404 coalition electoral incumbent labour labour_party liberal national_party party re-elected votes
0.2590 by-election msp parliament parliamentary re-elected resignation seat sitting snp the_labour_party
0.2108 adults belly blotches bright colored dots elongated legs reddish-brown uniformly
0.2089 distinctive nape narrow pink purple reddish-brown rounded thin ventral whitish
0.2170 borne cream-coloured grows markings orange oval petal purple tip triangular
0.2246 leaf longitudinal oval petals pinkish shaped spots surfaces translucent whitish
0.1986 extending irregular pectoral rounded rows sides spines stripes triangular vertical
0.2232 dorsum lighter oblique olive purple shades silvery stripe whit

In [8]:
# Import the score and aggregation functions here so this cell does not
# depend on an `import topic_metrics` executed in a distant cell.
from topic_metrics.measuring import calculate_scores_from_count_array, direct_avg, npmi
# We can calculate scores directly from the in-memory count array.
scores = calculate_scores_from_count_array(samples, single_prob,
            joint_count_arr, npmi, direct_avg,
            num_windows=num_windows, smooth=True, min_freq=min_freq, num_processes=5)

# The loop variable is deliberately not called `sample`: that name is the
# imported topic_metrics.mining.sample function and must not be overwritten.
for topic_ids, score in zip(samples, scores):
    print(f"{score:.4f} {' '.join([id2vocab[w] for w in topic_ids])}")

100%|██████████| 370/370 [00:01<00:00, 230.60it/s]

0.2312 cabinet conservative constituency councillor leader liberal member mep msp politician
0.2359 candidates constituency council democrat democrats elected elects incumbent plurality seat
0.2190 chp coalition elected election elections leader leadership minister parliamentary resigned
0.2404 coalition electoral incumbent labour labour_party liberal national_party party re-elected votes
0.2590 by-election msp parliament parliamentary re-elected resignation seat sitting snp the_labour_party
0.2108 adults belly blotches bright colored dots elongated legs reddish-brown uniformly
0.2089 distinctive nape narrow pink purple reddish-brown rounded thin ventral whitish
0.2170 borne cream-coloured grows markings orange oval petal purple tip triangular
0.2246 leaf longitudinal oval petals pinkish shaped spots surfaces translucent whitish
0.1986 extending irregular pectoral rounded rows sides spines stripes triangular vertical
0.2232 dorsum lighter oblique olive purple shades silvery stripe whit


