In [1]:
import os
import pickle
from topic_metrics.io_utils import split_corpus
from topic_metrics.counting import count_histogram, count_vocab, count_windows

# Input corpus and the directory that receives every derived artifact.
test_dir = "tests/20ng/corpus"
dest_dir = "outputs/20ng"

corpus_dir = os.path.join(dest_dir, 'corpus')
# makedirs creates intermediate directories, so this also creates dest_dir.
os.makedirs(corpus_dir, exist_ok=True)

# Split documents into smaller batches for faster processing.
split_corpus(test_dir, corpus_dir, batch=1000, num_processes=1)
print(corpus_dir, len(os.listdir(corpus_dir)),'\n')

# Further process the corpus via sliding windows.
# We calculate co-occurrence in sliding windows via convolutions and boolean operations.
vocab_count_path = f"{dest_dir}/vocab_count.pkl"
count_vocab(corpus_dir, vocab_count_path, num_processes=4)

# The vocab is sorted alphabetically so that word -> id is deterministic.
# Use a context manager so the pickle file handle is closed right after loading.
with open(vocab_count_path, 'rb') as vocab_file:
    vocab = sorted(pickle.load(vocab_file))
vocab_index = {k:i for i,k in enumerate(vocab)}

# Both calls write their results under dest_dir (histogram.csv and a
# per-window-size folder, as reported in the cell output).
count_histogram(corpus_dir, dest_dir, num_processes=4)
count_windows(corpus_dir, dest_dir, window_size=10, vocab2id=vocab_index, count_processes=10)

100%|██████████| 1/1 [00:00<00:00, 1572.08it/s]

outputs/20ng/corpus 17 




100%|██████████| 17/17 [00:00<00:00, 12400.55it/s]
100%|██████████| 17/17 [00:00<00:00, 4119.90it/s]

vocab saved: outputs/20ng/vocab_count.pkl



100%|██████████| 17/17 [00:00<00:00, 15719.39it/s]
100%|██████████| 17/17 [00:00<00:00, 38521.43it/s]

histogram saved: outputs/20ng/histogram.csv



100%|██████████| 17/17 [00:05<00:00,  3.00it/s]


counting completed, 13.603363752365112 seconds, dumping...
completed. 17.91154980659485 seconds
Single prior counts saved to: outputs/20ng/10/single.pkl
Joint co-occurrence counts saved to: outputs/20ng/10/joint


In [2]:
"""
Next, we can easily load the count graphs as probability graphs
"""
from topic_metrics.measuring import single_count_setup, load_full_joint_count_graph, create_scores_from_count_array, npmi
from topic_metrics.io_utils import reshape_shared_array
from time import time

# Directory that receives one pickled NPMI row per vocab id.
dest_npmi = 'outputs/20ng/npmi_10'
os.makedirs(dest_npmi, exist_ok=True)

num_windows, single_prob = single_count_setup("outputs/20ng/histogram.csv",
                                              "outputs/20ng/10/single.pkl",
                                              window_size=10, min_freq=0)

start = time()
# NOTE: relies on `vocab_index` built in the previous cell.
joint_count_arr = load_full_joint_count_graph("outputs/20ng/10/joint", len(vocab_index), num_processes=40)

# Using the count graph, we can create scored (npmi) graphs.
joint_npmi_arr = create_scores_from_count_array(npmi, joint_count_arr, single_prob, num_windows, min_freq=0)
joint_npmi_arr = reshape_shared_array(joint_npmi_arr, len(vocab_index))
print('loaded in', time()-start)

# Save each row as its own {column id: score} dictionary: loading specific
# dicts later is easier than rebuilding the entire graph.
for k1, row in enumerate(joint_npmi_arr):
    # The context manager flushes and closes every file; the previous
    # open(...) without close leaked one write handle per row.
    with open(os.path.join(dest_npmi, f"{k1}.pkl"), "wb") as row_file:
        pickle.dump(dict(enumerate(row)), row_file)
print(len(os.listdir(dest_npmi)))

100%|██████████| 1612/1612 [00:00<00:00, 400712.27it/s]
100%|██████████| 1612/1612 [00:00<00:00, 4861.58it/s]


loaded in 0.6950051784515381
1612


In [3]:
"""
Let's use our corpus statistics to evaluate a trained model
We convert the topics to vocab id corresponding to our processed statistics
"""
import topic_metrics
from topic_metrics.measuring import calculate_scores_from_counts, direct_avg

# NOTE(review): machine-specific absolute path; point this at your own trained
# model results. pickle.load can execute arbitrary code, so only load files
# you trust.
topics_path = "/home/jiapeng/project3/baselines/ctm_20NewsGroup_50/results_b0_mf50_orig.pkl"
with open(topics_path, 'rb') as topics_file:
    opt_scores = pickle.load(topics_file)
# The last element of each entry is the topic's word list (it is joined and
# printed as words below).
topics = [x[-1] for x in opt_scores]

# We can lazily calculate directly from the counts.
# Using counts gives us flexibility to tweak the measures.
# Topics are passed as vocab ids so they match the ids used when counting.
topic_ids = [[vocab_index[w] for w in t] for t in topics]
scores = calculate_scores_from_counts(topic_ids,
                 "outputs/20ng/histogram.csv",
                 "outputs/20ng/10/single.pkl",
                 "outputs/20ng/10/joint",
                score_func = npmi, window_size = 10, agg_func = direct_avg,
                smooth=True, min_freq=0, num_processes=10)


for topic, score in zip(topics, scores):
    print(f"{score:.4f} {' '.join(topic)}")

100%|██████████| 50/50 [00:00<00:00, 482.90it/s]

-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building




In [4]:
"""
Since we already have an NPMI graph, let's use it to do the same calculations
However, pre-calculating the entire NPMI graph can be time-consuming for larger corpora
"""
from topic_metrics.measuring import calculate_scored_graphs


# Translate every topic word into its vocab id, then score the topics against
# the per-row NPMI dictionaries saved under outputs/20ng/npmi_10.
topic_word_ids = [[vocab_index[word] for word in words] for words in topics]
scores = calculate_scored_graphs(
    topic_word_ids,
    "outputs/20ng/npmi_10",
    agg_func=direct_avg,
    num_processes=10,
)

# One line per topic: the score to four decimals, then the topic words.
for topic, score in zip(topics, scores):
    print(format(score, ".4f"), " ".join(topic))

100%|██████████| 50/50 [00:00<00:00, 31091.95it/s]


-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building

In [5]:
"""
Now we move on to bigger graphs -- Wikipedia
Randomly mining topics (cliques) with edge-weight > npmi-value 0.1
"""
import random
import pickle
import numpy as np
from time import time
from topic_metrics.measuring import single_count_setup, load_full_joint_count_graph, create_scores_from_count_array, npmi
from topic_metrics.io_utils import reshape_shared_array
from topic_metrics.mining import sample

# NOTE(review): machine-specific absolute path to the pre-counted Wikipedia statistics.
data_dir = "/data/jiapeng/wiki/final_2"
# Window size and minimum frequency shared by every call in this cell.
wsz, min_freq = 10, 100

# Sorted vocab -> id mapping; the context manager closes the pickle file
# handle as soon as the vocab is loaded.
with open(f"{data_dir}/vocab_count.pkl", 'rb') as vocab_file:
    vocab2id = {k:i for i,k in enumerate(sorted(pickle.load(vocab_file)))}
id2vocab = {v:k for k,v in vocab2id.items()}

# NOTE: this rebinds num_windows, single_prob, joint_count_arr and
# joint_npmi_arr, which earlier cells set for the 20ng corpus.
num_windows, single_prob = single_count_setup(f"{data_dir}/histogram.csv",
                                              f"{data_dir}/{wsz}_single.pkl",
                                              window_size=wsz, min_freq=min_freq)
start = time()
joint_count_arr = load_full_joint_count_graph(f"{data_dir}/{wsz}", len(vocab2id), num_processes=40)
# Reuse `min_freq` instead of a second hard-coded 100 so the threshold cannot
# drift from the one given to single_count_setup.
joint_npmi_arr = create_scores_from_count_array(npmi, joint_count_arr, single_prob,
                                                num_windows, min_freq=min_freq, num_processes=40)
joint_npmi_arr = reshape_shared_array(joint_npmi_arr, len(vocab2id))
print('loaded in', time()-start)


100%|██████████| 39961/39961 [00:23<00:00, 1676.32it/s]
100%|██████████| 39961/39961 [00:52<00:00, 765.97it/s] 


loaded in 101.14856314659119


In [6]:
from topic_metrics.mining import sample

# Seed the global RNG so the randomly chosen start ids are the same on every
# Restart & Run All.
# NOTE(review): this only makes the mined cliques reproducible if `sample`
# itself draws from the global `random` module -- confirm.
SEED = 0
random.seed(SEED)

# 10 distinct start ids drawn from range(30000).
# NOTE(review): 30000 is a magic number smaller than the vocab size shown by
# the loading progress bar -- confirm the restriction is intentional.
r = random.sample(range(30000), 10)
samples = []
for key in r:
    # Only edges whose NPMI value exceeds 0.1 qualify; cliques have 10 nodes.
    # `target` presumably caps the number of cliques mined per start id -- confirm.
    samples.extend(sample(key, joint_npmi_arr,
                          clique_size=10, edge_condition=lambda v: v>.1,
                          target=100))

START: 3819 QTY: 26


100%|██████████| 26/26 [00:00<00:00, 57486.51it/s]
100%|██████████| 26/26 [00:00<00:00, 15230.71it/s]

3819	Pruned: 0.10320210456848145
3819	@0.10422396659851074s	 Sampled: 0





START: 8433 QTY: 600


100%|██████████| 600/600 [00:00<00:00, 4058.36it/s]
100%|██████████| 600/600 [00:00<00:00, 966.36it/s]


8433	Pruned: 0.8721444606781006
8433	@6.827470779418945s	 Sampled: 100
START: 19053 QTY: 47


100%|██████████| 47/47 [00:00<00:00, 49730.65it/s]
100%|██████████| 47/47 [00:00<00:00, 11382.43it/s]


19053	Pruned: 0.08179950714111328
19053	@0.09302783012390137s	 Sampled: 1
START: 4647 QTY: 967


100%|██████████| 967/967 [00:00<00:00, 3483.38it/s]
100%|██████████| 967/967 [00:01<00:00, 596.13it/s]


4647	Pruned: 1.9838898181915283
4647	@9.66976022720337s	 Sampled: 100
START: 2749 QTY: 145


100%|██████████| 145/145 [00:00<00:00, 19590.08it/s]
100%|██████████| 145/145 [00:00<00:00, 4007.63it/s]

2749	Pruned: 0.11713004112243652





2749	@0.3934199810028076s	 Sampled: 7
START: 24524 QTY: 182


100%|██████████| 182/182 [00:00<00:00, 15968.94it/s]
100%|██████████| 182/182 [00:00<00:00, 3134.83it/s]

24524	Pruned: 0.14628839492797852





24524	@1.5269231796264648s	 Sampled: 14
START: 12035 QTY: 425


100%|██████████| 425/425 [00:00<00:00, 6751.50it/s]
100%|██████████| 425/425 [00:00<00:00, 1318.70it/s]


12035	Pruned: 0.46665287017822266
12035	@8.081963062286377s	 Sampled: 56
START: 27003 QTY: 695


100%|██████████| 695/695 [00:00<00:00, 4573.28it/s]
100%|██████████| 695/695 [00:00<00:00, 831.40it/s]


27003	Pruned: 1.0707147121429443
27003	@8.336115837097168s	 Sampled: 57
START: 19585 QTY: 1133


100%|██████████| 1133/1133 [00:00<00:00, 2970.57it/s]
100%|██████████| 1133/1133 [00:02<00:00, 509.30it/s]


19585	Pruned: 2.693538188934326
19585	@10.50261378288269s	 Sampled: 100
START: 23362 QTY: 43


100%|██████████| 43/43 [00:00<00:00, 53249.21it/s]
100%|██████████| 43/43 [00:00<00:00, 12852.21it/s]

23362	Pruned: 0.08014321327209473
23362	@0.14968585968017578s	 Sampled: 2





In [7]:
from topic_metrics.measuring import calculate_scores_from_counts, direct_avg
"""
Finally, we calculate the final C_NPMI scores of the sampled topics
We have some predefined settings such as C_NPMI, C_P, C_V, C_UMass
Also possible for finer control over min_freq, window_size, score_func, and agg_func
"""
from topic_metrics.measuring import C_NPMI
# NOTE(review): the histogram is read from /data/jiapeng/wiki/ while the setup
# cell reads it from /data/jiapeng/wiki/final_2/ -- confirm both files
# describe the same corpus.
scores = calculate_scores_from_counts(samples, "/data/jiapeng/wiki/histogram.csv",
                 "/data/jiapeng/wiki/final_2/10_single.pkl", 
                 "/data/jiapeng/wiki/final_2/10",
                     **C_NPMI(),
                     smooth=True, min_freq=100, num_processes=10)

# The loop variable is deliberately not called `sample`: that name is the
# mining function imported from topic_metrics.mining, and rebinding it here
# would break any later call to it.
for clique, score in zip(samples, scores):
    print(f"{score:.4f} {' '.join([id2vocab[w] for w in clique])}")

100%|██████████| 437/437 [00:05<00:00, 84.04it/s] 


0.2508 nfl offensive raiders receptions starter the_los_angeles_rams the_oakland_raiders the_seattle_seahawks threw tight
0.2348 completions consecutive fourth interception offense packers quarterback record threw yardage
0.2288 afl broncos cowboys sacks the_minnesota_vikings vikings washington_redskins week win yards
0.2234 halftime kick longhorns lsu missed quarter rushing score scored sooners
0.2428 bryant offense overtime pass quarter receiver receptions rookie rushed touchdown
0.2417 kansas_city kickoff overtime preseason punt tackles the_atlanta_falcons the_indianapolis_colts the_miami_dolphins week
0.2229 cowboys kickoff matchup score seahawks texans the_new_england_patriots threw tied touchdowns
0.2638 preseason re-signed receiver saints signed tackle tampa_bay_buccaneers texans the_tampa_bay_buccaneers waived
0.2734 jaguars the_cincinnati_bengals the_pittsburgh_steelers tight titans touchdown touchdowns undrafted vikings waived
0.2341 20-yard cowboys possession punts raiders r

In [8]:
from topic_metrics.measuring import calculate_scores_from_count_array
"""
We can calculate scores directly from the count array
"""
# Score the mined cliques straight from the in-memory count array, reusing
# single_prob, joint_count_arr and num_windows from the Wikipedia setup cell.
scores = calculate_scores_from_count_array(samples, single_prob,
            joint_count_arr, topic_metrics.measuring.npmi, topic_metrics.measuring.direct_avg, 
            num_windows=num_windows, smooth=True, min_freq=100, num_processes=5)

# The loop variable is deliberately not called `sample`: that name is the
# mining function imported from topic_metrics.mining, and rebinding it here
# would break any later call to it.
for clique, score in zip(samples, scores):
    print(f"{score:.4f} {' '.join([id2vocab[w] for w in clique])}")

100%|██████████| 437/437 [00:01<00:00, 333.54it/s]

0.2508 nfl offensive raiders receptions starter the_los_angeles_rams the_oakland_raiders the_seattle_seahawks threw tight
0.2348 completions consecutive fourth interception offense packers quarterback record threw yardage
0.2288 afl broncos cowboys sacks the_minnesota_vikings vikings washington_redskins week win yards
0.2234 halftime kick longhorns lsu missed quarter rushing score scored sooners
0.2428 bryant offense overtime pass quarter receiver receptions rookie rushed touchdown
0.2417 kansas_city kickoff overtime preseason punt tackles the_atlanta_falcons the_indianapolis_colts the_miami_dolphins week
0.2229 cowboys kickoff matchup score seahawks texans the_new_england_patriots threw tied touchdowns
0.2638 preseason re-signed receiver saints signed tackle tampa_bay_buccaneers texans the_tampa_bay_buccaneers waived
0.2734 jaguars the_cincinnati_bengals the_pittsburgh_steelers tight titans touchdown touchdowns undrafted vikings waived
0.2341 20-yard cowboys possession punts raiders r


