In [1]:
import os
import pickle
from topic_metrics.counting import split_corpus, count_histogram, count_vocab, count_windows

# Input corpus shipped with the tests, and the directory all counts are written to.
test_dir = "tests/20ng/corpus"
dest_dir = "outputs/20ng"

corpus_dir = os.path.join(dest_dir, 'corpus')
os.makedirs(dest_dir, exist_ok=True)
os.makedirs(corpus_dir, exist_ok=True)

"""
Split documents into smaller batches for faster processing
"""

split_corpus(test_dir, corpus_dir, batch=1000, num_processes=1)
# Report how many batch files ended up in the split corpus directory.
print(corpus_dir, len(os.listdir(corpus_dir)),'\n')

"""
Further process the corpus via sliding windows
We calculate co-occurrence in sliding windows via convolutions and boolean operations
"""
count_vocab(corpus_dir, dest_dir, num_processes=4)

# The uploaded vocab is alphabetically sorted, so word ids are assigned in sorted order.
# Use a context manager so the pickle file handle is closed deterministically.
# NOTE(review): presumably vocab_count.pkl maps word -> count, so sorted() yields the words — confirm.
with open(f"{dest_dir}/vocab_count.pkl", 'rb') as vocab_file:
    vocab = sorted(pickle.load(vocab_file))
vocab_index = {k:i for i,k in enumerate(vocab)}
count_histogram(corpus_dir,dest_dir, num_processes=4)
count_windows(corpus_dir, dest_dir, window_size=10, vocab2id=vocab_index, num_processes=10)

100%|██████████| 1/1 [00:00<00:00, 1571.49it/s]

outputs/20ng/corpus 17 




100%|██████████| 17/17 [00:00<00:00, 14910.74it/s]
100%|██████████| 17/17 [00:00<00:00, 5862.30it/s]

vocab saved: outputs/20ng/vocab_count.pkl



100%|██████████| 17/17 [00:00<00:00, 26108.81it/s]
100%|██████████| 17/17 [00:00<00:00, 48472.58it/s]

histogram saved: outputs/20ng/histogram.csv



100%|██████████| 17/17 [00:05<00:00,  3.30it/s]


pre-counting completed, post-processing...


100%|██████████| 17/17 [00:00<00:00, 24203.38it/s]

single counts completed, joint counting...



100%|██████████| 17/17 [00:00<00:00, 23008.44it/s]


joint counts completed, dumping...
cleaning...
completed.
Single prior counts saved to: outputs/20ng/10/single.pkl
Joint co-occurrence counts saved to: outputs/20ng/10/joint


In [4]:
"""
Next, we can easily load the count graphs as probability graphs
"""
from topic_metrics.measuring import single_count_setup, load_joint_prob_graph, create_graph_with, npmi
import time

dest_npmi = 'outputs/20ng/npmi_10'
os.makedirs(dest_npmi, exist_ok=True)

num_windows, single_prob = single_count_setup("outputs/20ng/histogram.csv",
                                              "outputs/20ng/10/single.pkl",
                                              window_size=10, min_freq=0)

joint_prob = load_joint_prob_graph("outputs/20ng/10/joint", 
                        num_windows=num_windows, min_freq=0, shortlist=[],
                          num_processes=20, existing_graph={})

"""
Using the probability graph, we can create scored (npmi) graphs
"""
npmi_graph = create_graph_with(npmi, joint_prob, single_prob, smooth=True)

"""
Saving it in a similar format as count graphs
"""
for k1, s in npmi_graph.items():
    pickle.dump(dict(s), open(os.path.join(dest_npmi,f"{k1}.pkl"), "wb"))
print(len(os.listdir('outputs/20ng/npmi_10')))

100%|██████████| 1612/1612 [00:00<00:00, 2357.09it/s]


1612


In [5]:
"""
Let's use our corpus statistic to evaluate a trained model
We convert the topics to vocab id corresponding to our processed statistics
"""
import topic_metrics
from topic_metrics.measuring import calculate_scores_from_counts, direct_avg

opt_scores = pickle.load(open("/home/jiapeng/project3/baselines/ctm_20NewsGroup_50/results_b0_mf50_orig.pkl",'rb'))
topics = [x[-1] for x  in opt_scores]

"""
We can lazily calculate directly from the counts
Using counts give us flexibility to tweak the measures
"""
scores = calculate_scores_from_counts([[vocab_index[w] for w in t] for t in topics], 
                 "outputs/20ng/histogram.csv",
                 "outputs/20ng/10/single.pkl",
                 "outputs/20ng/10/joint", 
                score_func = npmi, window_size = 10, agg_func = direct_avg,
                smooth=True, min_freq=0, num_processes=10)


for topic, score in zip(topics, scores):
    print(f"{score:.4f} {' '.join(topic)}")

100%|██████████| 50/50 [00:00<00:00, 614.07it/s]

-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building




In [8]:
"""
Since we already have an NPMI graph, let's use it to do the same calculations
However, pre-calculating the entire NPMI graph can be time-consuming for larger corpus
"""
from topic_metrics.measuring import calculate_scored_graphs


scores = calculate_scored_graphs([[vocab_index[w] for w in t] for t in topics],
                 "outputs/20ng/npmi_10", 
                agg_func = direct_avg, num_processes=10)


for topic, score in zip(topics, scores):
    print(f"{score:.4f} {' '.join(topic)}")

100%|██████████| 50/50 [00:00<00:00, 37302.60it/s]

-0.0099 interpretation obvious statement lack suit absolutely personal fool truth absolute
0.1815 commercial capability element flight station space shuttle satellite vehicle launch
0.1322 prepare bear heart faith life sin pray day hell eternal
0.1271 application character printer problem font manager mouse button window print
-0.1906 actual quickly accurate advantage capable guarantee gather pre portion suit
0.1387 number child disease age medical patient year health risk adult
0.1892 session secret attack algorithm key random block bit encryption chip
0.0647 black sign cover art game man annual title copy bag
-0.1035 operate favor money suit bank raise stay taxis task worry
0.1162 stick shoot play zone game goal good penalty point time
0.2157 shift tire gear road bike ride brake transmission engine car
0.0504 past sick eat hour late ago week doctor pain month
0.0616 building brother floor hide morning year neighbor work time people
0.1013 woman car start neighbor door police building




In [10]:
"""
Now we move on to bigger graphs -- Wikipedia
Randomly mining topics (cliques) with edge-weight > npmi-value 0.1
"""
import random
from topic_metrics.mining import sample

r = random.sample(range(30000), 10)
samples = []
for key in r:
    samples.extend(sample(key, "tests/wiki/prob_10", 
                          clique_size=10, edge_condition=lambda v: v>.1, 
                          target=100))

START: 522


238it [00:01, 183.69it/s]


522	Loaded: 1.308131456375122
522	Pruned: 13.719730615615845
522	@16.173567295074463s	 Sampled: 28
START: 25926


349it [00:01, 178.40it/s]


25926	Loaded: 1.9681410789489746
25926	Pruned: 29.5809965133667
25926	@31.528244256973267s	 Sampled: 17
START: 23859


113it [00:00, 204.67it/s]


23859	Loaded: 0.563974142074585
23859	Pruned: 3.3966941833496094
23859	@3.5509443283081055s	 Sampled: 5
START: 19364


34it [00:00, 212.25it/s]


19364	Loaded: 0.1718440055847168
19364	Pruned: 0.4407620429992676
19364	@0.4415731430053711s	 Sampled: 0
START: 21166


90it [00:00, 194.21it/s]


21166	Loaded: 0.47439122200012207
21166	Pruned: 2.3054561614990234
21166	@2.380438804626465s	 Sampled: 3
START: 16992


42it [00:00, 194.11it/s]


16992	Loaded: 0.22812819480895996
16992	Pruned: 0.6224210262298584
16992	@0.626230001449585s	 Sampled: 1
START: 3210


310it [00:01, 185.67it/s]


3210	Loaded: 1.6784298419952393
3210	Pruned: 23.505011558532715
3210	@25.59369397163391s	 Sampled: 19
START: 10069


260it [00:01, 195.07it/s]


10069	Loaded: 1.3457491397857666
10069	Pruned: 16.606889724731445
10069	@21.38482356071472s	 Sampled: 27
START: 18009


877it [00:04, 204.82it/s]


18009	Loaded: 4.291403293609619
18009	Pruned: 175.28465056419373
18009	@185.9761462211609s	 Sampled: 97
START: 5424


181it [00:00, 187.76it/s]


5424	Loaded: 0.9764842987060547
5424	Pruned: 8.04410982131958
5424	@9.025786876678467s	 Sampled: 12


In [14]:
"""
Finally, we caculate the final C_NPMI scores of the sampled topics
We have some predefined settings such as C_NPMI, C_P, C_V, C_UMass
Not sure why the code runs slower with type hints, hence its disabled
"""
from topic_metrics.measuring import C_NPMI
scores = calculate_scores_from_counts(samples, "/data/jiapeng/wiki/histogram.csv",
                 "/data/jiapeng/wiki/final_2_pkl/10_single.pkl", 
                 "/data/jiapeng/wiki/final_2_pkl/10",
                     **C_NPMI(),
                     smooth=True, min_freq='auto', num_processes=10)

vocab = pickle.load(open(f"/data/jiapeng/wiki/vocab.pkl", 'rb'))
vocab2id = {word:i for i, word in enumerate(sorted(vocab))}
id2vocab = {v:k for k,v in vocab2id.items()}

for sample, score in zip(samples, scores):
    print(f"{score:.4f} {' '.join([id2vocab[w] for w in sample])}")

100%|██████████| 209/209 [00:02<00:00, 83.55it/s]


0.1868 buffer data file hardware instructions kernel memory mode stack virtual
0.1933 allows api database disk files formats hardware microsoft processor supports
0.1892 access applications client clients downloading file google networks server websites
0.1595 application applications database directory functionality online providers provides resource virtual
0.2121 built-in cache controller directory disk interface linux module stack user
0.1863 accessing allows node object pointer stack stored storing variable variables
0.2189 browser content download email files software users virtual website websites
0.2048 access accessing care disabilities disability health healthcare hiv providers services
0.1782 data email mobile network networks online phones provider services technologies
0.1878 allows applications data interface interfaces method nodes processing query syntax
0.2170 accessing client data directory files http information protocol url web
0.2075 accessing apis applications app