## VRA Corpus TFIDF
This notebook does some pre-processing of the VRA corpus content and then applies the tf-idf algorithm to it (using n-grams of length one to three) to generate potential categories for use in tagging and searches.

In [77]:
# Standard library
import itertools
import json
import operator
import pickle
import re
# PriorityQueue is provided by the `queue` module; `collections` does not export it.
from queue import PriorityQueue

# Third-party
import nltk
import numpy as np
import psycopg2
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from scipy.sparse import csr_matrix, lil_matrix
# ENGLISH_STOP_WORDS is exported by `sklearn.feature_extraction.text`; the
# private `sklearn.feature_extraction.stop_words` path is not available in
# newer scikit-learn releases.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm, tqdm_notebook

# Local module (presumably a project-specific stop-word list -- confirm).
from stopwords import stopwords

# Shared lemmatizer instance used by pre_process_text.
wnl = WordNetLemmatizer()
# Union of the local stop-word list and scikit-learn's English list. The name
# `stopwords` is deliberately rebound from the imported iterable to this set.
stopwords = set(itertools.chain(stopwords, ENGLISH_STOP_WORDS))

In [2]:
# Fetch the `extract` column of every row in `core_content` from the local
# `vra` database. Each row comes back as a 1-tuple; later cells read
# row[0]['title'] and row[0]['content'], so the column is presumably JSON
# decoded to a dict -- confirm against the schema.
conn = psycopg2.connect('dbname=vra user=postgres')
try:
    # The cursor context manager closes the cursor once the rows are in memory.
    with conn.cursor() as cur:
        cur.execute('select extract from core_content')
        docs = cur.fetchall()
finally:
    # Release the database session instead of holding it for the kernel's lifetime.
    conn.close()

In [54]:
# Trailing possessive ("word's") or bare trailing apostrophe ("words'").
_POSSESSIVE_SUFFIX = re.compile(r"(\w+)'s?$")


def pre_process_text(input_text, normalize=None):
    """Lower-case, de-possessive and lemmatize every whitespace-separated token.

    Filtering is intentionally minimal: the combination of tf-idf and matching
    against the ontology categories means the text does not need much cleaning
    before matching, and aggressive filtering drastically reduces useful
    matches on 2- and 3-grams. Stop words are therefore NOT removed here.

    Parameters
    ----------
    input_text : str
        Raw text; it is split on whitespace.
    normalize : callable, optional
        Function applied to each lower-cased, de-possessived token. Defaults
        to ``wnl.lemmatize`` (the notebook's shared WordNetLemmatizer).

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' for empty input).
    """
    if normalize is None:
        # Resolved lazily so the module-level lemmatizer is only needed when
        # the caller does not supply a normaliser.
        normalize = wnl.lemmatize

    processed_tokens = []
    for word in input_text.split():
        # Lower-case first so upper-case possessives ("NASA'S") are stripped too.
        word = _POSSESSIVE_SUFFIX.sub(r'\g<1>', word.lower())
        processed_tokens.append(normalize(word))
    return ' '.join(processed_tokens)

In [56]:
# Build one cleaned string per document: title and body joined, HTML markup
# stripped, then normalised with pre_process_text.
corpus = []
for row in tqdm(docs, total=len(docs)):
    extract = row[0]
    combined = '{} {}'.format(extract['title'], extract['content'])
    plain = BeautifulSoup(combined, 'html.parser').get_text()
    corpus.append(pre_process_text(plain))

# Sanity check: show the first 25 characters of the first 20 documents.
for preview in corpus[:20]:
    print(preview[:25])

100%|██████████| 2140/2140 [00:27<00:00, 79.16it/s] 

should older people and t
aggies building a new cam
machine learning (theory)
old scientist: how comput
cosmic dust on earth reve
spacex dragon departs spa
first global mercury map 
google reveals new androi
nasa sounding-rocket miss
altering a robot gender a
person of interest final 
hirise | valley network i
super-salty turkish lake 
jason-3 in orbit update: 
nasa stardust sample retu
new detail on cere seen i
discovery of a fundamenta
google new 'family share 
bionic spaniel help super
pentagon unveils mind-con





In [57]:
# Word-level tf-idf over 1- to 3-grams with English stop words removed and
# sub-linear term-frequency scaling.
vectorizer = TfidfVectorizer(
    min_df=1,
    analyzer='word',
    ngram_range=(1,3),
    stop_words='english',
    sublinear_tf=True,
)
corpus_tfidf_matrix = vectorizer.fit_transform(corpus)

# Show the idf weight of the first 50 features as a quick sanity check.
sample_idf = dict(zip(vectorizer.get_feature_names()[:50], vectorizer.idf_[:50]))
print(json.dumps(sample_idf, indent=2))

{
  "00 afternoon": 7.975881108029737,
  "00 pm": 6.877268819361627,
  "00 afternoon following": 7.975881108029737,
  "00 mark video": 7.975881108029737,
  "00 mark": 7.975881108029737,
  "00 binary clearly": 7.975881108029737,
  "00 primer": 7.975881108029737,
  "00 pdt april": 7.975881108029737,
  "00 pst click": 7.975881108029737,
  "00 15 edt": 7.975881108029737,
  "00 et monday": 7.282733927469792,
  "00 11 00": 7.570415999921573,
  "00 pm 00": 7.975881108029737,
  "00 cdt release": 7.975881108029737,
  "00 cover": 7.975881108029737,
  "00 pm pst": 7.975881108029737,
  "00 mantis load": 7.975881108029737,
  "00 obvious idea": 7.975881108029737,
  "00 15": 7.975881108029737,
  "00 pdt": 7.975881108029737,
  "00 et": 7.282733927469792,
  "00 pst": 7.570415999921573,
  "00 consecutive trading": 7.975881108029737,
  "00 billion": 7.975881108029737,
  "00 00 hopeful": 7.975881108029737,
  "00 hopeful future": 7.975881108029737,
  "00 edt 1200": 7.975881108029737,
  "00 pst november": 7

In [6]:
corpus_tfidf_matrix

<2140x940159 sparse matrix of type '<class 'numpy.float64'>'
	with 1417973 stored elements in Compressed Sparse Row format>

In [7]:
# Load the pickled ontology object; later cells use its .keys() as the list of
# ontology terms, so it is presumably a dict-like mapping -- confirm.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted, locally generated pickle.
with open('../data/ontology/readable_graph.p','rb') as infile:
    ontology_raw = pickle.load(infile)


In [8]:
len(ontology_raw.keys())

617752

In [9]:
# Column index of the feature 'internet'. `vocabulary_` maps each term to its
# feature index directly, avoiding a linear scan over the full feature-name list.
vectorizer.vocabulary_['internet']

423757

In [58]:
stemmer = PorterStemmer()

# Trailing possessive ("word's") or bare trailing apostrophe ("words'").
_STEMMING_POSSESSIVE_SUFFIX = re.compile(r"(\w+)'s?$")


def pre_process_text_with_stemming(input_text, normalize=None):
    """Lower-case, de-possessive and stem every whitespace-separated token.

    Stop words are not removed and no other filtering is applied.

    Parameters
    ----------
    input_text : str
        Raw text; it is split on whitespace.
    normalize : callable, optional
        Function applied to each lower-cased, de-possessived token. Defaults
        to ``stemmer.stem`` (the notebook's shared PorterStemmer).

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' for empty input).
    """
    if normalize is None:
        # Resolved lazily so the module-level stemmer is only needed when the
        # caller does not supply a normaliser.
        normalize = stemmer.stem

    processed_tokens = []
    for word in input_text.split():
        # Lower-case first so upper-case possessives ("NASA'S") are stripped too.
        word = _STEMMING_POSSESSIVE_SUFFIX.sub(r'\g<1>', word.lower())
        processed_tokens.append(normalize(word))
    return ' '.join(processed_tokens)

In [59]:
print("Pre-processing feature names")
# The vectorizer's feature names are used as-is; only the ontology keys are
# normalised below.
feature_names = vectorizer.get_feature_names()

print("Pre-processing ontology names")
ontology_terms = list(ontology_raw.keys())
# Normalised ontology key -> position in ontology_terms. If two keys normalise
# to the same string, the later position overwrites the earlier one.
ontology_terms_position_dict = {}
for position, key in tqdm(enumerate(ontology_terms)):
    ontology_terms_position_dict[pre_process_text(key)] = position

print("Creating tfidf_ontology_matrix")
n_features = len(feature_names)
n_terms = len(ontology_terms)
# Indicator matrix: entry (i, j) is 1 when tf-idf feature i is exactly the
# normalised form of ontology term j.
tfidf_ontology_matrix = lil_matrix((n_features, n_terms))
print(tfidf_ontology_matrix.shape)
for tfidf_index, tfidf_term in tqdm(enumerate(feature_names), total=n_features):
    ontology_index = ontology_terms_position_dict.get(tfidf_term)
    if ontology_index is not None:
        tfidf_ontology_matrix[tfidf_index, ontology_index] = 1
                    

Pre-processing feature names


1392it [00:00, 13915.00it/s]

Pre-processing ontology names


617752it [00:39, 15555.64it/s]


Creating tfidf_ontology_matrix


 11%|█         | 114820/1068549 [00:00<00:01, 565274.86it/s]

(1068549, 617752)


100%|██████████| 1068549/1068549 [00:01<00:00, 656521.47it/s]


In [60]:
tfidf_ontology_matrix.getnnz()

15492

In [49]:
print(tfidf_ontology_matrix.shape)
feature_names[423757]

(1026938, 617752)


'imparted'

In [61]:
# corpus_tfidf_matrix is (documents x tf-idf features) and tfidf_ontology_matrix
# is (tf-idf features x ontology terms) with 1s marking matches, so the product
# is (documents x ontology terms): each entry is the summed tf-idf weight of the
# document's features that map to that ontology term.
corpus_ontology_matrix = corpus_tfidf_matrix * tfidf_ontology_matrix

In [90]:
def get_categories_for_doc(doc_id, corpus_ontology_matrix, ontology_terms):
    """Return the ontology terms matched by one document, best score first.

    Parameters
    ----------
    doc_id : int
        Row of ``corpus_ontology_matrix`` to inspect.
    corpus_ontology_matrix : sparse matrix
        Matrix whose rows are documents and whose columns line up with
        ``ontology_terms``.
    ontology_terms : sequence of str
        Term name for each matrix column.

    Returns
    -------
    list of (str, float)
        ``(term, score)`` pairs for every non-zero column of the row, sorted
        by descending score.
    """
    _, hit_columns = np.nonzero(corpus_ontology_matrix[doc_id])

    scores_by_term = {}
    for column in hit_columns:
        scores_by_term[ontology_terms[column]] = corpus_ontology_matrix[doc_id, column]

    ranked = list(scores_by_term.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [91]:
get_categories_for_doc(1920, corpus_ontology_matrix, ontology_terms)

[('intelligent design', 0.10372358699204302),
 ('lesson plan', 0.087892644379619675),
 ('religious', 0.06430953528993319),
 ('argument', 0.062506142478723339),
 ('documents', 0.062271734076365902),
 ('dover', 0.057029593351962615),
 ('lesson', 0.056351752437624344),
 ('controversy', 0.054097755151829588),
 ('evolution', 0.05209130798721498),
 ('school districts', 0.049174234022303746),
 ('courts', 0.048562057634881556),
 ('handout', 0.047117233499808779),
 ('exercise', 0.046686458317395423),
 ('schools', 0.046179980420000682),
 ('design', 0.041409832153954121),
 ('ohio', 0.040932280208480609),
 ('repeated', 0.039261874170149924),
 ('district', 0.039261874170149924),
 ('videos', 0.036362121707800711),
 ('fossils', 0.0361667850814755),
 ('teach', 0.035976100282403695),
 ('student', 0.035429681509320139),
 ('consonants', 0.033682596531922404),
 ('scientific controversy', 0.033682596531922404),
 ('normal science', 0.033682596531922404),
 ('creationism', 0.033682596531922404),
 ('standard l

In [88]:
display(HTML(docs[1920][0]['content']))

In [15]:
np.nonzero(corpus_ontology_matrix[123])

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([615896, 483805, 458531, 586542, 515971, 497110, 589279, 379817,
        617298, 456930, 5

In [102]:
np.nonzero(corpus_ontology_matrix[823])

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([304037,  44415, 357884, 355578, 424598, 172402, 417585, 494993,
        463119, 377629, 473893,  38013, 608846, 187473, 148051, 484453,
        220371, 184136, 296883, 298801, 245557, 485734,  38588,  74558,
        304485, 406167, 299762, 293619,  62890, 337930, 435133, 479263,
        534210, 461851, 455812, 342256, 599374, 455307, 502854, 503752,
          3842, 530928, 321121,  16506, 366360, 559209, 423624, 198463,
        401728, 102984, 287366, 443091, 540353, 347518, 303305, 530339,
        310360, 275255, 279401, 335436,  95433, 494173,  15932,  36292,
        456855, 233002, 534365, 155632, 260214, 

In [111]:
ontology_terms[616213]

'pilot'

In [112]:
len(np.nonzero(corpus_ontology_matrix[:,616213])[0])

54

# Ideas for Using ontology terms

1. For each item in the ontology structure, annotate the item with all paths that lead to it from any root. This can probably be done by traversing the graph from each root down to each leaf, recording the path along the way.
2. create a dictionary that maps ontology entries to numbers (`ontology_item_count`)
3. For each ontology item extracted from the `corpus_ontology_matrix`, do the following:
  - add 1 to the value of the entry `ontology_item_count` with the item as key (first time is 1)
  - for each parent in the path of the item (from each root), also add 1 to the entry for the item in each path to each root
4. Find the top N entries in `ontology_item_count` and those are the topics associated with the document

A possible variation would be to add more than 1 for items with longer "parenting" so specific topics have a fighting chance of being included (for example, level 4 topics will always only ever appear once since there are no children below them in the path) - for example, level 4 topics could have 3 points, level 3 topics 2 points, and level 2 and 1 topics have 1 point. 

# Back Matter

In [17]:
doc_number = 1300
keywords = vectorizer.transform([corpus[doc_number]])
feature_names = vectorizer.get_feature_names()
# Positions into keywords.data, ordered from the highest to the lowest weight.
top_keywords_data_indices = keywords.data.argsort()[::-1]
for data_position in top_keywords_data_indices:
    feature_id = keywords.indices[data_position]
    print('{}: {}'.format(feature_names[feature_id], keywords[0, feature_id]))

sl: 0.053392648345908254
reusability: 0.04673331119531088
wofford: 0.04639987548183314
engine: 0.04606131523858041
rocket engine: 0.044403392014127546
rp: 0.04413144117441303
steve wofford: 0.04336963680040668
glenn case: 0.04336963680040668
core stage: 0.042367475495807015
paulsen: 0.04116488045615963
aerojet: 0.040853472614838224
flight rate: 0.037644705643552615
falcon: 0.03740122501283209
shuttle: 0.03693862176013398
ferrari: 0.036214175678424325
commenter: 0.036214175678424325
isp: 0.036214175678424325
thrust: 0.03555626637159073
ferrari rocket engine: 0.03487956249532138
ferrari rocket: 0.03487956249532138
liquid engine: 0.03487956249532138
sl engine: 0.03487956249532138
rocket: 0.03462895961400827
main engine: 0.03419795693382413
glenn: 0.03362676872582148
merlin: 0.033431426142529336
space shuttle main: 0.033106410991885216
shuttle main: 0.033106410991885216
authorization act: 0.033106410991885216
nasa aerojet: 0.033106410991885216
shuttle main engine: 0.033106410991885216
clos

In [18]:
print(keywords)

  (0, 936467)	0.0166202984151
  (0, 936462)	0.0137315108742
  (0, 936038)	0.0166202984151
  (0, 936037)	0.0166202984151
  (0, 935961)	0.0166202984151
  (0, 935956)	0.0125653724542
  (0, 935905)	0.0166202984151
  (0, 935904)	0.0166202984151
  (0, 935903)	0.0267100439333
  (0, 934249)	0.0166202984151
  (0, 934248)	0.0166202984151
  (0, 933669)	0.0166202984151
  (0, 933668)	0.0157753822231
  (0, 933248)	0.0166202984151
  (0, 933243)	0.0143309884526
  (0, 932930)	0.0111687291609
  (0, 929427)	0.0166202984151
  (0, 929426)	0.0151759046447
  (0, 929370)	0.00946448812668
  (0, 929341)	0.0166202984151
  (0, 929340)	0.0166202984151
  (0, 929325)	0.0127198001087
  (0, 929218)	0.0166202984151
  (0, 929217)	0.0166202984151
  (0, 929104)	0.00851044649325
  :	:
  (0, 12508)	0.0138758633363
  (0, 11847)	0.0166202984151
  (0, 11846)	0.0166202984151
  (0, 11791)	0.0166202984151
  (0, 11790)	0.0166202984151
  (0, 11677)	0.0153064308046
  (0, 10908)	0.0166202984151
  (0, 10904)	0.0157753822231
  (0, 1089

In [19]:
display(HTML(docs[doc_number][0]['content']))

In [20]:
# Tokenize the raw (unprocessed) content of the selected document.
# NOTE(review): raw_tokenized is not used again in the visible cells.
raw_tokenized = nltk.word_tokenize(docs[doc_number][0]['content'])
# Tokenize the pre-processed text of the same document and build a token
# frequency distribution from it.
processed_tokenized = nltk.word_tokenize(corpus[doc_number])
text_processed = nltk.Text(processed_tokenized)
text_processed_fd = nltk.FreqDist(text_processed)

In [21]:
text_processed_fd.most_common(25)

[('engine', 77),
 ('r', 38),
 ('rocket', 36),
 ('the', 36),
 ('sl', 31),
 ('nasa', 26),
 ('space', 22),
 ('it', 18),
 ('stage', 16),
 ('shuttle', 16),
 ('case', 16),
 ('flight', 15),
 ('falcon', 15),
 ('but', 14),
 ('that', 13),
 ('launch', 13),
 ('spacex', 12),
 ('cost', 12),
 ('liquid', 10),
 ('v', 10),
 ('vehicle', 10),
 ('orbit', 10),
 ('thrust', 9),
 ('year', 8),
 ('hydrogen', 8)]