## VRA Corpus TFIDF
This notebook does some pre-processing of the VRA corpus content and then applies the tf-idf algorithm to it (using n-grams of length one to three) to generate potential categories for use in tagging and searches.

In [1]:
import json
import operator
import itertools
import re
import numpy as np
import psycopg2
import nltk
import pickle
from nltk.stem import PorterStemmer
from tqdm import tqdm, tqdm_notebook
from scipy.sparse import csr_matrix, lil_matrix
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from stopwords import stopwords
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
stopwords = set(itertools.chain(*[stopwords, ENGLISH_STOP_WORDS]))    

In [2]:
conn = psycopg2.connect('dbname=vra user=postgres')
cur = conn.cursor()
cur.execute('select extract from core_content')
docs = cur.fetchall()

In [3]:
def pre_process_text(input_text):
    text=input_text
    # The combination of tf-idf and matching with the wikipedia categories means that we dont really 
    # need to filter text much before matching with topics. In fact, if we do too much, then we drastically
    # reduce useful matching with 2- and 3-grams.
#    text = re.sub(r'\b\d(?!d)(\w{1,4})?\b', '', text, flags=re.I)
#    text = re.sub(r'\b\d{2,9}(\w{2,4}|s|S)?\b', '', text)
#    text = re.sub(r'\b\d{16}\b', '', text)
#    text = re.sub(r'\b0x\w+\b', '', text)
#    text = re.sub(r'\b\d+\w(\d|\w)+\b', '', text)
#    text = re.sub(r':|\-|\.|,|"|\(|\)|\?|\!', '', text)

    text_without_stopwords = []
    for word in text.split():
        #if word not in stopwords:
        if True:
            word = re.sub("(\w+)'s?$", '\g<1>', word)
            text_without_stopwords.append(wnl.lemmatize(word.lower()))
    text = ' '.join(text_without_stopwords)
    return text

In [4]:
corpus = []
for index, item in tqdm(enumerate(docs), total=len(docs)):
    text = '{} {}'.format(item[0]['title'], item[0]['content'])
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = pre_process_text(text)
    
    corpus.append(text)
for item in corpus[:20]:
    print(item[:25])

100%|██████████| 2140/2140 [00:21<00:00, 99.08it/s] 

should older people and t
aggies building a new cam
machine learning (theory)
old scientist: how comput
cosmic dust on earth reve
spacex dragon departs spa
first global mercury map 
google reveals new androi
nasa sounding-rocket miss
altering a robot gender a
person of interest final 
hirise | valley network i
super-salty turkish lake 
jason-3 in orbit update: 
nasa stardust sample retu
new detail on cere seen i
discovery of a fundamenta
google new 'family share 
bionic spaniel help super
pentagon unveils mind-con





In [5]:
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1,3), stop_words='english', sublinear_tf=True)
corpus_tfidf_matrix = vectorizer.fit_transform(corpus)
print(json.dumps(dict(zip(vectorizer.get_feature_names()[:50], vectorizer.idf_[:50])), indent=2))

{
  "00 cover": 7.975881108029737,
  "00 15 edt": 7.975881108029737,
  "00 11": 7.570415999921573,
  "00 pm 45": 7.975881108029737,
  "00 mantis": 7.975881108029737,
  "00 consecutive": 7.975881108029737,
  "00 edt": 7.975881108029737,
  "00 cdt": 7.975881108029737,
  "00 billion": 7.975881108029737,
  "00 afternoon": 7.975881108029737,
  "00 primer issue": 7.975881108029737,
  "00 mdt": 7.975881108029737,
  "00 pm eastern": 7.282733927469792,
  "00 pst click": 7.975881108029737,
  "00 00 hopeful": 7.975881108029737,
  "00 pst november": 7.975881108029737,
  "00 obvious": 7.975881108029737,
  "00 pdt april": 7.975881108029737,
  "00": 5.373191422585354,
  "00 billion 12": 7.975881108029737,
  "00 et monday": 7.282733927469792,
  "00 consecutive trading": 7.975881108029737,
  "00 10 00": 7.975881108029737,
  "00 larger formation": 7.975881108029737,
  "00 pm pst": 7.975881108029737,
  "00 obvious idea": 7.975881108029737,
  "00 pm 00": 7.975881108029737,
  "00 et": 7.282733927469792,
  

In [6]:
corpus_tfidf_matrix

<2140x1068549 sparse matrix of type '<class 'numpy.float64'>'
	with 1646395 stored elements in Compressed Sparse Row format>

In [7]:
with open('../data/ontology/readable_graph.p','rb') as infile:
    ontology_raw = pickle.load(infile)


In [8]:
len(ontology_raw.keys())

617752

In [9]:
vectorizer.get_feature_names().index('internet')

483065

In [10]:
stemmer = PorterStemmer()

def pre_process_text_with_stemming(input_text):
    text=input_text
#    text = re.sub(r'\b\d(?!d)(\w{1,4})?\b', '', text, flags=re.I)
#    text = re.sub(r'\b\d{2,9}(\w{2,4}|s|S)?\b', '', text)
#    text = re.sub(r'\b\d{16}\b', '', text)
#    text = re.sub(r'\b0x\w+\b', '', text)
#    text = re.sub(r'\b\d+\w(\d|\w)+\b', '', text)
#    text = re.sub(r':|\-|\.|,|"|\(|\)|\?|\!', '', text)

    text_without_stopwords = []
    for word in text.split():
    #    if word not in stopwords:
        if True:
            word = re.sub("(\w+)'s?$", '\g<1>', word)
            text_without_stopwords.append(stemmer.stem(word.lower()))
    text = ' '.join(text_without_stopwords)
    return text

In [11]:
print("Pre-processing feature names")
feature_names = vectorizer.get_feature_names()
#feature_names = [pre_process_text_with_stemming(x) for x in tqdm(vectorizer.get_feature_names())]

print("Pre-processing ontology names")
ontology_terms = list(ontology_raw.keys())
ontology_terms_position_dict = {
    pre_process_text(key):position for (position, key) in tqdm(enumerate(ontology_terms))
}

print("Creating tfidf_ontology_matrix")
tfidf_ontology_matrix = lil_matrix((len(feature_names), len(ontology_terms)))
print(tfidf_ontology_matrix.shape)
for (tfidf_index, tfidf_term) in tqdm(enumerate(feature_names), total=len(feature_names)):
    ontology_index = ontology_terms_position_dict.get(tfidf_term, -1)
    if ontology_index >= 0:
        tfidf_ontology_matrix[tfidf_index,ontology_index] = 1
        
    #if tfidf_term in ontology_terms_set:
    #    ontology_index = ontology_terms.index(tfidf_term)
    #    tfidf_ontology_matrix[tfidf_index,ontology_index] = 1
                    

Pre-processing feature names


4367it [00:00, 21717.39it/s]

Pre-processing ontology names


617752it [00:27, 22598.23it/s]


Creating tfidf_ontology_matrix


 16%|█▌        | 173080/1068549 [00:00<00:01, 862640.78it/s]

(1068549, 617752)


100%|██████████| 1068549/1068549 [00:01<00:00, 935116.21it/s]


In [12]:
tfidf_ontology_matrix.getnnz()

15492

In [13]:
print(tfidf_ontology_matrix.shape)
feature_names[423757]

(1068549, 617752)


'hardware kepler'

In [14]:
corpus_ontology_matrix = corpus_tfidf_matrix * tfidf_ontology_matrix

In [15]:
def get_categories_for_doc(doc_id, corpus_ontology_matrix, ontology_terms):
    doc_row = corpus_ontology_matrix[doc_id]
    (nzrows, nzcolumns) = np.nonzero(doc_row)
    categories = {ontology_terms[x]:corpus_ontology_matrix[doc_id,x]  for x in nzcolumns}
    return(sorted(categories.items(), key=operator.itemgetter(1), reverse=True))

In [16]:
get_categories_for_doc(1920, corpus_ontology_matrix, ontology_terms)

[('intelligent design', 0.10372358699204302),
 ('lesson plan', 0.087892644379619675),
 ('religious', 0.06430953528993319),
 ('argument', 0.062506142478723339),
 ('document', 0.062271734076365902),
 ('dover', 0.057029593351962615),
 ('lesson', 0.056351752437624344),
 ('controversy', 0.054097755151829588),
 ('evolution', 0.05209130798721498),
 ('school district', 0.049174234022303746),
 ('court', 0.048562057634881556),
 ('handout', 0.047117233499808779),
 ('exercise', 0.046686458317395423),
 ('schools', 0.046179980420000682),
 ('design', 0.041409832153954121),
 ('ohio', 0.040932280208480609),
 ('districts', 0.039261874170149924),
 ('repeated', 0.039261874170149924),
 ('videos', 0.036362121707800711),
 ('fossils', 0.0361667850814755),
 ('teach', 0.035976100282403695),
 ('student', 0.035429681509320139),
 ('discovery institute', 0.033682596531922404),
 ('standard language', 0.033682596531922404),
 ('normal science', 0.033682596531922404),
 ('michael behe', 0.033682596531922404),
 ('consona

In [17]:
display(HTML(docs[1920][0]['content']))

In [18]:
np.nonzero(corpus_ontology_matrix[123])

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([521024, 234849,  79755, 257221, 484616, 565639,   9955, 446470,
        413162, 442793, 602700, 516006,  49129, 588597, 388329, 465047,
        394776, 301655, 510226,  43533, 555706, 474649, 352980, 354587,
        139286, 480781, 250935, 284515, 5650

In [19]:
np.nonzero(corpus_ontology_matrix[823])

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([266589,  10717, 285819, 446470, 565961, 284169, 543134, 389326,
        612183, 613033, 602700, 388329, 292312, 465047, 512940, 106270,
        220119,  71096, 191670, 451606, 601802, 463389, 597962, 311938,
        405048,  63559, 535787,  71883, 440146, 370838, 378541, 273932,
        271431, 572435, 426394, 292367, 576665, 421292, 459278, 607835,
        601334, 309131, 580849, 597769, 354102, 456751, 297936, 260718,
        236648, 291695, 533121, 384872, 219660, 604657, 475409, 337848,
         10908, 482287, 231235, 439011, 165315, 392898, 283743, 405331,
        489245, 558417, 340591, 449243,   7944, 

In [20]:
ontology_terms[616213]

'diplomatic conferences in india'

In [21]:
len(np.nonzero(corpus_ontology_matrix[:,616213])[0])

0

# Ideas for Using ontology terms

1. for each item in the ontology structure, annotate each items with all paths that lead to that item from any root. This is probably done by traversing the graph from each root down to each leaf, adding the path as it goes
2. create a dictionary that maps ontology entries to numbers (`ontology_item_count`)
3. For each ontology item extracted out of the corpus_ontology_matrix,  and do the following:
  - add 1 to the value of the entry `ontology_item_count` with the item as key (first time is 1)
  - for each parent in the path of the item (from each root), also add 1 to the entry for the item in each path to each root
4. Find the top N entries in `ontology_item_count` and those are the topics associated with the document

A possible variation would be to add more than 1 for items with longer "parenting" so specific topics have a fighting chance of being included (for example, level 4 topics will always only ever appear once since there are no children below them in the path) - for example, level 4 topics could have 3 points, level 3 topics 2 points, and level 2 and 1 topics have 1 point. 

# Exploration of making roots in ontology structure

In [31]:
roots= [b'Artificial_intelligence', b'Nanotechnology', b'Robotics', b'Biotechnology',
             b'Networks', b'Bioinformatics', b'Biological_engineering', b'Computational_biology',
             b'Telecommunications', b'Energy', b'Ecosystems', b'Environmental_economics',
             b'Habitat', b'Earth_system_sciences', b'Environment', b'Computers', b'Learning',
             b'Education', b'Water', b'Space', b'Health', b'Poverty', b'Aid', b'Hunger',
             b'Development_economics', b'Farms', b'Land_management', b'Prevention', b'Security',
             b'Emergency_services', b'Political_philosphy', b'Governance', b'Accountability',
             b'Justice', b'Ethical_principles', b'Rights', b'Identity_politics', b'Individualism'
            ]
ontology_with_paths={}
for item in tqdm(ontology_raw):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-31-005e1376e1c4>, line 3)

# Back Matter

In [22]:
doc_number = 1300
keywords = vectorizer.transform([corpus[doc_number]])
feature_names = vectorizer.get_feature_names()
top_keywords_data_indices = np.fliplr([keywords.data.argsort()])[0]
for index in top_keywords_data_indices:
    print('{}: {}'.format(feature_names[keywords.indices[index]], keywords[0, keywords.indices[index]]))

rs 25: 0.061965162579468915
rs: 0.0571111467629357
sl: 0.04422541866673147
wofford: 0.04213417468238866
engine: 0.04162862876415319
reusability: 0.041136684750336404
rp: 0.041136684750336404
steve wofford: 0.039382516308164306
glenn case: 0.039382516308164306
rocket engine: 0.03851659022934818
core stage: 0.03847248715335623
paulsen: 0.03738045082898041
sls: 0.037251980217365026
aerojet: 0.03709767178598838
25s: 0.036014758636361305
rs 25s: 0.036014758636361305
engines: 0.03485854417464639
thrust: 0.03431394682590035
closed cycle: 0.03418389784415612
shuttle: 0.034081750493694024
falcon: 0.033962801228701336
commenter: 0.03288488143919387
ferrari: 0.03288488143919387
isp: 0.03288488143919387
paulsen said: 0.031672963855227616
performing engine: 0.031672963855227616
liquid engine: 0.031672963855227616
open cycle: 0.031672963855227616
nasa aerojet: 0.031672963855227616
ferrari rocket: 0.031672963855227616
engine wa: 0.031672963855227616
25: 0.031474781379288816
rocket: 0.0313640800224909

In [23]:
print(keywords)

  (0, 1064717)	0.015092336982
  (0, 1064716)	0.015092336982
  (0, 1064039)	0.00506661191785
  (0, 1063960)	0.015092336982
  (0, 1063955)	0.0124691256564
  (0, 1063636)	0.015092336982
  (0, 1063635)	0.015092336982
  (0, 1063589)	0.015092336982
  (0, 1063585)	0.0118665325437
  (0, 1063552)	0.015092336982
  (0, 1063551)	0.015092336982
  (0, 1063550)	0.0255535478092
  (0, 1061750)	0.015092336982
  (0, 1061749)	0.015092336982
  (0, 1061423)	0.015092336982
  (0, 1061419)	0.0130134911908
  (0, 1061029)	0.0100281868413
  (0, 1057930)	0.015092336982
  (0, 1057929)	0.015092336982
  (0, 1057875)	0.00859438504668
  (0, 1057853)	0.015092336982
  (0, 1057852)	0.015092336982
  (0, 1057835)	0.0112796428157
  (0, 1057737)	0.015092336982
  (0, 1057736)	0.015092336982
  :	:
  (0, 5988)	0.0151576532841
  (0, 4933)	0.015092336982
  (0, 4932)	0.015092336982
  (0, 4919)	0.0124691256564
  (0, 4811)	0.015092336982
  (0, 4810)	0.015092336982
  (0, 4770)	0.0103902798651
  (0, 3830)	0.015092336982
  (0, 3829)	0.0

In [24]:
display(HTML(docs[doc_number][0]['content']))

In [25]:
raw_tokenized = nltk.word_tokenize(docs[doc_number][0]['content'])
processed_tokenized = nltk.word_tokenize(corpus[doc_number])
text_processed = nltk.Text(processed_tokenized)
text_processed_fd = nltk.FreqDist(text_processed)

In [26]:
text_processed_fd.most_common(25)

[('the', 219),
 (',', 208),
 ('.', 178),
 ('a', 96),
 ('to', 94),
 ('of', 78),
 ('and', 77),
 ('engine', 69),
 ('it', 63),
 ('that', 52),
 ('is', 50),
 ('in', 42),
 ('for', 40),
 ('``', 35),
 ("''", 35),
 ('rocket', 35),
 ('rs-25', 32),
 ('nasa', 26),
 ('but', 22),
 ('space', 22),
 ('sl', 21),
 ('be', 21),
 ('with', 20),
 ('on', 20),
 ('wa', 20)]