## VRA Corpus TFIDF
This notebook does some pre-processing of the VRA corpus content and then applies the tf-idf algorithm to it (using n-grams of length one to three) to generate potential categories for use in tagging and searches.

In [1]:
import json
import itertools
import re
import numpy as np
import psycopg2
import nltk
import pickle
from nltk.stem import PorterStemmer
from tqdm import tqdm, tqdm_notebook
from scipy.sparse import csr_matrix, lil_matrix
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from stopwords import stopwords
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
stopwords = set(itertools.chain(*[stopwords, ENGLISH_STOP_WORDS]))    

In [2]:
conn = psycopg2.connect('dbname=vra user=postgres')
cur = conn.cursor()
cur.execute('select extract from core_content')
docs = cur.fetchall()

In [3]:
def pre_process_text(input_text):
    text=input_text
    text = re.sub(r'\b\d(?!d)(\w{1,4})?\b', '', text, flags=re.I)
    text = re.sub(r'\b\d{2,9}(\w{2,4}|s|S)?\b', '', text)
    text = re.sub(r'\b\d{16}\b', '', text)
    text = re.sub(r'\b0x\w+\b', '', text)
    text = re.sub(r'\b\d+\w(\d|\w)+\b', '', text)
    text = re.sub(r':|\-|\.|,|"|\(|\)|\?|\!', '', text)

    text_without_stopwords = []
    for word in text.split():
        if word not in stopwords:
            word = re.sub("(\w+)'s?$", '\g<1>', word)
            text_without_stopwords.append(wnl.lemmatize(word.lower()))
    text = ' '.join(text_without_stopwords)
    return text

In [4]:
corpus = []
for index, item in tqdm(enumerate(docs), total=len(docs)):
    text = '{} {}'.format(item[0]['title'], item[0]['content'])
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = pre_process_text(text)
    
    corpus.append(text)
for item in corpus[:20]:
    print(item[:25])

100%|██████████| 2140/2140 [00:28<00:00, 74.21it/s] 

should older people demen
aggies building campus co
machine learning theory c
old scientist how compute
cosmic dust earth reveals
spacex dragon departs spa
first global mercury map 
google reveals new androi
nasa soundingrocket missi
altering robot gender soc
person interest final sea
hirise | valley network a
supersalty turkish lake h
jason orbit update januar
nasa stardust sample retu
new detail cere seen dawn
discovery fundamental lim
google 'family share pinc
bionic spaniel help super
pentagon unveils mindcont





In [5]:
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1,3), stop_words='english', sublinear_tf=True)
corpus_tfidf_matrix = vectorizer.fit_transform(corpus)
print(json.dumps(dict(zip(vectorizer.get_feature_names()[:50], vectorizer.idf_[:50])), indent=2))

{
  "3d anaglyph easy": 7.975881108029737,
  "2d code": 7.975881108029737,
  "1d": 7.282733927469792,
  "2d film": 7.975881108029737,
  "2d film moment": 7.975881108029737,
  "3d app": 7.975881108029737,
  "2d session": 7.570415999921573,
  "2d symbology data": 7.282733927469792,
  "3d bioprinted brain": 7.975881108029737,
  "3d bioprinter": 7.570415999921573,
  "3d": 4.125733506319679,
  "2d surface balloon": 7.975881108029737,
  "1d 2d": 7.570415999921573,
  "3d bioprinted windpipe": 7.975881108029737,
  "3d bioprinted medical": 7.975881108029737,
  "2d rover image": 7.975881108029737,
  "2d symbol": 7.975881108029737,
  "3d action platformer": 7.975881108029737,
  "3d app android": 7.975881108029737,
  "2d metaphor expansion": 7.975881108029737,
  "1d 2d symbol": 7.975881108029737,
  "2d metaphor": 7.975881108029737,
  "1d south marked": 7.975881108029737,
  "3d bioprinted": 7.059590376155582,
  "2d code code": 7.975881108029737,
  "2d na": 7.975881108029737,
  "1d barcodes plus": 7

In [6]:
corpus_tfidf_matrix

<2140x940159 sparse matrix of type '<class 'numpy.float64'>'
	with 1417973 stored elements in Compressed Sparse Row format>

In [7]:
with open('../data/ontology/readable_graph.p','rb') as infile:
    ontology_raw = pickle.load(infile)


In [8]:
len(ontology_raw.keys())

617752

In [9]:
vectorizer.get_feature_names().index('internet')

423757

In [10]:
stemmer = PorterStemmer()

def pre_process_text_with_stemming(input_text):
    text=input_text
    text = re.sub(r'\b\d(?!d)(\w{1,4})?\b', '', text, flags=re.I)
    text = re.sub(r'\b\d{2,9}(\w{2,4}|s|S)?\b', '', text)
    text = re.sub(r'\b\d{16}\b', '', text)
    text = re.sub(r'\b0x\w+\b', '', text)
    text = re.sub(r'\b\d+\w(\d|\w)+\b', '', text)
    text = re.sub(r':|\-|\.|,|"|\(|\)|\?|\!', '', text)

    text_without_stopwords = []
    for word in text.split():
        if word not in stopwords:
            word = re.sub("(\w+)'s?$", '\g<1>', word)
            text_without_stopwords.append(stemmer.stem(word.lower()))
    text = ' '.join(text_without_stopwords)
    return text

In [11]:
print("Pre-processing feature names")
feature_names = [pre_process_text_with_stemming(x) for x in tqdm(vectorizer.get_feature_names())]

print("Pre-processing ontology names")
ontology_terms = list(ontology_raw.keys())
ontology_terms_position_dict = {
    pre_process_text_with_stemming(key):position for (position, key) in tqdm(enumerate(ontology_terms))
}

print("Creating tfidf_ontology_matrix")
tfidf_ontology_matrix = lil_matrix((len(feature_names), len(ontology_terms)))
print(tfidf_ontology_matrix.shape)
for (tfidf_index, tfidf_term) in tqdm(enumerate(feature_names), total=len(feature_names)):
    ontology_index = ontology_terms_position_dict.get(tfidf_term, -1)
    if ontology_index >= 0:
        tfidf_ontology_matrix[tfidf_index,ontology_index] = 1
        
    #if tfidf_term in ontology_terms_set:
    #    ontology_index = ontology_terms.index(tfidf_term)
    #    tfidf_ontology_matrix[tfidf_index,ontology_index] = 1
                    

Pre-processing feature names


100%|██████████| 940159/940159 [01:40<00:00, 9318.86it/s] 
1745it [00:00, 8642.51it/s]

Pre-processing ontology names


617752it [00:56, 10872.54it/s]


Creating tfidf_ontology_matrix


 11%|█         | 103875/940159 [00:00<00:01, 518635.55it/s]

(940159, 617752)


100%|██████████| 940159/940159 [00:01<00:00, 601739.96it/s]


In [12]:
tfidf_ontology_matrix.getnnz()

37707

In [13]:
print(tfidf_ontology_matrix.shape)
feature_names[423757]

(940159, 617752)


'internet'

In [17]:
corpus_ontology_matrix = corpus_tfidf_matrix * tfidf_ontology_matrix

In [39]:
np.nonzero(corpus_ontology_matrix[123])

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array([483437, 462015, 608269, 149709, 145111, 418135, 617401, 207810,
        488539, 455657, 4

In [38]:
corpus_ontology_matrix[123,483437]

0.011392203991071352

# Back Matter

In [None]:
doc_number = 1300
keywords = vectorizer.transform([corpus[doc_number]])
feature_names = vectorizer.get_feature_names()
top_keywords_data_indices = np.fliplr([keywords.data.argsort()])[0]
for index in top_keywords_data_indices:
    print('{}: {}'.format(feature_names[keywords.indices[index]], keywords[0, keywords.indices[index]]))

In [None]:
print(keywords)

In [None]:
display(HTML(docs[doc_number][0]['content']))

In [None]:
raw_tokenized = nltk.word_tokenize(docs[doc_number][0]['content'])
processed_tokenized = nltk.word_tokenize(corpus[doc_number])
text_processed = nltk.Text(processed_tokenized)
text_processed_fd = nltk.FreqDist(text_processed)

In [None]:
text_processed_fd.most_common(25)