## VRA Corpus TFIDF
This notebook does some pre-processing of the VRA corpus content and then applies the tf-idf algorithm to it (using n-grams of length one to three) to generate potential categories for use in tagging and searches.

In [3]:
import json
import itertools
import re
import numpy as np
import psycopg2
import nltk
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from stopwords import stopwords
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
stopwords = set(itertools.chain(*[stopwords, ENGLISH_STOP_WORDS]))    

In [4]:
conn = psycopg2.connect('dbname=vra_db user=postgres')
cur = conn.cursor()
cur.execute('select extract from core_content')
docs = cur.fetchall()

In [5]:
corpus = []
for index, item in enumerate(docs):
    text = '{} {}'.format(item[0]['title'], item[0]['content'])
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    text = re.sub(r'\b\d(?!d)(\w{1,4})?\b', '', text, flags=re.I)
    text = re.sub(r'\b\d{2,9}(\w{2,4}|s|S)?\b', '', text)
    text = re.sub(r'\b\d{16}\b', '', text)
    text = re.sub(r'\b0x\w+\b', '', text)
    text = re.sub(r'\b\d+\w(\d|\w)+\b', '', text)
    text = re.sub(r':|\-|\.|,|"|\(|\)|\?|\!', '', text)

    text_without_stopwords = []
    for word in text.split():
        if word not in stopwords:
            word = re.sub("(\w+)'s?$", '\g<1>', word)
            text_without_stopwords.append(wnl.lemmatize(word.lower()))
    text = ' '.join(text_without_stopwords)
    
    if index % 100 == 0:
        print('.', end='')
    corpus.append(text)
for item in corpus[:20]:
    print(item[:25])

....................................................................................................................................................................................................the wall eye building rev
the exoplanets you meet '
kepler announces largest 
blue origin rocket onboar
festo sbsi vision sensor 
fuelling galileo none
scotland northern ireland
vacuum test none
esa bic prague opening no
fivefingered robotic hand
distance wireless chargin
stunning video mercury tr
intelligrated win mhi inn
adding rock ocean deacidi
goopy dark matter slow in
rover technology space ea
silicate stardust trace h
french iot firm boost u p
mar exploration rover upd
transit of mercury planet


In [139]:
vectorizer = TfidfVectorizer(min_df=1, analyzer='word', ngram_range=(1,3), stop_words='english')
X = vectorizer.fit_transform(corpus)
print(json.dumps(dict(zip(vectorizer.get_feature_names()[:50], vectorizer.idf_[:50])), indent=2))

{
  "1d engine second": 9.705579652772443,
  "2d aesthetic": 9.705579652772443,
  "2d detected spring": 9.705579652772443,
  "2d brawler tend": 9.705579652772443,
  "2d book house": 9.705579652772443,
  "1d environments ken13": 9.705579652772443,
  "2d barrier midaction": 9.705579652772443,
  "2d codes": 9.705579652772443,
  "1d 2d": 9.705579652772443,
  "1d main": 9.705579652772443,
  "1d stage engine": 9.705579652772443,
  "2d booster": 9.705579652772443,
  "1d engines earlier": 9.705579652772443,
  "2d 3d": 9.300114544664277,
  "2d fighter appear": 9.705579652772443,
  "2d aesthetic final": 9.705579652772443,
  "1d engine arranged": 9.705579652772443,
  "1dx mark": 9.300114544664277,
  "2d detected": 9.705579652772443,
  "1d 2d codes": 9.705579652772443,
  "1dx mark ii": 9.300114544664277,
  "1d environments": 9.705579652772443,
  "2d fighter": 9.705579652772443,
  "1d engine qualified": 9.705579652772443,
  "2d carry": 9.705579652772443,
  "1d main engine": 9.705579652772443,
  "2d

In [172]:
doc_number = 2531
keywords = vectorizer.transform([corpus[doc_number]])
feature_names = vectorizer.get_feature_names()
top_keywords_data_indices = np.fliplr([keywords.data.argsort()])[0]
for index in top_keywords_data_indices:
    print('{}: {}'.format(feature_names[keywords.indices[index]], keywords[0, keywords.indices[index]]))

deep learning: 0.20890543263578493
tensorflow: 0.2043143732373751
deep: 0.16035672886468605
learning: 0.13017098642337152
ai: 0.11964942938582149
tacobot: 0.11357811467567197
google: 0.09958341715754876
taco bell: 0.09891784077265399
facebook: 0.09569533001700269
company: 0.08970388074550907
taco: 0.08853430597727317
photo: 0.0883239779560444
software: 0.0875804136248754
openai: 0.0771415181284591
neural: 0.07575667618500682
neural net: 0.07554132477889375
artificial intelligence: 0.07460848708117059
enlitic: 0.07098632167229499
dsstne: 0.07098632167229499
legowiecki: 0.07098632167229499
bell: 0.06983858713453246
intelligence: 0.06982741100657314
open source: 0.06916196213243551
artificial: 0.06824498312983208
open: 0.06451378030969943
radiologist: 0.06428459844038258
server: 0.06266962862912023
deep neural: 0.05998554420949917
cat: 0.059741461240337204
amazon: 0.05873222865140788
multiple server: 0.056789057337835985
slack: 0.05609575625608493
framework: 0.05566103365518124
deutsch: 0

In [173]:
display(HTML(docs[doc_number][0]['content']))

In [174]:
raw_tokenized = nltk.word_tokenize(docs[doc_number][0]['content'])
processed_tokenized = nltk.word_tokenize(corpus[doc_number])
text_processed = nltk.Text(processed_tokenized)
text_processed_fd = nltk.FreqDist(text_processed)

In [175]:
text_processed_fd.most_common(25)

[('deep', 27),
 ('company', 26),
 ('learning', 21),
 ('google', 20),
 ('facebook', 19),
 ('tensorflow', 18),
 ('photo', 17),
 ('ai', 17),
 ('but', 16),
 ('software', 16),
 ('it', 15),
 ('open', 14),
 ('year', 11),
 ('intelligence', 11),
 ('the', 10),
 ('image', 10),
 ('artificial', 10),
 ('technology', 9),
 ('amazon', 9),
 ('called', 9),
 ('neural', 9),
 ('tech', 9),
 ('human', 9),
 ('tacobot', 8),
 ('taco', 8)]