In [2]:
import nltk
import os
import _sqlite3
from nltk.corpus import PlaintextCorpusReader
from nltk import sent_tokenize, word_tokenize
from gensim import corpora, models, similarities
from gensim.models.ldamodel import LdaModel
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities.docsim import Similarity

In [2]:
def get_philosophers(filename):
    """Extract philosopher names and their local page paths from an index page.

    Parses the HTML file at ``filename``, locates the first table whose class
    attribute is ``wikitable sortable`` and reads the text of the first link
    found in the first ``<td>`` of every row.

    Args:
        filename: Path to a UTF-8 encoded HTML file.

    Returns:
        A list of ``(name, path)`` tuples in table order, where ``path`` is
        ``'Philosophers/<name>.html'``. Rows that have no ``<td>`` cells
        (such as rows made only of header cells) or whose first cell holds
        no link are skipped.

    Raises:
        ValueError: If the page contains no ``wikitable sortable`` table.
    """
    import codecs
    from bs4 import BeautifulSoup

    # Context manager so the handle is released even if reading/decoding fails.
    with codecs.open(filename, 'r', 'utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')

    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on None further down.
        raise ValueError(
            "no table with class 'wikitable sortable' found in %r" % (filename,)
        )

    filenames = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if not cells:
            continue
        link = cells[0].find('a')
        if link is None:
            # First cell has no anchor, so there is no name to derive a path from.
            continue
        philo = link.text
        filenames.append((philo, 'Philosophers/' + philo + '.html'))
    return filenames
    

# Build the (name, path) list from "Index.html" (path is relative to the
# working directory) and display it as the cell result.
filenames = get_philosophers("Index.html")
filenames

[('Acrion', 'Philosophers/Acrion.html'),
 ('Adrastus of Aphrodisias', 'Philosophers/Adrastus of Aphrodisias.html'),
 ('Aedesia', 'Philosophers/Aedesia.html'),
 ('Aedesius', 'Philosophers/Aedesius.html'),
 ('Aeneas of Gaza', 'Philosophers/Aeneas of Gaza.html'),
 ('Aenesidemus', 'Philosophers/Aenesidemus.html'),
 ('Aesara', 'Philosophers/Aesara.html'),
 ('Aeschines of Neapolis', 'Philosophers/Aeschines of Neapolis.html'),
 ('Aeschines of Sphettus', 'Philosophers/Aeschines of Sphettus.html'),
 ('Aetius', 'Philosophers/Aetius.html'),
 ('Agapius', 'Philosophers/Agapius.html'),
 ('Agathobulus', 'Philosophers/Agathobulus.html'),
 ('Agathosthenes', 'Philosophers/Agathosthenes.html'),
 ('Agrippa the Skeptic', 'Philosophers/Agrippa the Skeptic.html'),
 ('Albinus', 'Philosophers/Albinus.html'),
 ('Alcinous', 'Philosophers/Alcinous.html'),
 ('Alcmaeon of Croton', 'Philosophers/Alcmaeon of Croton.html'),
 ('Alexamenus of Teos', 'Philosophers/Alexamenus of Teos.html'),
 ('Alexander of Aegae', 'Philo

In [3]:
def get_text(file):
    """Return the concatenated text of every ``<p>`` element in an HTML file.

    Args:
        file: Path to a UTF-8 encoded HTML file.

    Returns:
        A single string made of the text of each ``<p>`` element, in document
        order, joined with no separator. Empty string if there are no
        ``<p>`` elements.
    """
    import codecs
    from bs4 import BeautifulSoup

    # Context manager so the handle is released even if reading/decoding fails.
    with codecs.open(file, 'r', 'utf-8') as f:
        page_soup = BeautifulSoup(f.read(), 'lxml')

    # One join instead of repeated `+=` on a growing string.
    return ''.join(tag.get_text() for tag in page_soup.find_all('p'))
    
# Try the extractor on a single page; the cell result is the extracted text.
get_text('Philosophers/Acrion.html')

'Acrion was a Locrian and a Pythagorean philosopher.[1]  He is mentioned by Valerius Maximus[2] under the name of Arion. According to William Smith, Arion is a false reading, instead of Acrion.[3]\n'

In [4]:
def run(filenames):
    """Placeholder entry point: does no work and always returns ``None``.

    Args:
        filenames: Accepted for the caller's convenience; the current body
            never reads it.
    """
    return None
    
# `run` is still a stub that returns nothing, so this cell produces no output.
run(filenames)

In [3]:
import logging

# Emit INFO-level (and above) records as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [4]:
from gensim import corpora

# Nine short title-like strings used as the example corpus in the cells below.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

In [5]:
from collections import defaultdict
from pprint import pprint  # pretty-printer

# Words that are discarded during tokenisation.
stoplist = set('for a of the and to in'.split())

# Step 1: lowercase each document, split on whitespace, drop stoplist words.
texts = []
for document in documents:
    words = document.lower().split()
    texts.append([word for word in words if word not in stoplist])

# Step 2: count how often each remaining token occurs across all documents.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Step 3: keep only tokens that occur more than once in the whole corpus.
texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

pprint(texts)

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]


In [6]:
# Build the token -> integer id mapping from the filtered token lists.
dictionary = corpora.Dictionary(texts)
# Persist the mapping to disk.
# NOTE(review): the hard-coded '/tmp' path is not portable (e.g. Windows);
# confirm whether any later cell reloads from this exact path before changing it.
dictionary.save('/tmp/deerwester.dict')
print(dictionary)
# token2id is the token -> id dict shown in the recorded output.
print(dictionary.token2id)

2020-11-10 17:22:53,102 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-11-10 17:22:53,104 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-11-10 17:22:53,105 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2020-11-10 17:22:53,107 : INFO : saved /tmp/deerwester.dict


Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
{'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6, 'user': 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}


In [7]:
# Convert a new document into (token_id, count) pairs using the existing dictionary.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
# In the recorded output only ids 0 ('computer') and 1 ('human') appear:
# "interaction" has no id in the dictionary, so it is absent from the vector.
print(new_vec)

[(0, 1), (1, 1)]


In [8]:
# One list of (token_id, count) pairs per document in `texts`.
corpus = [dictionary.doc2bow(text) for text in texts]
# Write the corpus to disk; the log output reports Matrix Market format plus an index file.
# NOTE(review): hard-coded '/tmp' path — same portability caveat as any other fixed temp path.
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
print(corpus)

2020-11-10 17:22:58,363 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2020-11-10 17:22:58,365 : INFO : saving sparse matrix to /tmp/deerwester.mm
2020-11-10 17:22:58,367 : INFO : PROGRESS: saving document #0
2020-11-10 17:22:58,368 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2020-11-10 17:22:58,370 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index


[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
