In [1]:
from collections import defaultdict

import codecs
import gensim
import networkx as nx
import numpy as np

from bs4 import BeautifulSoup

In [2]:
G = nx.read_gexf("datasets/influences.gexf")
names = [ n for n in G.nodes() ]

In [20]:
def philosopher2vec(name):
  def get_text(name):
    txt = codecs.open("datasets/pages/%s.html" % name,"r", "utf-8-sig").read()

    txt = BeautifulSoup(txt, "html.parser")
    txt = txt.get_text()

    return txt
  
  def tokenize(sentences):
    stoplist = set(codecs.open("datasets/stopwords.txt","r", "utf-8-sig").read().split())
    # stoplist = get_stop_words("en")

    texts = [[word.replace(",", "").replace("'s", "") for word in document.lower().split() if word not in stoplist] for document in sentences]

    frequency = defaultdict(int)

    for text in texts:
      for token in text:
        frequency[token] += 1
    
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    
    return texts
  
  def to_dict(tokens):
    d = gensim.corpora.Dictionary(tokens)

    return d
  
  def to_corpus(d, t):
    c = [d.doc2bow(text) for text in t]
    
    return c

  txt = get_text(name)
  sentences = txt.split(".")
  
  tokens = tokenize(sentences)
  dictionary = to_dict(tokens)
  
  corpus = to_corpus(dictionary, tokens)

  bigrams = gensim.models.Phrases(tokens)

  lsi = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary, num_topics=300)
  word2vec = gensim.models.Word2Vec(bigrams[tokens], size=100, window=5, min_count=5, workers=4, batch_words=100)

  # print tokens
  
  print name
  print "sentences: %s"% len(sentences)
  print "tokens: %s" % int(np.sum([ len(t) for t in tokens ]))
  print dictionary
  # print bigrams
  # print lsi.print_topics(10)
  print
  
  print "word2vec(work + philosophy) = "
  print "------------------------------"
  for (u,s) in word2vec.most_similar(positive=['work', "philosophy"]):
    print "%s: %s" % (u,s)
    
  print

#names = names[0:10]
names = ["Karl_Popper", "Martin_Heidegger", "Ludwig_Wittgenstein", "Bruno_Latour" ]
# names = ["Ludwig_Wittgenstein"]
  
[ philosopher2vec(name) for name in names ];

Karl_Popper
sentences: 923
tokens: 6719
Dictionary(1430 unique tokens: [u'', u'searle', u'writings', u'four', u'whose']...)

word2vec(work + philosophy) = 
------------------------------
theory: 0.473865866661
popper: 0.409742593765
growth: 0.38961905241
may: 0.376099914312
code: 0.372898429632
cannot: 0.365706205368
social: 0.365385532379
physics: 0.364394128323
science: 0.3585729599
scientific: 0.352876186371

Martin_Heidegger
sentences: 1007
tokens: 8013
Dictionary(1547 unique tokens: [u'', u'limited', u'writings', u'bedeutungslehre', u'augustine']...)

word2vec(work + philosophy) = 
------------------------------
heidegger: 0.909396111965
being: 0.83614552021
history: 0.834819972515
husserl: 0.830217123032
philosophical: 0.826589465141
works: 0.814429581165
hölderlin: 0.813647389412
account: 0.810482680798
nader_el-bizri: 0.808180689812
language: 0.795672118664

Ludwig_Wittgenstein
sentences: 1710
tokens: 10066
Dictionary(1901 unique tokens: [u'', u'searle', u'writings', u'augustin