# Encoding and parsing Wikipedia page diffs

In [2]:
import os
import sys
import pprint 
import random
from collections import defaultdict

from bs4 import BeautifulSoup
import nltk

from IPython.display import display, HTML

sys.path.append(os.path.abspath('../../WKP-python-toolkit'))
import wekeypedia

In [3]:
# page object for the "Love" article; every diff fetched below comes from it
p = wekeypedia.WikipediaPage("Love")

# revisions of that page as returned by the toolkit
# NOTE(review): presumably a list of dicts carrying a "revid" key (the
# commented-out random pick further down indexes it that way) -- confirm
# against the wekeypedia toolkit
revisions_list = p.get_revisions_list()

In [4]:
def normalize(word):
  """Reduce a token to a canonical form so inflected variants compare equal.

  The token is lowercased, Porter-stemmed and then passed through the
  WordNet lemmatizer, in that order.

  word -- a single token (string)

  Returns the normalized token (string).
  """
  lemmatizer = nltk.WordNetLemmatizer()
  stemmer = nltk.stem.porter.PorterStemmer()

  word = word.lower()
  # stem() is the public stemmer entry point; stem_word() was an internal
  # helper that is not available in current NLTK releases. The input is
  # already lowercase, so stem() has no case to restore.
  word = stemmer.stem(word)
  word = lemmatizer.lemmatize(word)

  return word

def extract(diff_html):
  """Pull the added and deleted text fragments out of a diff HTML table.

  diff_html -- HTML markup of a diff, parsed here with html.parser

  Returns a dict with two keys, "added" and "deleted", each holding a list
  of text fragments. For each kind, the fragments from whole-line (block)
  cells come first, followed by the inline <ins> / <del> fragments.
  """
  soup = BeautifulSoup(diff_html, 'html.parser')
  rows = soup.find_all("tr")

  diff = {}

  for kind, inline_tag in (("added", "ins"), ("deleted", "del")):
    cell_class = "diff-%sline" % kind
    found = []

    # block changes: a row counts only if it has no inline marker of this
    # kind and contains an empty cell -- the empty cell is what tells a real
    # non-substitution edit apart from a row that merely shows context
    for row in rows:
      if row.find_all(inline_tag):
        continue
      if not row.find_all("td", "diff-empty"):
        continue

      cell = row.find("td", cell_class)
      # the row may belong to the other kind, in which case no cell is found
      if cell is not None:
        found.append(cell)

    # inline changes, anywhere in the document
    found.extend(soup.find_all(inline_tag))

    # registering
    diff[kind] = [ node.get_text() for node in found ]

  return diff

def count_stems(sentences, inflections=None):
  """Count how often each normalized token occurs in a list of sentences.

  sentences   -- iterable of strings; each is split with nltk.word_tokenize
  inflections -- optional mapping, updated in place, of
                 normalized token -> {original token: occurrences}.
                 When omitted (None) no inflection tracking is done.

  Tokens whose normalized form is found `in` the module-level `ignore_list`
  are skipped. With the string `ignore_list` used in this notebook that is a
  substring test.

  Returns a defaultdict(int) mapping normalized token -> occurrences.
  """
  stems = defaultdict(int)

  for sentence in sentences:
    for token in nltk.word_tokenize(sentence):
      stem = normalize(token)
      if stem in ignore_list:
        continue

      stems[stem] += 1

      # keeping track of inflection usages; the default inflections=None
      # must not crash, and setdefault lets a plain dict work as well as a
      # defaultdict(dict)
      if inflections is not None:
        forms = inflections.setdefault(stem, {})
        forms[token] = forms.get(token, 0) + 1

  return stems

def print_plusminus_overview(diff):
    """Print a diff as plain lines: deletions first, then additions.

    diff -- dict with "deleted" and "added" keys, each an iterable of text
            fragments

    Every deleted fragment is printed as "- <text>" and every added fragment
    as "+ <text>", one per line.
    """
    # single-argument print(...) behaves the same on Python 2 and Python 3;
    # the one-element tuple keeps %-formatting safe whatever the fragment is
    for minus in diff["deleted"]:
        print("- %s" % (minus,))

    for plus in diff["added"]:
        print("+ %s" % (plus,))

        
def print_plusminus_terms_overview(stems):
    """Print a one-line bar: one "+" per added term, "|", one "-" per deleted term.

    stems -- dict with "added" and "deleted" keys, each a sized collection
             (here: stem -> count mappings)

    The bar is preceded and followed by a blank line.
    """
    pluses = "+" * len(stems["added"])
    minuses = "-" * len(stems["deleted"])
    # single-argument print(...) behaves the same on Python 2 and Python 3
    print("\n%s|%s\n" % (pluses, minuses))

In [5]:
# stem -> {original token: occurrences}, shared by the added and deleted passes
inflections = defaultdict(dict)

# characters left out of the stem counts; count_stems tests `stem in
# ignore_list`, which on a string is a substring test
ignore_list = "{}()[]<>./,;\"':!?&#=*&%"

display(HTML("<h3>raw html query result</h3>"))

# to look at a random revision instead of the fixed one below:
#diff_html = p.get_diff(random.choice(revisions_list)["revid"])
diff_html = p.get_diff(100000308)

display(HTML(diff_html))

display(HTML("<h3>plus/minus overview</h3>"))

# from here on `diff` is the parsed {"added": [...], "deleted": [...]} dict
diff = extract(diff_html)
print_plusminus_overview(diff)


display(HTML("<h3>plus/minus ---> terms overview</h3>"))

stems = {}
stems["added"] = count_stems(diff["added"], inflections)
stems["deleted"] = count_stems(diff["deleted"], inflections)
print_plusminus_terms_overview(stems)

display(HTML("<h3>inflections</h3>"))

# items() and single-argument print(...) run on both Python 2 and Python 3
for stem, forms in inflections.items():
    listing = ", ".join("%s (%s)" % (form, count) for form, count in forms.items())
    print("[%s] %s" % (stem, listing))

- * [[Jason loves Olivia more]] 



|----



[olivia] Olivia (1)
[love] loves (1)
[jason] Jason (1)
[more] more (1)
