In [1]:
from parser import *

import itertools
import random
import pickle
import numpy as np
from operator import itemgetter
from collections import Counter
from itertools import izip
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
from sklearn.cluster import AffinityPropagation
from collections import defaultdict
from sklearn.feature_selection import RFE
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import metrics

In [2]:
from summanlp_textrank import commons, graph, keywords, pagerank_weighted, \
                  summarizer, textrank, textcleaner, textrank_runtime_error

In [3]:
inputXML = "sample_1"
# inputXML = "sample_rawXML"
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

In [4]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# PARSE:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_1 ...
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...


In [5]:
def featurize(documents, surfaceFeatures):
    print "STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ..."
    features = []
    for docIndex, doc in enumerate(documents):
        documentFeatures = extract_document_wide_features(doc)
        documentFeatures.append(surfaceFeatures[docIndex])
        features += [ counter_sum(fl) for fl in izip(*documentFeatures) ]
    return features

In [6]:
def extract_document_wide_features(document):
    documentFeatures = []

    documentFeatures.append(textrank_keyphrase(document))
    documentFeatures.append(lexrank_keyphrase(document))

    txt = ' '.join([ su.text for su in document ])
    keyWords = textrank_keyword(txt)
    print keyWords

    return documentFeatures

In [7]:
def counter_sum(counterTuple):
    counterSum = Counter()
    for ele in counterTuple:
        counterSum += ele
    return counterSum

In [8]:
def textrank_keyphrase(text):

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = commons.build_graph([ syntacticUnit for syntacticUnit in text])
    summarizer._set_graph_edge_weights(graph)
    # Remove all nodes with all edges weights equal to zero.
    commons.remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = summarizer._pagerank(graph)

    # Adds the summa scores to the sentence objects.
    # summarizer._add_scores_to_sentences(sentences, pagerank_scores)

    results = []
    for su in text:
        score = (1-pagerank_scores[su.label, su.index]) if (su.label, su.index) in pagerank_scores.keys() else 0.0
        results.append(Counter({ 'TEXTRANK_SCORE': score }))
    return results

In [9]:
def textrank_keyword(text):
    # Gets a dict of word -> lemma
    tokens = textcleaner.clean_text_by_word(text, 'english')
    print '******************'
    split_text = list(textcleaner.tokenize_by_word(text))
    print split_text[0:6]
    print '******************'
    # Creates the graph and adds the edges
    graph = commons.build_graph(keywords._get_words_for_graph(tokens))
    keywords._set_graph_edges(graph, tokens, split_text)
    del split_text # It's no longer used
    commons.remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = keywords._pagerank(graph)

    extracted_lemmas = keywords._extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
    print extracted_lemmas
    print '******************'
    lemmas_to_word = keywords._lemmas_to_words(tokens)
    print lemmas_to_words
    print '******************'
    keyWords = keywords._get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
    combined_keywords = keywords._get_combined_keywords(keyWords, text.split())

    return keywords._format_results(keyWords, combined_keywords, split, scores)

In [10]:
def lexrank_keyphrase(text):
    results = []
    for i in range(len(text)):
        results.append(Counter({ 'LEXRANK_SCORE': 0.0 }))
    return results

In [11]:
def train_classifier(features, labels):
    print "STAGE [4] -- TRAINING MODEL -- Logistic Regression ..."
    print '<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>'
    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(features) # Features = List of counters
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, class_weight='auto')
    mod.fit_transform(feature_matrix, labels)
    return mod, feature_matrix

In [16]:
text = documents[0]
txt = u' '.join([ su.text for su in d ])

# Gets a dict of word -> lemma
tokens = textcleaner.clean_text_by_word(text, 'english')
split_text = list(textcleaner.tokenize_by_word(txt))

# Creates the graph and adds the edges
graph = commons.build_graph(keywords._get_words_for_graph(tokens))
keywords._set_graph_edges(graph, tokens, split_text)
del split_text # It's no longer used
commons.remove_unreachable_nodes(graph)

# # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
pagerank_scores = keywords._pagerank_word(graph)

extracted_lemmas = keywords._extract_tokens(graph.nodes(), pagerank_scores, 0.2, None)

lemmas_to_word = keywords._lemmas_to_words(tokens)
keyWords = keywords._get_keywords_with_score(extracted_lemmas, lemmas_to_word)

# # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
combined_keywords = keywords._get_combined_keywords(keyWords, txt.split())

[u'jrr', u'tolkien', u's', u'the', u'lord', u'of', u'the', u'rings', u'is', u'a', u'american', u'fantasy', u'film', u'directed', u'by', u'ralph', u'bakshi', u'it', u'uses', u'a', u'hybrid', u'of', u'traditional', u'cel', u'animation', u'and', u'rotoscoped', u'live', u'action', u'footage', u'it', u'is', u'an', u'adaptation', u'of', u'the', u'first', u'half', u'of', u'the', u'high', u'fantasy', u'epic', u'the', u'lord', u'of', u'the', u'rings', u'by', u'english', u'novelist', u'j', u'r', u'r', u'tolkien', u'set', u'in', u'middle', u'earth', u'the', u'film', u'follows', u'a', u'group', u'of', u'hobbits', u'elves', u'men', u'dwarves', u'and', u'wizards', u'who', u'form', u'a', u'fellowship', u'they', u'embark', u'on', u'a', u'quest', u'to', u'destroy', u'the', u'one', u'ring', u'made', u'by', u'the', u'dark', u'lord', u'sauron', u'and', u'ensure', u'his', u'destruction', u'the', u'film', u'features', u'the', u'voices', u'of', u'william', u'squire', u'john', u'hurt', u'michael', u'graham', 

In [24]:
kw_scores = keywords._format_results(keyWords, combined_keywords, False, True)

results = [Counter({ 'TEXTRANK_KEYWORD_SCORE': keyword_mean_score(su.basic, kw_scores) }) for su in text ]
# return results
# keyWords = textrank_keyword(txt)
# print keyWords

[(u'bakshi', 0.32127375127036567), (u'animated films', 0.26328261360608379), (u'fantasy film', 0.21489284876774678), (u'animation', 0.18444015274085487), (u'animator', 0.18444015274085487), (u'rings', 0.15103920300169268), (u'ring', 0.15103920300169268), (u'frodo', 0.14417937553142948), (u'gandalf', 0.13876424411504765), (u'hobbits', 0.10446001979575247), (u'hobbit', 0.10446001979575247), (u'michael', 0.094512141874428746), (u'book', 0.092336619120817781), (u'books', 0.092336619120817781), (u'artists', 0.092172084111127692), (u'artistic', 0.092172084111127692), (u'artist', 0.092172084111127692), (u'jackson', 0.090977944787364803), (u'aragorn', 0.086515575250571561), (u'adaptation', 0.080420292547296879), (u'adapt', 0.080420292547296879), (u'adaptations', 0.080420292547296879), (u'adapted', 0.080420292547296879), (u'peter', 0.079219023031011734), (u'legolas', 0.078532939131948351), (u'like', 0.078254003137500555), (u'gollum', 0.072906661696006625), (u'reviewer', 0.069644976995455313), (

In [25]:
def keyword_mean_score(sentence, wordScores):
    totalScore = sum([ s for w, s in wordScores if w in sentence ])
    return totalScore / len(sentence.split())

In [12]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# TRAIN:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
features = featurize(documents, surfaceFeatures)
model, featMatrix = train_classifier(features, labels)

scores = cross_val_score(model, featMatrix, labels, scoring="f1_macro") # accuracy, f1, log_loss
print model.coef_
predictions = model.predict(featMatrix)
print metrics.classification_report(labels, predictions)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
******************
[u'jrr', u'tolkien', u's', u'the', u'lord', u'of']
******************


ZeroDivisionError: float division by zero

In [4]:
print documents[0][2].text
print documents[0][2].processed
print documents[0][2].label
print documents[0][2].index
print predictions[2]
print features[2]

It uses a hybrid of traditional cel animation and rotoscoped live action footage.
[(u'hybrid', 'JJ'), (u'traditional', 'JJ'), (u'cel', 'NN'), (u'animation', 'NN'), (u'live', 'JJ'), (u'action', 'NN'), (u'footage', 'NN')]
summary
2
SUMMARY
Counter({'CONTAINS_WORD_TYPE_VB': 1.0, 'CONTAINS_WORD_TYPE_NN': 1.0, 'SENTENCE_LENGTH_2': 1.0, 'CONTAINS_WORD_TYPE_JJ': 1.0, 'CONTAINS_PUNCTUATION_.': 1.0, 'TEXTRANK_SCORE': 0.97408521307294471, 'WORD_RATIO_NN': 0.36363636363636365, 'WORD_RATIO_JJ': 0.2727272727272727, 'WORD_RATIO_VB': 0.18181818181818182})


In [7]:
print featMatrix[2]

[ 0.          0.          0.          0.          1.          0.          0.
  1.          1.          0.          0.          1.          0.          0.
  1.          0.          0.          0.97408521  0.27272727  0.36363636
  0.          0.18181818]


In [None]:
# SUMMARY: +ve