In [1]:
from parser import *

import itertools
import random
import pickle
import numpy as np
from operator import itemgetter
from collections import Counter
from itertools import izip
import scipy
import scipy.spatial.distance
from numpy.linalg import svd
from sklearn.cluster import AffinityPropagation
from collections import defaultdict
from sklearn.feature_selection import RFE
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import metrics

In [2]:
from summanlp_textrank import commons, graph, keywords, pagerank_weighted, \
                  summarizer, textrank, textcleaner, textrank_runtime_error

In [3]:
inputXML = "sample_1"
# inputXML = "sample_rawXML"
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

In [4]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# PARSE:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_1 ...
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...


In [5]:
def featurize(documents, surfaceFeatures):
    print "STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ..."
    features = []
    for docIndex, doc in enumerate(documents):
        documentFeatures = extract_document_wide_features(doc)
        documentFeatures.append(surfaceFeatures[docIndex])
        features += [ counter_sum(fl) for fl in izip(*documentFeatures) ]
    return features

In [6]:
def extract_document_wide_features(document):
    documentFeatures = []

    documentFeatures.append(textrank_keyphrase(document))
    documentFeatures.append(lexrank_keyphrase(document))
    documentFeatures.append(textrank_keyword(document))

    return documentFeatures

In [7]:
def counter_sum(counterTuple):
    counterSum = Counter()
    for ele in counterTuple:
        counterSum += ele
    return counterSum

In [8]:
def textrank_keyphrase(text):

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = commons.build_graph([ syntacticUnit for syntacticUnit in text])
    summarizer._set_graph_edge_weights(graph)
    # Remove all nodes with all edges weights equal to zero.
    commons.remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = summarizer._pagerank(graph)

    # Adds the summa scores to the sentence objects.
    # summarizer._add_scores_to_sentences(sentences, pagerank_scores)

    results = []
    for su in text:
        score = (1-pagerank_scores[su.label, su.index]) if (su.label, su.index) in pagerank_scores.keys() else 0.0
        results.append(Counter({ 'TEXTRANK_SCORE': score }))
    return results

In [9]:
def textrank_keyword(text):
    txt = u' '.join([ su.text for su in text ])
    # Gets a dict of word -> lemma
    tokens = textcleaner.clean_text_by_word(text, 'english')
    split_text = list(textcleaner.tokenize_by_word(txt))

    # Creates the graph and adds the edges
    graph = commons.build_graph(keywords._get_words_for_graph(tokens))
    keywords._set_graph_edges(graph, tokens, split_text)
    del split_text # It's no longer used
    commons.remove_unreachable_nodes(graph)

    # # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = keywords._pagerank_word(graph)
    extracted_lemmas = keywords._extract_tokens(graph.nodes(), pagerank_scores, 0.2, None)
    lemmas_to_word = keywords._lemmas_to_words(tokens)
    keyWords = keywords._get_keywords_with_score(extracted_lemmas, lemmas_to_word)
    # # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
    combined_keywords = keywords._get_combined_keywords(keyWords, txt.split())
    kw_scores = keywords._format_results(keyWords, combined_keywords, False, True)

    results = [ Counter({ 'TEXTRANK_KEYWORD_SCORE': keyword_mean_score(su.basic, kw_scores) }) for su in text ]
    return results

In [10]:
def keyword_mean_score(sentence, wordScores):
    totalScore = sum([ s for w, s in wordScores if w in sentence ])
    return totalScore / len(sentence.split())

In [11]:
def lexrank_keyphrase(text):
    results = []
    for i in range(len(text)):
        results.append(Counter({ 'LEXRANK_SCORE': 0.0 }))
    return results

In [12]:
def train_classifier(features, labels):
    print "STAGE [4] -- TRAINING MODEL -- Logistic Regression ..."
    print '<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>'
    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(features) # Features = List of counters
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, class_weight='auto')
    mod.fit_transform(feature_matrix, labels)
    return mod, feature_matrix

In [13]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# TRAIN:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
features = featurize(documents, surfaceFeatures)
model, featMatrix = train_classifier(features, labels)

scores = cross_val_score(model, featMatrix, labels, scoring="f1_macro") # accuracy, f1, log_loss
print model.coef_
predictions = model.predict(featMatrix)
print metrics.classification_report(labels, predictions)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
STAGE [4] -- TRAINING MODEL -- Logistic Regression ...
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[[-0.07702519 -0.15453099 -0.25571491  0.01753503  0.1711909  -0.02846504
   0.68764793  0.81465759 -0.18590128 -0.42057279  0.06774776 -0.59821813
   0.3894049  -0.21775336  0.04138842  0.27759811 -0.03553251  0.06351103
  -0.398737    0.20133042  0.62481678 -0.10930073 -0.27369215]]
             precision    recall  f1-score   support

       BODY       0.96      0.70      0.81       151
    SUMMARY       0.15      0.67      0.24        12

avg / total       0.90      0.69      0.77       163



In [19]:
print documents[0][2].text
print documents[0][2].processed
print documents[0][2].label
print documents[0][2].index
print predictions[2]
print features[2]

It uses a hybrid of traditional cel animation and rotoscoped live action footage.
[(u'hybrid', 'JJ'), (u'traditional', 'JJ'), (u'cel', 'NN'), (u'animation', 'NN'), (u'live', 'JJ'), (u'action', 'NN'), (u'footage', 'NN')]
summary
2
SUMMARY
Counter({'CONTAINS_WORD_TYPE_VB': 1.0, 'CONTAINS_WORD_TYPE_NN': 1.0, 'SENTENCE_LENGTH_2': 1.0, 'CONTAINS_WORD_TYPE_JJ': 1.0, 'CONTAINS_PUNCTUATION_.': 1.0, 'TEXTRANK_SCORE': 0.97408521307294482, 'WORD_RATIO_NN': 0.36363636363636365, 'WORD_RATIO_JJ': 0.2727272727272727, 'WORD_RATIO_VB': 0.18181818181818182, 'TEXTRANK_KEYWORD_SCORE': 0.031252926496807477})


In [7]:
print featMatrix[2]

[ 0.          0.          0.          0.          1.          0.          0.
  1.          1.          0.          0.          1.          0.          0.
  1.          0.          0.          0.97408521  0.27272727  0.36363636
  0.          0.18181818]


In [None]:
# SUMMARY: +ve