In [10]:
%load_ext autoreload
# To reload: 
%autoreload

In [11]:
from parser import *
from trainer import *

In [12]:
# inputXML = "sample_train"
# testXMLFolder = "sample_test"
inputXML = "sample_train_1"
testXMLFolder = "sample_test_1"
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

In [13]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# PARSE:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_train_1 ...
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...


In [14]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# TRAIN:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
features = featurize(documents, surfaceFeatures)
model, featMatrix, vectorizer = train_classifier(features, labels)

scores = cross_val_score(model, featMatrix, labels, scoring="f1_macro") # accuracy, f1, log_loss
print model.coef_
predictions = model.predict(featMatrix)
print metrics.classification_report(labels, predictions)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
STAGE [4] -- TRAINING MODEL -- Logistic Regression ...
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[[-0.1107776  -0.44076819 -0.51537521 -0.26291182 -0.40936365 -0.65803028
   0.8746662   0.47629844  0.38003429 -0.43231561 -0.16282134  0.50850067
  -0.28099118  0.5485241   0.11009769 -0.06869526  0.31151657  0.20628777
   0.12439831 -0.2521433   0.14399234  1.24799225 -0.08775069 -0.47208916]]
             precision    recall  f1-score   support

       BODY       0.96      0.69      0.80       330
    SUMMARY       0.17      0.67      0.26        30

avg / total       0.89      0.69      0.76       360



In [15]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# TEST:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
testDocuments, testLabels = parse_xml_folder(testXMLFolder, summaryFolder, bodyFolder, writeOption='none')
testSurfaceFeatures = process_data(testDocuments)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_test_1 ...
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...


In [16]:
testFeatures = featurize(testDocuments, testSurfaceFeatures)
testFeatureMatrix = vectorizer.transform(testFeatures) # Features = List of counters
print evaluate_trained_classifier(model, testFeatureMatrix, testLabels)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
STAGE [5] -- TESTING -- Logistic Regression ...
<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
[ 0.29644856  0.36601732  0.33363971]
             precision    recall  f1-score   support

       BODY       0.89      0.61      0.73       278
    SUMMARY       0.04      0.17      0.06        24

avg / total       0.83      0.58      0.67       302



In [7]:
def extract_summary(document):
    summary = []
    for su in document:
        if su.label == 'summary':
            summary.append(su.text)
        else:
            break
    return summary

In [8]:
from heapq import nlargest

def extract_top_ranked(document, featureMatrix, num):
    predictions = model.predict_proba(featureMatrix)
    itr = range(len(document))
    topIndexes = nlargest(num, itr, key=lambda i: predictions[i][1])
    topSentences = [ document[index].text for index in topIndexes]
    return topSentences

In [9]:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
# ROUGE TESTING:
# <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
featIndex = 0
rouge_gold_summaries = []
rouge_generated_summaries = []
for i, tdoc in enumerate(testDocuments):
    numSentences = len(tdoc)
    rouge_gold_summaries.append(extract_summary(tdoc))
    rouge_generated_summaries.append(extract_top_ranked(tdoc, testFeatureMatrix[featIndex:featIndex+numSentences], 10))
    featIndex += numSentences

#print rouge_gold_summaries
#print rouge_generated_summaries

In [None]:
pickle_gold_summaries = {}
pickle_generated_summaries = {}
for i in range(len(rouge_gold_summaries)):
    pickle_gold_summaries[i] = rouge_gold_summaries[i]
    pickle_generated_summaries[i] = rouge_generated_summaries[i]

In [None]:
import pickle
pickle.dump( pickle_gold_summaries, open( "pickle_gold_summaries.p", "wb" ) )
pickle.dump( pickle_generated_summaries, open( "pickle_generated_summaries.p", "wb" ) )

In [None]:
%autoreload
from RougeRunner import *

rougeResults = []

loaded_gold_summaries = pickle.load(open( "pickle_gold_summaries.p", "rb" ) )
loaded_generated_summaries = pickle.load( open( "pickle_generated_summaries.p", "rb" ) )

for summaryIndex in range(len(loaded_gold_summaries)):
    gold = loaded_gold_summaries[summaryIndex]
    genSum = loaded_generated_summaries[summaryIndex]
    rougeResults.append(compareUsingRouge(gold, genSum))
    
for result in rougeResults:
    print result