# Imports
To begin, import these:

In [163]:
# Load IPython's autoreload extension so the %autoreload magic used in later cells is available.
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [173]:
# To reload: run this cell after editing the project modules; a bare %autoreload
# performs a one-off reload of the already-imported modules.
%autoreload

In [174]:
from parser import *
from trainer import *

Folders where files are stored:

In [166]:
# Folder configuration. The two commented-out lines are an alternative pair of
# train/test folders; uncomment them (and comment out the "_1" pair) to switch.
# inputXML = "sample_train"
# testXMLFolder = "sample_test"
# Training XML folder, read by parse_xml_folder in the Parse section.
inputXML = "sample_train_1"
# Test XML folder, read by parse_xml_folder in the Test section.
testXMLFolder = "sample_test_1"
# Summary/body folder names passed to parse_xml_folder next to the XML folder.
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

# Parse
Here is where we read and parse all of the Wikipedia articles.

In [167]:
# Stage 1: parse the XML files in the training folder into documents and their labels.
# NOTE(review): writeOption='none' presumably disables writing summary/body files,
# although the log still reports a (near-instant) write step -- confirm in parser.
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
# Stage 2 (per the log): tokenizing / tagging / stopwords / extracting.
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_train_1 ...
  -- Done. Took 0.100056 seconds process time for parsing 1 xml file(s). Writing now ...
  -- Done. Took 9.00000000001e-05 seconds to write <=2 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 6.968632 seconds process time for processing 1 document(s)


# Train
After parsing, we featurize the documents and then train our model on the resulting features and labels.

In [196]:
# This might take a while: per the log, stage 3 runs TextRank, LexRank and LDA.
features = featurize(documents, surfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 56.708558 seconds process time to featurize 167 vector(s)


First, we try logistic regression:

In [200]:
# Pick up any edits made to the project modules before training.
%autoreload
# Train logistic regression. The Gaussian Naive Bayes cell rebinds the same three
# names (model, featMatrix, vectorizer), so later cells use whichever cell ran last.
model, featMatrix, vectorizer = train_classifier_log_reg(features, labels)
# NOTE(review): `scores` is computed here but never displayed or used in this cell.
scores = cross_val_score(model, featMatrix, labels, scoring="f1_macro") # accuracy, f1, log_loss
#print model.coef_
# The report below is computed on the same data the model was trained on.
predictions = model.predict(featMatrix)
print ""
print metrics.classification_report(labels, predictions)

STAGE [4] -- TRAINING MODEL -- Logistic Regression ...

  -- Done. Took 0.00897200000009 seconds process time to train 167 data points

             precision    recall  f1-score   support

       BODY       1.00      0.95      0.97       155
    SUMMARY       0.60      1.00      0.75        12

avg / total       0.97      0.95      0.96       167



Gaussian Naive Bayes:

In [201]:
# Train Gaussian Naive Bayes. This overwrites the model, featMatrix and vectorizer
# bound by the logistic regression cell; rerun that cell to switch back.
model, featMatrix, vectorizer = train_classifier_gaussian_NB(features, labels)
# NOTE(review): `scores` is computed here but never displayed or used in this cell.
scores = cross_val_score(model, featMatrix, labels, scoring="f1_macro") # accuracy, f1, log_loss
#print model.coef_
# The report below is computed on the same data the model was trained on.
predictions = model.predict(featMatrix)
print ""
print metrics.classification_report(labels, predictions)

STAGE [4] -- TRAINING MODEL -- Gaussian Naive Bayes ...
  -- Done. Took 0.00816600000007 seconds process time to train 167 data points
             precision    recall  f1-score   support

       BODY       1.00      0.08      0.15       155
    SUMMARY       0.08      1.00      0.14        12

avg / total       0.93      0.15      0.15       167



# Test
Then, we test on the held-out data in the folder named by `testXMLFolder`.

In [157]:
# Same two stages as for the training data, applied to the test folder:
# parse the XML files into documents plus labels, then process the documents.
testDocuments, testLabels = parse_xml_folder(testXMLFolder, summaryFolder, bodyFolder, writeOption='none')
testSurfaceFeatures = process_data(testDocuments)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_test_1 ...
  -- Done. Took 0.311815 seconds process time for parsing 5 xml file(s). Writing now ...
  -- Done. Took 0.00011399999994 seconds to write <=10 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 28.733085 seconds process time for processing 5 document(s)


In [158]:
# Featurize the test documents, then map them into a feature matrix using the
# vectorizer returned by training (transform only; nothing is refit here).
testFeatures = featurize(testDocuments, testSurfaceFeatures)
testFeatureMatrix = vectorizer.transform(testFeatures) # Features = List of counters
# `model` and `vectorizer` are whatever the most recently run training cell bound.
print evaluate_trained_classifier(model, testFeatureMatrix, testLabels)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 333.463294 seconds process time to featurize 787 vector(s)
STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.00017600000001 seconds process time to test 787 data points
[ 0.67241421  0.57963124  0.77413793]
             precision    recall  f1-score   support

       BODY       0.97      0.82      0.89       711
    SUMMARY       0.31      0.75      0.44        76

avg / total       0.90      0.81      0.84       787



# ROUGE Analysis 
To evaluate against ROUGE, we first construct a summary for each document.

In [141]:
def extract_summary(document):
    """Collect the gold summary: the texts of the leading 'summary' sentences.

    Walks `document` from the start and gathers `.text` of every unit whose
    `.label` equals 'summary', stopping at the first unit with any other
    label. Summary-labelled units after that point are ignored.

    NOTE(review): the comparison is case-sensitive, while the classification
    reports print 'SUMMARY' -- confirm the sentence labels are lowercase.
    """
    leadTexts = []
    for unit in document:
        if unit.label != 'summary':
            # The first non-summary sentence ends the lead section.
            return leadTexts
        leadTexts.append(unit.text)
    return leadTexts

In [142]:
from heapq import nlargest

def extract_top_ranked(document, featureMatrix, num, classifier=None):
    """Return the texts of the `num` sentences the classifier scores highest.

    Args:
        document: indexable sequence of sentence units exposing `.text`.
        featureMatrix: feature rows for this document, one row per sentence
            and in the same order as `document`.
        num: maximum number of sentences to return.
        classifier: fitted model exposing `predict_proba`. When omitted, the
            notebook-level `model` is used, which is what this function
            always did before; pass it explicitly to avoid depending on
            whichever training cell happened to run last.

    Returns:
        List of sentence texts ordered from highest to lowest score (not in
        document order). Empty when `document` is empty.
    """
    if len(document) == 0:
        # Nothing to rank; do not ask the classifier to score zero rows.
        return []
    if classifier is None:
        classifier = model
    probabilities = classifier.predict_proba(featureMatrix)
    # Column 1 is used as the summary score -- presumably the SUMMARY class,
    # since it sorts after BODY; confirm against classifier.classes_.
    topIndexes = nlargest(num, range(len(document)), key=lambda row: probabilities[row][1])
    return [document[index].text for index in topIndexes]

# Run ROUGE
Now, we run ROUGE against the gold summaries.

In [143]:
# Number of top-ranked sentences that make up each generated summary.
SUMMARY_LENGTH = 10

# featIndex is the row offset of the current document's first sentence inside
# testFeatureMatrix; the slicing assumes the matrix holds one row per sentence,
# stacked in the same order as testDocuments.
featIndex = 0
rouge_gold_summaries = []
rouge_generated_summaries = []
for tdoc in testDocuments:
    numSentences = len(tdoc)
    docFeatures = testFeatureMatrix[featIndex:featIndex + numSentences]
    rouge_gold_summaries.append(extract_summary(tdoc))
    rouge_generated_summaries.append(extract_top_ranked(tdoc, docFeatures, SUMMARY_LENGTH))
    featIndex += numSentences

# Every feature row should have been consumed; otherwise sentences and rows are
# misaligned and the generated summaries were ranked with the wrong features.
assert featIndex == testFeatureMatrix.shape[0], "feature rows do not line up with test sentences"

#print rouge_gold_summaries
#print rouge_generated_summaries

In [144]:
# Re-key both summary lists by document position (0, 1, 2, ...) so that each
# gold summary and its generated counterpart share the same dictionary key.
pickle_gold_summaries = {
    position: rouge_gold_summaries[position]
    for position in range(len(rouge_gold_summaries))
}
pickle_generated_summaries = {
    position: rouge_generated_summaries[position]
    for position in range(len(rouge_gold_summaries))
}

In [145]:
import pickle

# Persist both summary dicts so the ROUGE cell can reload them from disk.
# The `with` blocks close (and flush) each file as soon as its dump finishes,
# instead of leaving the handle to be reclaimed by the garbage collector.
with open("pickle_gold_summaries.p", "wb") as goldFile:
    pickle.dump(pickle_gold_summaries, goldFile)
with open("pickle_generated_summaries.p", "wb") as generatedFile:
    pickle.dump(pickle_generated_summaries, generatedFile)

In [148]:
%autoreload
from RougeRunner import *

rougeResults = []

loaded_gold_summaries = pickle.load(open( "pickle_gold_summaries.p", "rb" ) )
loaded_generated_summaries = pickle.load( open( "pickle_generated_summaries.p", "rb" ) )

for summaryIndex in range(len(loaded_gold_summaries)):
    gold = loaded_gold_summaries[summaryIndex]
    genSum = loaded_generated_summaries[summaryIndex]
    rougeResults.append(compareUsingRouge(gold, genSum))
        
for result in rougeResults:
    print result
    


In the 20th century, a light-aircraft factory, a college, and a paper mill, along with many smaller enterprises, drove the economy.
The city has three sites on the National Register of Historic Places—Memorial Park Site, a significant pre-European archaeological find; Heisey House, a Victorian-era museum; and Water Street District, an area with a mix of 19th- and 20th-century architecture.
A television station, Havenscope, and a radio station, WLHU, both managed by students, operate on the university campus.
Located near the confluence of the West Branch Susquehanna River and Bald Eagle Creek, it is the principal city of the Lock Haven, Pennsylvania, micropolitan statistical area, itself part of the Williamsport–Lock Haven combined statistical area.
An eight-room home, the Heisey House, restored to its mid-19th century appearance, displays Victorian-era collections; it was added to the National Register of Historic Places in 1972 and is home to the Clinton County Historical Society.
Th