# Imports
To begin, import these:

In [1]:
# Load IPython's autoreload extension so that edited modules can be reloaded
# without restarting the kernel.
%load_ext autoreload

In [2]:
# To reload: re-run this cell after editing an imported module's source so the
# kernel picks up the new code.
%autoreload

In [3]:
from parser import *
from trainer import *

Folders where files are stored:

In [8]:
# Alternative sample folders (currently disabled); uncomment to switch data sets.
# inputXML = "sample_train"
# testXMLFolder = "sample_test"
# Folder of XML articles used for training.
inputXML = "sample_train_1"
# Folder of XML articles used for testing.
testXMLFolder = "sample_test_1"
# Passed to parse_xml_folder along with writeOption; presumably where extracted
# summary/body text is written when writing is enabled -- TODO confirm in parser.
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

# Parse
Here is where we read and parse all of the Wikipedia articles.

In [9]:
# Parse the XML files in the training folder into documents and their labels.
# writeOption='none' presumably disables writing summary/body files -- TODO confirm in parser.
# NOTE(review): the recorded output of this cell ends in
# "TypeError: expected string or buffer" during the processing stage.
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
# Compute surface features for the parsed documents (the log labels this stage
# "tokenizing/tagging/stopwords/extracting").
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_train_1 ...
  -- Done. Took 0.030704 seconds process time for parsing 1 xml file(s). Writing now ...
  -- Done. Took 3.30000000002e-05 seconds to write <=2 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...


TypeError: expected string or buffer

In [7]:
# Token count of every sentence unit, flattened across all training documents.
allSentenceLengths = [
    len(word_tokenize(unit.text))
    for doc in documents
    for unit in doc
]

# Standard deviation and mean of the sentence lengths (in tokens).
print(np.std(allSentenceLengths))
print(np.average(allSentenceLengths))

12.848466031
25.2694610778


# Train
After parsing, take the documents & labels and then train our model.

In [290]:
# This might take a while...
# NOTE(review): the recorded output of this cell ends in
# "TypeError: 'NoneType' object is not iterable", while the training cells below
# show successful results from earlier execution counts -- re-run from a fresh
# kernel to confirm the pipeline still works end to end.
features = featurize(documents, surfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...


TypeError: 'NoneType' object is not iterable

First, we try logistic regression:

In [257]:
# Train a logistic regression classifier on the training features; the call
# prints the per-class report shown below and its result is kept as logRegMod.
logRegMod = train_and_print_results(features, labels, train_classifier_log_reg)

STAGE [4] -- TRAINING MODEL -- Logistic Regression ...

  -- Done. Took 0.009728 seconds process time to train 167 data points

             precision    recall  f1-score   support

       BODY       1.00      0.95      0.97       155
    SUMMARY       0.60      1.00      0.75        12

avg / total       0.97      0.95      0.96       167



Gaussian Naive Bayes:

In [258]:
# Train a Gaussian Naive Bayes classifier on the same features and labels for comparison.
gaussianNBMod = train_and_print_results(features, labels, train_classifier_gaussian_NB)

STAGE [4] -- TRAINING MODEL -- Gaussian Naive Bayes ...
 
  -- Done. Took 0.00887199999988 seconds process time to train 167 data points

             precision    recall  f1-score   support

       BODY       1.00      0.08      0.15       155
    SUMMARY       0.08      1.00      0.14        12

avg / total       0.93      0.15      0.15       167



In [259]:
# Support Vector Machine (SVM): train on the same features and labels for comparison.
SVMMod = train_and_print_results(features, labels, train_classifier_SVM)

STAGE [4] -- TRAINING MODEL -- Support Vector Machine (SVM) ...
 
  -- Done. Took 0.00827100000015 seconds process time to train 167 data points

             precision    recall  f1-score   support

       BODY       0.97      0.99      0.98       155
    SUMMARY       0.78      0.58      0.67        12

avg / total       0.95      0.96      0.96       167



Based on the above results, we will now choose the model to test with:

In [262]:
# Use the logistic regression model for the test evaluation and for ranking
# sentences in the ROUGE analysis below (both read the notebook-level `model`).
model = logRegMod

# Test
Then, we test on the test data from the `testXMLFolder` folder.

In [247]:
# Parse the XML files in the test folder, using the same options as for the training data.
testDocuments, testLabels = parse_xml_folder(testXMLFolder, summaryFolder, bodyFolder, writeOption='none')
# Compute surface features for the test documents.
testSurfaceFeatures = process_data(testDocuments)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_test_1 ...
  -- Done. Took 0.107826 seconds process time for parsing 1 xml file(s). Writing now ...
  -- Done. Took 7.9999999798e-05 seconds to write <=2 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 7.421173 seconds process time for processing 1 document(s)


Featurize the test documents, then evaluate the model.

In [249]:
# Slow step: the recorded run took about 120 seconds of process time for 227 vectors.
testFeatures = featurize(testDocuments, testSurfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 120.123986 seconds process time to featurize 227 vector(s)


In [263]:
# NOTE(review): `vectorizer` is not defined anywhere in this notebook; presumably it is a
# module-level object from `from trainer import *` that was fitted during training --
# TODO confirm, since this cell fails on a kernel where training has not been run.
testFeatureMatrix = vectorizer.transform(testFeatures) # Features = List of counters
# Evaluate the selected model on the test matrix and print whatever the helper returns.
print evaluate_trained_classifier(model, testFeatureMatrix, testLabels)

STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.000328999999965 seconds process time to test 227 data points
[ 1.          0.68965517  0.58471761]
             precision    recall  f1-score   support

       BODY       0.99      0.88      0.93       217
    SUMMARY       0.26      0.90      0.40        10

avg / total       0.96      0.88      0.91       227



# ROUGE Analysis 
To evaluate against ROUGE, we first construct a summary for each document.

In [264]:
def extract_summary(document):
    """Return the texts of the leading run of 'summary'-labelled sentence units.

    Iteration stops at the first unit whose label is not 'summary', so any
    summary-labelled units appearing after that point are ignored.
    """
    leadingTexts = []
    for unit in document:
        if unit.label != 'summary':
            # First non-summary unit ends the leading summary section.
            return leadingTexts
        leadingTexts.append(unit.text)
    return leadingTexts

In [265]:
from heapq import nlargest

def extract_top_ranked(document, featureMatrix, num, classifier=None):
    """Return the texts of the `num` sentence units with the highest summary score.

    Args:
        document: sequence of sentence units exposing a `.text` attribute.
        featureMatrix: feature rows for `document`, one per sentence unit and in
            the same order; passed unchanged to `predict_proba`.
        num: maximum number of sentences to return.
        classifier: object providing `predict_proba`. Defaults to the
            notebook-level `model`, which preserves the original behaviour.

    Returns:
        List of sentence texts ordered from highest to lowest score (not in
        document order). Contains fewer than `num` items when the document has
        fewer sentence units, and is empty for an empty document.
    """
    # Nothing to rank: return early rather than calling the classifier on an
    # empty feature matrix.
    if len(document) == 0:
        return []

    clf = model if classifier is None else classifier
    # Column 1 is treated as the summary score -- assumes the classifier orders
    # its classes as (BODY, SUMMARY); TODO confirm against the trained model.
    predictions = clf.predict_proba(featureMatrix)
    topIndexes = nlargest(num, range(len(document)), key=lambda i: predictions[i][1])
    return [document[index].text for index in topIndexes]

# Run ROUGE
Now, we run ROUGE against the gold summaries.

In [266]:
# Number of top-ranked sentences kept in each generated summary.
NUM_SUMMARY_SENTENCES = 10

rouge_gold_summaries = []
rouge_generated_summaries = []

# The slicing below assumes testFeatureMatrix holds one row per sentence unit,
# with the documents' rows stacked in the same order as testDocuments, so each
# document owns the next len(tdoc) rows.
featIndex = 0
for tdoc in testDocuments:
    nextFeatIndex = featIndex + len(tdoc)
    rouge_gold_summaries.append(extract_summary(tdoc))
    rouge_generated_summaries.append(
        extract_top_ranked(tdoc, testFeatureMatrix[featIndex:nextFeatIndex], NUM_SUMMARY_SENTENCES)
    )
    featIndex = nextFeatIndex

#print rouge_gold_summaries
#print rouge_generated_summaries

In [267]:
# Key every gold/generated summary pair by its document position so the two
# dictionaries can be matched up again after they are reloaded from disk.
summaryCount = len(rouge_gold_summaries)
pickle_gold_summaries = {
    idx: rouge_gold_summaries[idx] for idx in range(summaryCount)
}
pickle_generated_summaries = {
    idx: rouge_generated_summaries[idx] for idx in range(summaryCount)
}

In [268]:
import pickle

# Persist both summary dictionaries. The `with` blocks guarantee the files are
# flushed and closed, which the bare open(...) calls did not.
with open("pickle_gold_summaries.p", "wb") as goldFile:
    pickle.dump(pickle_gold_summaries, goldFile)
with open("pickle_generated_summaries.p", "wb") as generatedFile:
    pickle.dump(pickle_generated_summaries, generatedFile)

In [269]:
%autoreload
from RougeRunner import *

# Read back the summary dictionaries written by the previous cell. The `with`
# blocks close the files, which the bare open(...) calls did not.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open("pickle_gold_summaries.p", "rb") as goldFile:
    loaded_gold_summaries = pickle.load(goldFile)
with open("pickle_generated_summaries.p", "rb") as generatedFile:
    loaded_generated_summaries = pickle.load(generatedFile)

# Score each generated summary against the gold summary stored under the same
# integer key (keys run from 0 to len - 1).
rougeResults = [
    compareUsingRouge(loaded_gold_summaries[summaryIndex],
                      loaded_generated_summaries[summaryIndex])
    for summaryIndex in range(len(loaded_gold_summaries))
]

for result in rougeResults:
    print(result)
    


0.39272
