# Imports
To begin, import these:

In [24]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Bare %autoreload reloads all imported modules once, right now. Re-run this
# cell after editing the parser / trainer modules to pick up the changes.
%autoreload

In [26]:
from parser import *
from trainer import *

Folders where files are stored:

In [13]:
# Alternative input folders, kept for reference; swap the comments to switch.
# inputXML = "sample_train"
# testXMLFolder = "sample_test"
# Folder of XML files parsed in the Parse section (training data).
inputXML = "sample_train_1"
# Folder of XML files parsed in the Test section (held-out data).
testXMLFolder = "sample_test_1"
# Both folders below are passed straight to parse_xml_folder.
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

# Parse
Here is where we read and parse all of the Wikipedia articles.

In [14]:
# Parse the XML in inputXML into documents and labels, then run the
# tokenizing/tagging/stopword/extraction pass reported as STAGE [2] below.
# NOTE(review): what writeOption='none' does with summaryFolder/bodyFolder is
# not visible from this notebook -- confirm in the parser module.
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_train_1 ...
  -- Done. Took 0.034007 seconds process time for parsing 1 xml file(s). Writing now ...
  -- Done. Took 4.39999999999e-05 seconds to write <=2 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 2.915654 seconds process time for processing 1 document(s)


# Train
After parsing, take the documents & labels and then train our model.

In [15]:
# This might take a while: the recorded run below needed ~18 s of process time
# to featurize 84 vectors (STAGE [3]: TextRank, LexRank, LDA).
features = featurize(documents, surfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 18.271223 seconds process time to featurize 84 vector(s)


First, we try logistic regression:

In [27]:
# Keep both return values: the model is scored on the test set later and the
# vectorizer is needed to transform the test features.
# NOTE(review): the report below has support 84, the same count as the
# training vectors -- presumably these are training-set scores; confirm in trainer.
logRegMod, logRegVectorizer = train_and_print_results(features, labels, train_classifier_log_reg)

STAGE [4] -- TRAINING MODEL -- Logistic Regression ...

  -- Done. Took 0.00686999999999 seconds process time to train 84 data points

             precision    recall  f1-score   support

       BODY       0.95      0.82      0.88        74
    SUMMARY       0.35      0.70      0.47        10

avg / total       0.88      0.81      0.83        84



Gaussian Naive Bayes:

In [28]:
# Same training/reporting helper as above, with the Gaussian Naive Bayes trainer plugged in.
gaussianNBMod, gaNBVectorizer = train_and_print_results(features, labels, train_classifier_gaussian_NB)

STAGE [4] -- TRAINING MODEL -- Gaussian Naive Bayes ...
 
  -- Done. Took 0.00622999999999 seconds process time to train 84 data points

             precision    recall  f1-score   support

       BODY       1.00      0.09      0.17        74
    SUMMARY       0.13      1.00      0.23        10

avg / total       0.90      0.20      0.18        84



In [29]:
# Same helper with the SVM trainer. In the recorded report below, SUMMARY
# precision and recall are both 0.00 (support 10).
SVMMod, SVMVectorizer = train_and_print_results(features, labels, train_classifier_SVM)

STAGE [4] -- TRAINING MODEL -- Support Vector Machine (SVM) ...
 
  -- Done. Took 0.00586200000001 seconds process time to train 84 data points

             precision    recall  f1-score   support

       BODY       0.88      1.00      0.94        74
    SUMMARY       0.00      0.00      0.00        10

avg / total       0.78      0.88      0.83        84



Based on the above results, we will now choose the model to test with:

In [32]:
# Logistic regression is the model carried forward to testing. Note that
# `model` is also read as a global by extract_top_ranked in the ROUGE section,
# so re-run the cells below after changing this choice.
model = logRegMod
vectorizer = logRegVectorizer

# Test
Then, we test on the held-out data from the `testXMLFolder` folder.

In [20]:
# Repeat the parse and surface-feature steps on the held-out folder; apart
# from the input folder, the arguments match the training call.
testDocuments, testLabels = parse_xml_folder(testXMLFolder, summaryFolder, bodyFolder, writeOption='none')
testSurfaceFeatures = process_data(testDocuments)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_test_1 ...
  -- Done. Took 0.125541 seconds process time for parsing 1 xml file(s). Writing now ...
  -- Done. Took 9.99999999998e-05 seconds to write <=2 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 8.139814 seconds process time for processing 1 document(s)


Featurize the test documents, then evaluate the model:

In [21]:
# Slow step: the recorded run below took ~127 s of process time for 227 vectors.
testFeatures = featurize(testDocuments, testSurfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 127.416766 seconds process time to featurize 227 vector(s)


In [35]:
# Transform the test features with the vectorizer returned by training, so the
# matrix is built by the same object that produced the training matrix.
testFeatureMatrix = vectorizer.transform(testFeatures) # Features = List of counters
# A parenthesized single-argument print behaves identically on Python 2 and 3.
print(evaluate_trained_classifier(model, testFeatureMatrix, testLabels))

STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.000326000000001 seconds process time to test 227 data points
[ 0.94099617  0.65277778  0.57386364]
             precision    recall  f1-score   support

       BODY       1.00      0.55      0.71       217
    SUMMARY       0.09      1.00      0.17        10

avg / total       0.96      0.57      0.69       227



# ROUGE Analysis 
To evaluate against ROUGE, we first construct a summary for each document.

In [264]:
def extract_summary(document):
    """Collect the text of the leading run of summary sentences.

    Walks `document` from the start and gathers `.text` from each unit
    whose `.label` equals 'summary'. Scanning stops at the first unit with
    any other label, so summary-labelled units that appear later are not
    included. Returns a list of texts (empty if the first unit is not a
    summary unit or the document is empty).
    """
    leadingTexts = []
    for unit in document:
        if unit.label != 'summary':
            break
        leadingTexts.append(unit.text)
    return leadingTexts

In [265]:
from heapq import nlargest

def extract_top_ranked(document, featureMatrix, num, classifier=None):
    """Return the text of the `num` highest-scoring sentences of a document.

    document      -- sized, indexable sequence of units exposing `.text`
    featureMatrix -- feature rows for this document; row i is scored for
                     document[i]
    num           -- maximum number of sentences to return
    classifier    -- object with a `predict_proba` method; when omitted, the
                     notebook-level `model` is used (the original behaviour)

    Sentences are ranked by column 1 of `predict_proba` and returned in
    descending score order, not document order. An empty document or a
    non-positive `num` yields an empty list without calling the classifier.
    NOTE(review): column 1 is assumed to be the summary class -- confirm
    against the class order of the trained model.
    """
    if num <= 0 or len(document) == 0:
        return []
    # Fall back to the global only when no classifier is passed explicitly,
    # so existing three-argument calls keep working.
    scorer = model if classifier is None else classifier
    predictions = scorer.predict_proba(featureMatrix)
    topIndexes = nlargest(num, range(len(document)), key=lambda i: predictions[i][1])
    return [document[index].text for index in topIndexes]

# Run ROUGE
Now, we run ROUGE against the gold summaries.

In [266]:
# Number of top-ranked sentences kept for each generated summary.
numGeneratedSentences = 10

# The slicing below assumes testFeatureMatrix stores one row per sentence,
# with the documents laid out back to back in testDocuments order; featIndex
# is the running row offset of the current document.
featIndex = 0
rouge_gold_summaries = []
rouge_generated_summaries = []
for tdoc in testDocuments:
    numSentences = len(tdoc)
    docFeatureRows = testFeatureMatrix[featIndex:featIndex + numSentences]
    rouge_gold_summaries.append(extract_summary(tdoc))
    rouge_generated_summaries.append(extract_top_ranked(tdoc, docFeatureRows, numGeneratedSentences))
    featIndex += numSentences

#print rouge_gold_summaries
#print rouge_generated_summaries

In [267]:
# Key every summary by its position in the list so that gold and generated
# entries can be paired up through the same integer index after pickling.
pickle_gold_summaries = dict(enumerate(rouge_gold_summaries))
pickle_generated_summaries = dict(
    (position, rouge_generated_summaries[position])
    for position in pickle_gold_summaries
)

In [268]:
import pickle
# Context managers guarantee both files are flushed and closed before a later
# cell reads them back; the inline open() calls never closed their handles.
with open("pickle_gold_summaries.p", "wb") as goldFile:
    pickle.dump(pickle_gold_summaries, goldFile)
with open("pickle_generated_summaries.p", "wb") as generatedFile:
    pickle.dump(pickle_generated_summaries, generatedFile)

In [269]:
%autoreload
from RougeRunner import *

rougeResults = []

# These pickle files are produced by this notebook. pickle.load can execute
# arbitrary code, so never point these paths at files from an untrusted source.
with open("pickle_gold_summaries.p", "rb") as goldFile:
    loaded_gold_summaries = pickle.load(goldFile)
with open("pickle_generated_summaries.p", "rb") as generatedFile:
    loaded_generated_summaries = pickle.load(generatedFile)

# Both dicts are keyed 0..n-1, so the same index pairs a gold summary with
# the summary generated for the same document.
for summaryIndex in range(len(loaded_gold_summaries)):
    gold = loaded_gold_summaries[summaryIndex]
    genSum = loaded_generated_summaries[summaryIndex]
    rougeResults.append(compareUsingRouge(gold, genSum))

# A parenthesized single-argument print behaves identically on Python 2 and 3.
for result in rougeResults:
    print(result)
    


0.39272
