# Imports
To begin, import these:

In [97]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
# To reload: 
%autoreload

In [99]:
from parser import *
from trainer import *

Folders where files are stored:

In [4]:
# Alternative dataset folders (currently disabled; swap with the two lines below to use them):
# inputXML = "sample_train"
# testXMLFolder = "sample_test"
# Active folders. inputXML is parsed for training and testXMLFolder for testing;
# summaryFolder and bodyFolder are passed to parse_xml_folder in both cases.
inputXML = "sample_train_1"
testXMLFolder = "sample_test_1"
summaryFolder = "sample_summary"
bodyFolder = "sample_body"

# Parse
Here is where we read and parse all of the Wikipedia articles.

In [6]:
# Parse every XML file in the training folder into documents plus their labels.
# NOTE(review): writeOption='none' presumably disables writing summary/body files -- confirm in parser.
documents, labels = parse_xml_folder(inputXML, summaryFolder, bodyFolder, writeOption='none')
# Run process_data over the parsed documents; its result is passed to featurize() in the Train section.
surfaceFeatures = process_data(documents)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_train_1 ...
  -- Done. Took 0.905723 seconds process time for parsing 13 xml file(s). Writing now ...
  -- Done. Took 0.000198 seconds to write <=26 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 86.337423 seconds process time for processing 13 document(s)


# Train
After parsing, take the documents & labels and then train our model.

In [7]:
# This might take a while: the recorded run below reports ~2116 s of process time for 2544 vectors.
features = featurize(documents, surfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 2115.900845 seconds process time to featurize 2544 vector(s)


First, we try logistic regression:

In [13]:
# Keep both return values: the vectorizer is reused to transform the test features and the
# model is evaluated on the result in the Test section.
# NOTE(review): the printed report's support (2544) equals the number of featurized training
# vectors, so these look like training-set scores -- confirm in train_and_print_results.
logRegMod, logRegVectorizer = train_and_print_results(features, labels, train_classifier_log_reg)

STAGE [4] -- TRAINING MODEL -- Logistic Regression ...

  -- Done. Took 0.107077 seconds process time to train 2544 data points

             precision    recall  f1-score   support

       BODY       0.98      0.84      0.90      2370
    SUMMARY       0.27      0.81      0.40       174

avg / total       0.93      0.84      0.87      2544



Gaussian Naive Bayes:

In [14]:
# Same training call as the other classifiers, with the Gaussian Naive Bayes trainer plugged in;
# the returned model/vectorizer pair is used again in the Test and ROUGE sections.
gaussianNBMod, gaNBVectorizer = train_and_print_results(features, labels, train_classifier_gaussian_NB)

STAGE [4] -- TRAINING MODEL -- Gaussian Naive Bayes ...
 
  -- Done. Took 0.08115 seconds process time to train 2544 data points

             precision    recall  f1-score   support

       BODY       0.97      0.21      0.35      2370
    SUMMARY       0.08      0.92      0.15       174

avg / total       0.91      0.26      0.33      2544



In [100]:
# Support Vector Machine (SVM): same training call as the other classifiers, with the SVM trainer
# plugged in; the returned model/vectorizer pair is used again in the Test and ROUGE sections.
SVMMod, SVMVectorizer = train_and_print_results(features, labels, train_classifier_SVM)

STAGE [4] -- TRAINING MODEL -- Support Vector Machine (SVM) ...
 
  -- Done. Took 0.903484 seconds process time to train 2544 data points

             precision    recall  f1-score   support

       BODY       0.94      1.00      0.97      2370
    SUMMARY       0.69      0.06      0.12       174

avg / total       0.92      0.93      0.91      2544



Based on the above results, we now choose the model to test with:

In [16]:
# Select logistic regression as the model to test with.
# NOTE(review): the later cells visible in this notebook use the per-model variables
# (logRegMod, gaussianNBMod, SVMMod and their vectorizers) directly rather than these
# aliases -- confirm whether `model`/`vectorizer` are still needed.
model = logRegMod
vectorizer = logRegVectorizer

# Test
Then, we test on the held-out data from the `testXMLFolder` folder.

In [32]:
# Same parse_xml_folder + process_data calls as in the Parse section, pointed at the test folder.
testDocuments, testLabels = parse_xml_folder(testXMLFolder, summaryFolder, bodyFolder, writeOption='none')
testSurfaceFeatures = process_data(testDocuments)

<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
STAGE [1] -- PARSING XML -- from sample_test_1 ...
  -- Done. Took 0.484701 seconds process time for parsing 10 xml file(s). Writing now ...
  -- Done. Took 0.000145999999859 seconds to write <=20 summary/body file(s)
STAGE [2] -- PROCESSING DATA -- (tokenizing/tagging/stopwords/extracting) ...
  -- Done. Took 49.571896 seconds process time for processing 10 document(s)


Featurize the test documents, then evaluate each trained model on them.

In [33]:
# Featurize the test documents with the same featurize() call used for training
# (slow: the recorded run below reports ~760 s of process time for 1509 vectors).
testFeatures = featurize(testDocuments, testSurfaceFeatures)

STAGE [3] -- FEATURIZING -- (TextRank, LexRank, LDA) ...
  -- Done. Took 759.657489 seconds process time to featurize 1509 vector(s)


In [57]:
# Transform the test features with the vectorizer returned by logistic-regression training,
# then evaluate the trained model against the test labels and print what the evaluator returns.
testFeatureMatrixLogReg = logRegVectorizer.transform(testFeatures) # Features = List of counters
print evaluate_trained_classifier(logRegMod, testFeatureMatrixLogReg, testLabels)

STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.000527000000147 seconds process time to test 1509 data points
[ 0.62652575  0.71864776  0.68469152]
             precision    recall  f1-score   support

       BODY       0.97      0.84      0.90      1362
    SUMMARY       0.33      0.74      0.46       147

avg / total       0.91      0.83      0.85      1509



In [58]:
# Transform the test features with the vectorizer returned by Gaussian Naive Bayes training,
# then evaluate that model against the test labels and print what the evaluator returns.
# NOTE(review): the recorded log below is labelled "Logistic Regression" although this cell
# evaluates gaussianNBMod; the label presumably comes from evaluate_trained_classifier -- verify there.
testFeatureMatrixGNB = gaNBVectorizer.transform(testFeatures) # Features = List of counters
print evaluate_trained_classifier(gaussianNBMod, testFeatureMatrixGNB, testLabels)

STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.00380399999995 seconds process time to test 1509 data points
[ 0.57202353  0.16290276  0.18290258]
             precision    recall  f1-score   support

       BODY       0.96      0.21      0.34      1362
    SUMMARY       0.11      0.91      0.20       147

avg / total       0.87      0.28      0.33      1509



In [101]:
# Transform the test features with the vectorizer returned by SVM training,
# then evaluate that model against the test labels and print what the evaluator returns.
# NOTE(review): the recorded log below is labelled "Logistic Regression" although this cell
# evaluates SVMMod; the label presumably comes from evaluate_trained_classifier -- verify there.
testFeatureMatrixSVM = SVMVectorizer.transform(testFeatures) # Features = List of counters
print evaluate_trained_classifier(SVMMod, testFeatureMatrixSVM, testLabels)

STAGE [5] -- TESTING -- Logistic Regression ...
  -- Done. Took 0.0438509999999 seconds process time to test 1509 data points
[ 0.47439916  0.47439916  0.47384937]
             precision    recall  f1-score   support

       BODY       0.90      1.00      0.95      1362
    SUMMARY       0.50      0.02      0.04       147

avg / total       0.86      0.90      0.86      1509



# ROUGE Analysis 
To evaluate against ROUGE, we first construct a summary for each document.

In [61]:
def extract_summary(document):
    """Return the texts of the leading run of 'summary'-labelled units.

    Walks `document` in order, collecting `.text` from each unit whose
    `.label` equals 'summary', and stops at the first unit carrying any
    other label. Units labelled 'summary' that occur after that point are
    not included.

    Args:
        document: iterable of objects exposing `.label` and `.text`.

    Returns:
        list of the collected `.text` values, in document order. Empty when
        the document is empty or its first unit is not a summary unit.
    """
    leading_texts = []
    for unit in document:
        if unit.label != 'summary':
            # The first non-summary unit ends the leading summary run.
            return leading_texts
        leading_texts.append(unit.text)
    return leading_texts

In [86]:
from heapq import nlargest

def extract_top_ranked(document, featureMatrix, num, modelToRun):
    """Return the texts of the `num` highest-scoring units of a document.

    Each unit is scored by column index 1 of `modelToRun.predict_proba`
    applied to `featureMatrix` (presumably the probability of the summary
    class -- confirm against the model's class ordering).

    Args:
        document: indexable sequence of objects exposing `.text`.
        featureMatrix: feature rows for this document; row i is used as the
            features of document[i].
        num: maximum number of texts to return.
        modelToRun: classifier object providing `predict_proba`.

    Returns:
        list of `.text` values ordered from highest to lowest score, with at
        most `num` entries (fewer if the document is shorter).
    """
    probabilities = modelToRun.predict_proba(featureMatrix)

    def score_at(position):
        # Second probability column of the row belonging to this unit.
        return probabilities[position][1]

    best_positions = nlargest(num, range(len(document)), key=score_at)

    ranked_texts = []
    for position in best_positions:
        ranked_texts.append(document[position].text)
    return ranked_texts

# Run ROUGE
Now, we run ROUGE against the gold summaries.

In [87]:
from RougeRunner import *

def runROUGEOnMatrix(testFeatureMatrixYeah, model, documents=None, summaryLength=10):
    """Print a ROUGE result per document, then the mean over all documents.

    For each document the gold summary comes from `extract_summary` and the
    generated summary from `extract_top_ranked`; the pair is scored with
    `compareUsingRouge`.

    Args:
        testFeatureMatrixYeah: feature matrix whose rows are the units of all
            documents concatenated in document order; it is sliced into one
            contiguous row range per document.
        model: trained classifier handed to `extract_top_ranked`.
        documents: documents to score. Defaults to the notebook-level
            `testDocuments`, which is what this function always used before
            the parameter existed.
        summaryLength: maximum number of units in each generated summary
            (previously hard-coded to 10).

    Returns:
        None. Results are printed, one per line, followed by
        "Average <mean>".
    """
    if documents is None:
        # Backward-compatible default: fall back to the notebook's global test set.
        documents = testDocuments

    rougeResults = []
    rowStart = 0
    for tdoc in documents:
        # This document's feature rows occupy [rowStart, rowEnd).
        rowEnd = rowStart + len(tdoc)
        gold = extract_summary(tdoc)
        genSum = extract_top_ranked(tdoc, testFeatureMatrixYeah[rowStart:rowEnd], summaryLength, model)
        rougeResults.append(compareUsingRouge(gold, genSum))
        rowStart = rowEnd

    # Single-argument print(...) produces the same output under the Python 2
    # print statement and the Python 3 print function.
    for result in rougeResults:
        print(result)

    print("Average {0}".format(np.mean(rougeResults)))

In [80]:
# ROUGE for Gaussian Naive Bayes: prints one result per test document, then the average.
runROUGEOnMatrix(testFeatureMatrixGNB, gaussianNBMod)

0.38923
0.21937
0.23735
0.36333
0.15921
0.45506
0.33887
0.22013
0.48063
0.13498
Average 0.299816


In [81]:
# ROUGE for logistic regression: prints one result per test document, then the average.
runROUGEOnMatrix(testFeatureMatrixLogReg, logRegMod)

0.34839
0.22434
0.30792
0.29
0.16513
0.0938
0.58267
0.13668
0.4007
0.14767
Average 0.26973


In [102]:
# ROUGE for the SVM: prints one result per test document, then the average.
runROUGEOnMatrix(testFeatureMatrixSVM, SVMMod)

0.38426
0.2141
0.1756
0.42341
0.3253
0.27273
0.36786
0.19591
0.41217
0.10167
Average 0.287301
