# Summarizers Interface


This file instantiates all the general parameters linked to model testing (and runs the tests).

## Imports and method loading

In [None]:
%matplotlib inline

%run Process_Summary.ipynb

%run Summary_Processes/Generic_Summarizer.ipynb
%run Summary_Processes/Baseline_Summay.ipynb
%run Summary_Processes/Random_Summary.ipynb
%run Summary_Processes/TextRank_TFIDF_Assym_Summary.ipynb

%run datasets_utils/duc.ipynb
%run datasets_utils/dmoz.ipynb
%run datasets_utils/dmoz_html.ipynb

import os
import rouge
import random
import time
import math
from pprint import pprint as pp
import seaborn as sns
import matplotlib.pyplot as plt

## Summarizers Initialization

Define the _option_ variable, which is the list of summarization processes to test.

* Each summarization process is instantiated with its options.
* A summarization process is a model and its routines.
* Models and their routines are defined in the [Summary_Processes folder](http://localhost:8888/tree/web-summary/Summary_Processes).

In [None]:
print("Running summarizers")
import math
# Constant-valued helper functions, followed by the list of all summarizer options to test.
def const_1(x):
    """Constant function: ignore ``x`` and always return 1.

    Passed as the ``lsanbcompfun`` argument of the summarizer processes
    built in the ``option`` list.
    """
    return 1
def const_2(x):
    """Constant function: ignore ``x`` and always return 2."""
    return 2
# Folder of the word-embedding files; in the visible code it is only referenced by the
# commented-out Embeddings_process entries below.
emb_fol = "../wdl/clustering/crawl-dmoz-fr/docWords/"
# `option` is the list of summarizer processes handed to run_summaries().
# Active entries:
#   * one Random_process;
#   * four TextRank_TFIDF_Summarizer_Assym_process instances, one per combination of
#     diag in ("none", "before") and method in ("tr", "lsa"), all built with
#     second positional argument 0.25 * 0 = 0.0, lsanbcompfun = const_1 and bias = 0.5 * 3 = 1.5.
# The commented-out entries are alternative configurations that can be re-enabled.
# NOTE(review): the process classes are not defined in this notebook; they are presumably
# provided by the notebooks loaded with %run in the first cell - confirm.
option = ([#Baseline_process(),
           Random_process(),
          #TextRank_Jaccard_process(1, 0, weighted = False, method = "tr", lsanbcompfun = math.log, diag = "none"),
          #TextRank_Jaccard_process(1, 0, weighted = False, method = "lsa", lsanbcompfun = math.log, diag = "before"),
          #TextRank_TFIDF_Summarizer_Assym_process(1, 0, weighted = False),
          #TextRank_TFIDF_Summarizer_Assym_process(1, 0, weighted = False,
          #                                        method = "lsa", lsanbcompfun = math.log, diag = "before"),
          # Embeddings_process("wordsembeddings.npy", "wordslist.txt", "words.df.csv" , "docs-count.txt", 
          #                   emb_fol, weighted = False, exponentiation = 4, ldanbcompfun = const_1, bias = 0.0),
          #Embeddings_process("wordsembeddings.npy", "wordslist.txt", "words.df.csv" , "docs-count.txt", 
          #                   emb_fol, weighted = False, exponentiation = 4,
          #                   method = "lsa", ldanbcompfun = const_1),
         ]
          + [TextRank_TFIDF_Summarizer_Assym_process(1, 0.25*i, weighted = False, method = method, lsanbcompfun = fun, diag = diag, bias = 0.5*bias)
              for diag in ["none","before"] for i in range(1) for method in ["tr", "lsa"] for fun in [const_1] for bias in [3]]
         #+ [ TextRank_Jaccard_process(1, 0.25*i, weighted = False, method = m, lsanbcompfun = const_1, diag = diag, bias = 0.5*bias)
         #     for i in range(1) for diag in ("none", "before") for m in ("tr", "lsa") for bias in [3]] 
         #+ [TextRank_TFIDF_Summarizer_Assym_process(1, 0.25*i, weighted = False, method = method, lsanbcompfun = fun, diag = diag, bias = 0.5*bias)
         #     for diag in ["none","before"] for i in range(1) for method in ["lsa","tr"] for bias in [3] for fun in [const_1] ]
          #+ [Embeddings_process("wordsembeddings.npy", "wordslist.txt", "words.df.csv" , "docs-count.txt", 
         #                    emb_fol, a = 1, b = 0.25*i, weighted = False, method = method, lsanbcompfun = fun, diag = diag, bias = 0.5*bias)
         #     for diag in ["none","before"] for i in range(1) for method in ["lsa","tr"] for bias in [3] for fun in [const_1] ]
         )


# NOTE(review): `folder` is not referenced anywhere else in the visible part of this notebook.
folder = "../wdl/resume/docs/"

## Summarizers testing process

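* Load stop words (optional; currently disabled in the code)
* Cut the corpus into segments (tokenization with the `brutal`, `overlap`, `nltk` or `spacy` method)
* Generate the corpora
* Run every summarizer on each segmentation, compute the ROUGE-1 scores and save them to a CSV file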

In [None]:
import pandas as pd
from datetime import date

# Directory in which the score CSV files are written.
SCORE_DIR = "../scores"

# Score file of the day: the file name is prefixed with today's date (dd-mm-yy).
SCORE_FILE = os.path.join(
    SCORE_DIR,
    "%s_bias_tfidf_summarizers_scores.csv" % date.today().strftime("%d-%m-%y"),
)

# Column names of the score CSV, in the order the row values are written.
COLS_NAME = [
    "summarizer_name",
    "exec_time",
    "avg_rl_f",
    "avg_rl_p",
    "avg_rl_r",
    "text_cut",
    "sampling_size",
]

def save_summarizer_score(summarizer_name, time, average_score, text_cut, sampling_size, clear_prev=False):
    """
    Append the average ROUGE scores of one summarizer run to the score CSV file.

    The row is written to SCORE_FILE; the file (and the SCORE_DIR directory) is created
    when it does not exist yet. The CSV columns are those listed in COLS_NAME:
    summarizer_name, exec_time, avg_rl_f, avg_rl_p, avg_rl_r, text_cut, sampling_size.

    :param summarizer_name:    Name of the summarizer model.
    :param time:               Time the model took to create the summaries
                               (note: this name shadows the ``time`` module inside the function).
    :param average_score:      Mapping with the keys "f", "p" and "r" holding the average scores.
    :param text_cut:           Identifier of the text-cutting method; stored as ``str(text_cut)``.
    :param sampling_size:      The % of document selected.
    :param clear_prev:         Whether previous entries for this model should be deleted
                               before the new row is added.
    """

    # Creates the directory for scores if it doesn't exist.
    os.makedirs(SCORE_DIR, exist_ok=True)

    # Build the new row directly as a DataFrame: no numpy round-trip, so every value
    # keeps its own type instead of being coerced to a string.
    new_row = pd.DataFrame(
        [[summarizer_name, time, average_score["f"], average_score["p"], average_score["r"],
          str(text_cut), sampling_size]],
        columns=COLS_NAME,
    )

    frames = []
    # Loads the csv file if it exists
    if os.path.exists(SCORE_FILE):
        previous = pd.read_csv(SCORE_FILE, index_col=0)
        if clear_prev:
            previous = previous[previous["summarizer_name"] != summarizer_name]
        # Skip an empty frame so that it cannot alter the dtypes of the result.
        if not previous.empty:
            frames.append(previous)
    frames.append(new_row)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    pd.concat(frames, ignore_index=True).to_csv(SCORE_FILE)

In [None]:
def corpus_gen(corpus, language, sampling):
    """
    Load the documents and gold summaries of the requested corpus.

    :param corpus:    Corpus identifier: "dmoz", "dmoz-html", or anything else for DUC.
    :param language:  Forwarded unchanged to the corpus loader.
    :param sampling:  Forwarded unchanged to the corpus loader as ``sampling``.
    :return:          A ``(docs, gold_tokenized_summaries, biased_vocab)`` tuple;
                      ``biased_vocab`` is None unless the corpus is "dmoz-html".
    """
    # Only the HTML variant of dmoz returns a third value (the biased vocabulary).
    if corpus == "dmoz-html":
        docs, gold_tokenized_summaries, biased_vocab = generate_corpus_dmoz_html(language, sampling=sampling)
        return docs, gold_tokenized_summaries, biased_vocab

    # Every identifier other than "dmoz" falls back to the DUC loader.
    loader = generate_corpus if corpus == "dmoz" else generate_corpus_duc
    docs, gold_tokenized_summaries = loader(language, sampling=sampling)
    return docs, gold_tokenized_summaries, None

In [None]:
def rouge_preprocess(docs, gold_sum_dict, pred_sum, overall, len_summary, gen, stops, k):
    """
    Build the hypothesis / reference texts that are handed to the ROUGE evaluator.

    :param docs:           Dictionary mapping a document url to its list of segments.
    :param gold_sum_dict:  Dictionary mapping a document url to its gold summary.
    :param pred_sum:       List of predicted summaries; only ``pred_sum[k]`` is read. It maps
                           a document url to the ids of the selected segments.
    :param overall:        Dictionary whose keys restrict the documents that are processed.
    :param len_summary:    Number of words of the final summary.
    :param gen:            Tuple describing the text cut. When it has more than two elements,
                           ``gen[1]`` is the number of words read from each segment and
                           ``gen[2]`` the word offset between two consecutive segments.
    :param stops:          Stop words forwarded to ``join_and_cut_at``.
    :param k:              Index of the summary to use in ``pred_sum``.
    :return:               ``(all_hypothesis, all_references)``, two lists aligned document
                           by document.
    """
    hypotheses = []
    references = []

    for url in set(overall.keys()).intersection(docs.keys()):
        if len(gen) > 2:
            # Overlapping segments share words: rebuild the text by global word position
            # so that a word covered by two selected segments is emitted only once.
            # NOTE(review): assumes every selected segment holds at least gen[1] words,
            # otherwise the lookup below raises IndexError - confirm with prepare_corpus.
            width, step = gen[1], gen[2]
            used_positions = set()
            pieces = []
            summary_full = False
            for seg_id in pred_sum[k][url]:
                seg_words = docs[url][seg_id].split()
                start = seg_id * step
                for position in range(start, start + width):
                    if position in used_positions:
                        continue
                    used_positions.add(position)
                    pieces.append(" " + seg_words[position - start])
                    if len(used_positions) == len_summary:
                        summary_full = True
                        break
                if summary_full:
                    break
            hypothesis = "".join(pieces)
        else:
            selected = [docs[url][seg_id] for seg_id in pred_sum[k][url]]
            hypothesis = join_and_cut_at(selected, len_summary, stops)

        hypotheses.append(hypothesis)
        references.append(join_and_cut_at(gold_sum_dict[url], len_summary, stops))

    print("Rouge preprocess done")
    return hypotheses, references

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def run_summaries(summarizer_test_list, len_summary, rouge_score, corpus, language, sampling=1.0):
    """
    Run every summarizer on every text-cutting variant of a corpus and score it with ROUGE-1.

    For each text cut and each summarizer, the predicted summaries are generated, the mean
    ROUGE-1 values over all documents are computed, stored in ``rouge_score`` and appended
    to the score CSV file through ``save_summarizer_score``.

    The ROUGE evaluator is read from the global name ``evaluator``, which must be defined
    before this function is called.

    :param summarizer_test_list:       List of summarizer process to be tested.
    :param len_summary:  Number of words making the final summary.
    :param rouge_score:  Two-level dictionary filled in place as
                         ``rouge_score[text_cut][summarizer_name] = {"rouge-1": mean_values}``;
                         the first level must create missing entries (e.g. ``defaultdict(dict)``).
    :param corpus:       String to identify the corpus to use.
    :param language:     String to indicate the language; forwarded to the corpus helpers.
                         (Stop-word loading from the language is currently disabled.)
    :param sampling:     Threshold value for document selection.
    :return:             ``(docs, gold_sum_dict, pred_sum)``: the documents and gold summaries
                         of the LAST text cut, and the list of all predicted summaries
                         (one entry per text cut and summarizer, in execution order).
    """

    # List of hypothetic / predicted summaries; one entry is appended per (text cut, summarizer).
    pred_sum = []
    # Index in pred_sum of the entry produced by the current (text cut, summarizer) pair.
    k = 0

    raw_docs, raw_gold_sum_dict, raw_biased_vocab = corpus_gen(corpus, language, sampling = sampling)

    # Corpus necessary for tfidf fit: one entry per value of raw_docs.
    # NOTE: this rebinds the `corpus` parameter; the corpus name is not used past this point.
    # NOTE(review): TfidfVectorizer.fit expects an iterable of strings - confirm that the
    # values of raw_docs are strings.
    corpus = [doc for doc in raw_docs.values()]

    # Build the vectorizer only once.
    # Builds the idf matrix with the whole corpus.
    # Also builds the vocabulary of the corpus. It's a dictionary mapping a word to its feature index.
    vectorizer = TfidfVectorizer()
    # Learn our representation space, ie, its dimension (vocabulary size) and the idf factors.
    vectorizer.fit(corpus)

    # Build the biased vocabulary : a dictionary mapping a word index to its weight.
    # It stays None when the corpus provides no raw biased vocabulary.
    biased_vocab = None
    if raw_biased_vocab is not None:
        vocab = vectorizer.vocabulary_
        biased_vocab = build_vocab_bias(vocab, raw_biased_vocab, get_html_bias())

    # Choice of the corpus cutting method.
    # Alternative (shorter) lists of cuts kept for quick experiments:
    #for gen in [('overlap', 5, 2)]:
    #for gen in [('brutal', 5)] + [('overlap', 5, 2), ('overlap', 10, 4)]+[('nltk',)] + [('spacy',)]:
    for gen in [('brutal',i*5) for i in range(1,4)] + [('overlap', i*5, i*2) for i in range(1,4)]+[('nltk',)] + [('spacy',)]:
        print("\n####"+"-".join([str(x) for x in gen])+"####")

        # Stop words are currently disabled (empty set); the disabled code loaded them per language.
        stops = set() #if language is None else set(stopwords.words(language))

        docs, gold_sum_dict = prepare_corpus(language, gen, raw_docs, raw_gold_sum_dict)
        # Build overall: the documents that have both a text and a gold summary.
        overall = {x : "" for x in set(docs.keys()).intersection(gold_sum_dict.keys())}
        assert(len(overall) != 0)
        assert(len(docs) != 0)

        for summarizer in summarizer_test_list :
            # Share the single fitted vectorizer with the summarizer.
            summarizer.vectorizer = vectorizer

            # Creation of hypothetic summaries with the summarizer being tested.
            print("\n"+summarizer.__name__)
            s = time.time()
            # The fourth argument is gen[2] for 3-element cuts ('overlap', ...), 0 otherwise.
            pred_sum.append(doc_summarizer(docs, summarizer, len_summary, 0 if len(gen) < 3 else gen[2], biased_vocab))
            e = time.time()
            print("\nTime :", "{:.2f}s".format(e-s))
            print("Summary done")

            assert(len(docs) == len(pred_sum[k]))

            # If overlap tokenization method was used, rebuild the sentences as initially found in the document.
            all_hypothesis, all_references = rouge_preprocess(docs, gold_sum_dict, pred_sum, overall, len_summary,
                                                              gen, stops, k)
            assert(len(all_hypothesis) == len(all_references))

            # Compute the ROUGE scores: mean of each ROUGE-1 value over all documents.
            under = [x["rouge-1"] for x in evaluator.get_scores(all_hypothesis, all_references)]
            # The `k` of this comprehension is local to it and does not affect the counter k above.
            med = { k : sum(t[k] for t in under)/len(under) for k in under[0] }
            rouge_score["-".join([str(x) for x in gen])][summarizer.__name__] = {"rouge-1" : med}
            print("Rouge done : "+ str(med))
            save_summarizer_score(summarizer.__name__, "{:.2f}s".format(e-s), med, "-".join([str(x) for x in gen]), sampling)

            k += 1

    # Return predicted summaries (docs and gold_sum_dict are those of the last text cut).
    return docs, gold_sum_dict, pred_sum

In [None]:
# Imported here so that this cell does not depend on a notebook loaded with %run
# to bring defaultdict into scope.
from collections import defaultdict

# ROUGE evaluator restricted to ROUGE-1; run_summaries() reads it as a global.
evaluator = rouge.Rouge(metrics=['rouge-1'],)
# Empty dictionary in which run_summaries() stores the mean ROUGE-1 scores:
# rouge_score[text_cut][summarizer_name] = {"rouge-1": mean_values}.
rouge_score = defaultdict(dict)

# Number of words making the final summary.
len_summary = 20

docs, gold, summaries = run_summaries(option,
                                      len_summary,
                                      rouge_score,
                                      corpus="dmoz-html",
                                      language="french",
                                      sampling=0.01)

## Display methods

In [None]:
from IPython.core.display import display, HTML

def html_gen(docs, summaries):
    """
    Display each document as HTML, colouring the sentences selected by the summaries.

    Only the documents present in both ``docs`` and ``summaries[0]`` are displayed.
    Up to three summaries are taken into account: a sentence selected by the first,
    second or third summary gets 160 on the red, green or blue channel respectively
    (0 otherwise). A document that is missing from the second or third summary simply
    gets no colour on that channel.

    The document url and the sentences are HTML-escaped before being inserted in the
    markup, so that characters such as ``<`` or ``&`` are shown literally.

    :param docs:       Dictionary mapping a document url to its list of sentences (strings).
    :param summaries:  List of dictionaries mapping a document url to the ids
                       (positions in the document) of the selected sentences.
    """
    # Local import: standard library only, nothing else in the notebook needs it.
    from html import escape

    # At most three summaries can be shown, one per RGB channel.
    nb_channels = min(len(summaries), 3)

    for doc_url in set(docs.keys()).intersection(set(summaries[0].keys())):
        # One set of selected sentence ids per channel, built once per document so
        # that the membership tests below are O(1).
        selected = [set(summaries[channel].get(doc_url, ())) for channel in range(nb_channels)]

        # Head string with name of the file
        parts = ['<h2>' + escape(doc_url) + '</h2><p style="text-align:justify">']

        for i, sentence in enumerate(docs[doc_url]):
            # Color assignment: 160 on the channel of every summary that kept the sentence.
            rgb = [160 if i in chosen else 0 for chosen in selected]
            rgb += [0] * (3 - len(rgb))

            # Generating the colored sentence
            parts.append('<span style="color:rgb(' + ','.join(str(c) for c in rgb) + ')'
                         + ';background-color:rgb(255,255,255)">'
                         + escape(sentence)
                         + " </span>")

        # Display of the document
        display(HTML("".join(parts) + "</p>"))

In [None]:
# Render only the first ten documents (in the iteration order of `docs`).
display_docs = {doc_url: docs[doc_url] for doc_url in list(docs)[:10]}
html_gen(display_docs, summaries)

In [None]:
#print(list(gold.values())[0])

#html_gen(display_docs, gold)

## Display some results

In [None]:
from IPython.display import display

sns.set(rc={'figure.figsize': (13, 9)})

# Flatten the nested rouge_score dictionary into one row per (text cut, summarizer, metric);
# only the recall ('r') value is kept. The "Score" column holds the text-cut label.
rowsR = [
    {"Méthode": method, "Score": text_cut, "Recall": values['r']}
    for text_cut, by_method in rouge_score.items()
    for method, by_metric in by_method.items()
    for values in by_metric.values()
]

# Zero-pad the one-digit sizes, presumably so that the labels sort in numeric order
# ("brutal-05" before "brutal-10") when they are sorted below.
replacement = (["brutal-5", "overlap-5-2"], ["brutal-05", "overlap-05-2"])
dataR = pd.DataFrame(rowsR, columns=["Méthode", "Score", "Recall"])
dataR = dataR.replace(*replacement)

# Recall of every method as a function of the text cut, one line per method.
sns.lineplot(data=dataR, x="Score", y="Recall", hue="Méthode")
plt.legend(bbox_to_anchor=(0.01, 0.3), loc=2, borderaxespad=0.)
plt.show()


# Same data as a table: one row per method, one column per text cut.
dfR = pd.DataFrame(
    index=sorted(set(dataR["Méthode"])),
    columns=sorted(set(dataR["Score"])),
).rename_axis("Rappel", axis="columns")
for _row_id, row in dataR.iterrows():
    dfR.at[row["Méthode"], row["Score"]] = row["Recall"]

# Best methods first, ranked by their recall on the 'brutal-05' cut.
display(dfR.sort_values(['brutal-05'], ascending=False))