# Experiment 4: CRSum using fastText word embeddings

In this experiment, summaries are generated by a CRSum model. CRSum is an attention-based neural network trained to predict the cosine similarity of a sentence to a hypothetical summary. The actual summary is obtained by selecting the *n* sentences with the highest predicted similarity, where *n* is the desired number of sentences in the summary. The model is trained on pre-trained aligned fastText word embeddings (<https://fasttext.cc/docs/en/aligned-vectors.html>). Sentence embeddings are generated implicitly through the hidden layers of the model.

In [None]:
import pandas as pd
import tqdm
from rouge import Rouge 

In [None]:
from models.supervised import CRSum

In [None]:
from Preprocessors import CRSumPreprocessor
from Evaluator import USEevaluator

In [None]:
# Load the raw test set from its pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [None]:
# Order the rows by their Language column so that rows of the same language
# are processed consecutively.
test_data = test_data.sort_values(by=['Language'])

In [None]:
# Preview the first rows of the sorted test set.
test_data.head()

In [None]:
# Build the CRSum summarizer with the CRSum-specific preprocessor. No embedding
# model object is passed in (embedding_model=None).
# NOTE(review): M and N are presumably the context-window sizes of the model;
# confirm against models.supervised.CRSum.
summarizer = CRSum(embedding_model=None, preprocessor=CRSumPreprocessor, M=5, N=5, verbose=False)

In [None]:
# Load the model weights stored in best_model.h5 (path is relative to the
# notebook's working directory).
summarizer.loadWeights("best_model.h5")

In [None]:
# Evaluator used below to compare each generated summary with its reference
# lead; configured with the "cosine" metric.
comparator = USEevaluator(metric="cosine")

In [None]:
# Per-row accumulators filled by the evaluation loop: the generated summary
# text and its similarity score against the reference lead.
summaries, cosims = [], []

In [None]:
# ROUGE scorer, plus a mapping from row index to that row's flattened list of
# ROUGE values.
rouge = Rouge()
flatdict = dict()

In [None]:
# Summarize every test row and score the summary against the reference lead.
#
# For each row this appends the summary to `summaries`, stores the nine ROUGE
# values under the row's index in `flatdict`, and appends the comparator score
# to `cosims`.

# ROUGE variants and per-variant statistics, listed in the exact order of the
# R1_f ... Rl_r column names the flattened values are labelled with afterwards.
# Extracting by explicit key (instead of relying on dict iteration order) keeps
# values and labels aligned regardless of how the rouge package orders its
# result dictionaries.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# (row index, error) for every row whose summarization raised.
failed_rows = []

for i, row in tqdm.tqdm_notebook(test_data.iterrows(), total=len(test_data.index)):
    try:
        smry = summarizer.summarize(row.Body, row.Language, 0.2)
    except Exception as err:
        # Best effort: a failing document gets a blank summary so the run can
        # continue. Catching Exception (not a bare except) keeps Ctrl-C and
        # interpreter shutdown working.
        failed_rows.append((i, repr(err)))
        smry = " "
    if smry == "":
        # The scorer is fed a single space rather than an empty string.
        smry = " "
    summaries.append(smry)

    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [
        scores[metric][stat]
        for metric in ROUGE_METRICS
        for stat in ROUGE_STATS
    ]
    cosims.append(comparator.compare(smry, row.Lead))

if failed_rows:
    print("Summarization failed for {} row(s); first failure: {}".format(
        len(failed_rows), failed_rows[0]))

In [None]:
# Attach the generated summaries as a new column. `summaries` was filled in
# row-iteration order, so the list lines up positionally with test_data.
test_data["Summary_CRSum"] = summaries

In [None]:
# Persist the test set including the new summary column.
# NOTE: this overwrites the same file the test set was loaded from.
test_data.to_pickle('./training_data/test_raw.pkl')

In [None]:
# Turn the per-row ROUGE lists into a DataFrame indexed like test_data, one
# column per (ROUGE variant, statistic) pair.
# NOTE(review): the column labels assume each list is ordered rouge-1, rouge-2,
# rouge-l with f, p, r inside each; verify against the order in which the
# scores were flattened.
r_scores = pd.DataFrame.from_dict(flatdict, orient="index",
                       columns=['R1_f', 'R1_p', 'R1_r', 'R2_f', 'R2_p', 'R2_r','Rl_f', 'Rl_p', 'Rl_r'])

In [None]:
# Join the ROUGE columns onto the test set by matching row index on both sides
# (pandas' default inner join).
test_data = pd.merge(test_data, r_scores, left_index=True, right_index=True)

In [None]:
# Attach the summary-vs-lead similarity scores as a new column.
# NOTE(review): this is a positional assignment; it assumes the merge above
# kept the rows in the order they were iterated - confirm.
test_data["cosine_sim"] = cosims

In [None]:
# Preview the test set with the summary and score columns added.
test_data.head()

In [None]:
# Summary statistics of the R2_f column.
test_data.R2_f.describe()

In [None]:
# Summary statistics of the R2_p column.
test_data.R2_p.describe()

In [None]:
# Summary statistics of the R2_r column.
test_data.R2_r.describe()

In [None]:
# Summary statistics of the summary-vs-lead similarity scores.
test_data.cosine_sim.describe()