# Experiment 2: K-Means using SIF-weighted fastText embeddings

In this experiment, summaries are generated by running K-Means clustering on the embedded sentences of a document. The length of the summary is determined by the number of clusters *k*, where *k* equals the desired number of sentences in the summary.
Sentence embeddings are obtained as the average of the individual fastText word embeddings, weighted by their smooth inverse frequency (SIF).

In [None]:
import pandas as pd
import tqdm
from rouge import Rouge

In [None]:
from Fasttext import FTEmbedder
from Preprocessors import StandardPreprocessor
from Evaluator import USEevaluator
from models.unsupervised import kMeans

In [None]:
# Load the test set from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [None]:
# Order the rows by their Language value.
test_data = test_data.sort_values("Language")

In [None]:
# Preview the first five rows of the sorted frame.
test_data.head(n=5)

In [None]:
# Build the K-Means summarizer from the fastText embedder and the standard
# preprocessor.
summarizer = kMeans(FTEmbedder, StandardPreprocessor)
# Evaluator used to compare a generated summary with its reference text,
# configured with the "cosine" metric.
comparator = USEevaluator(metric="cosine")

In [None]:
# Per-document accumulators: generated summaries and their similarity scores.
summaries, cosims = [], []

In [None]:
# ROUGE scorer with its default configuration.
rouge = Rouge()
# Maps a row index to that row's flattened list of ROUGE values.
flatdict = {}

In [None]:
# Summarize every document in test_data, then score each summary against the
# row's Lead text with ROUGE and with the comparator. Results are collected in
# `summaries`, `flatdict` (keyed by row index) and `cosims`.
#
# ROUGE values are looked up by explicit key, in the same order as the
# R1/R2/Rl x f/p/r column labels they are given later, so the result does not
# depend on the key order of the dict returned by the rouge package.
rouge_metrics = ("rouge-1", "rouge-2", "rouge-l")
rouge_stats = ("f", "p", "r")
for i, row in tqdm.tqdm_notebook(test_data.iterrows(), total=len(test_data.index)):
    try:
        smry = summarizer.summarize(row.Body, row.Language, 0.2, sif=True)
    except Exception as err:
        # Best effort: a document that cannot be summarized gets the blank
        # placeholder. Catching Exception rather than using a bare except
        # lets KeyboardInterrupt through, so the run can still be aborted.
        tqdm.tqdm.write("summarize failed for row {!r}: {!r}".format(i, err))
        smry = " "
    if len(smry) < 5:
        # Treat near-empty output the same way as a failed summary.
        smry = " "
    summaries.append(smry)
    # get_scores returns a list; take the entry for this single pair.
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatlist = [
        scores[metric][stat]
        for metric in rouge_metrics
        for stat in rouge_stats
    ]
    flatdict[i] = flatlist
    cosims.append(comparator.compare(smry, row.Lead))

In [None]:
# Attach the generated summaries as a new column; `summaries` was filled in
# the same row order as test_data.
test_data["Summary_Fasttext_SIF"] = summaries

In [None]:
# Persist the frame, including the new summary column.
# NOTE(review): this writes to the same path the data was loaded from, so the
# original file is overwritten (now sorted by Language) — confirm intended.
test_data.to_pickle('./training_data/test_raw.pkl')

In [None]:
# One row per document index, one column per ROUGE value in flattened order.
r_scores = pd.DataFrame.from_dict(
    flatdict,
    orient="index",
    columns=[
        'R1_f', 'R1_p', 'R1_r',
        'R2_f', 'R2_p', 'R2_r',
        'Rl_f', 'Rl_p', 'Rl_r',
    ],
)

In [None]:
# Join the ROUGE columns onto the documents by matching index values.
test_data = test_data.merge(r_scores, left_index=True, right_index=True)

In [None]:
# Attach the comparator scores collected in the scoring loop.
# NOTE(review): assigned positionally — assumes the merge above kept every
# row in its original order; confirm.
test_data["cosine_sim"] = cosims

In [None]:
# Preview the first five rows with the score columns attached.
test_data.head(n=5)

In [None]:
# Summary statistics of the R2_f column.
test_data["R2_f"].describe()

In [None]:
# Summary statistics of the R2_p column.
test_data["R2_p"].describe()

In [None]:
# Summary statistics of the R2_r column.
test_data["R2_r"].describe()

In [None]:
# Summary statistics of the cosine_sim column.
test_data["cosine_sim"].describe()