# Experiment 1: K-Means using averaged fastText embeddings

In this experiment, summaries are generated by running K-Means clustering on the embedded sentences of a document. The length of the summary is determined by the number of clusters *k*, where *k* equals the desired number of sentences in the summary.
Sentence embeddings are obtained by averaging the individual fastText embeddings of the words in the sentence.

In [1]:
import pandas as pd
import tqdm
import tqdm.notebook
from rouge import Rouge

In [2]:
from Fasttext import FTEmbedder
from Preprocessors import StandardPreprocessor
from Evaluator import USEevaluator
from models.unsupervised import kMeans

In [3]:
# Load the raw test split from a pickled DataFrame.
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [4]:
# Order the rows by language so documents of the same language are processed together
# (presumably so each language's embeddings are loaded only once - confirm in FTEmbedder).
test_data = test_data.sort_values(by="Language")

In [5]:
# Preview the first five rows of the sorted frame.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF,Summary_Loc_Baseline
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...,Yukos unit buyer faces loan claim The owners o...
9456,9456,Isaac hit Louisiana as a hurricane and lingere...,"For Urban Treuil , there 's no escaping the mi...",English,9456,False,"Because of Hurricane Isaac , Treuil 's home in...","For Urban Treuil , there 's no escaping the mi...","For Urban Treuil , there 's no escaping the mi..."
9459,9459,Hundreds of officers resume search hours after...,Hundreds of law enforcement officers searched ...,English,9459,False,Hundreds of law enforcement officers searched ...,"On Friday night , police had surrounded an are...",Hundreds of law enforcement officers searched ...
9462,9462,Martha Burk : A decade of protests opened Augu...,Dividing up the newspapers on a recent weekend...,English,9462,False,This week 's Masters Golf Tournament marks the...,This week 's Masters Golf Tournament marks the...,Dividing up the newspapers on a recent weekend...
9471,9471,Photographer John Ferguson documents the lives...,Jason Griffin straps his right arm in bandages...,English,9471,False,Watch online : the video trailer for the docum...,"Life for a black cowboy was tough , explains M...",Jason Griffin straps his right arm in bandages...


In [6]:
# Build the summarizer under test: the project's kMeans model, given the imported
# FTEmbedder and StandardPreprocessor objects themselves (nothing is instantiated here).
summarizer = kMeans(FTEmbedder, StandardPreprocessor)

In [7]:
# Evaluator used later to compare each generated summary with its reference lead;
# configured with metric="cosine".
# NOTE(review): the name suggests Universal Sentence Encoder embeddings - confirm in Evaluator.
comparator = USEevaluator(metric="cosine")

In [8]:
# Per-document accumulators: generated summaries and their similarity to the lead.
summaries, cosims = [], []

In [9]:
# ROUGE scorer, plus a mapping from row index label to that row's flattened ROUGE values.
rouge = Rouge()
flatdict = dict()

In [10]:
# Summarise every test article and score the summary against its reference lead.
#
# The accumulators are (re)initialised here so that re-running this cell does not
# append a second copy of the results to lists left over from a previous run, which
# would break the length match when they are attached to test_data further down.
summaries = []
cosims = []
flatdict = {}
failed_rows = []  # (index label, error repr) for documents the summarizer could not handle

# Explicit key order matching the R1_*/R2_*/Rl_* column names used when the scores are
# turned into a DataFrame; relying on dict iteration order would silently mislabel the
# columns if the rouge package returned the f/p/r statistics in a different order.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# tqdm.notebook.tqdm is the replacement named by the tqdm_notebook deprecation warning.
for i, row in tqdm.notebook.tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        # NOTE(review): 0.2 presumably controls the summary length - confirm in models.unsupervised.
        smry = summarizer.summarize(row.Body, row.Language, 0.2, sif=False)
    except Exception as err:
        # Best effort: keep going with a blank summary, but remember the failure instead of
        # hiding it. Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
        failed_rows.append((i, repr(err)))
        smry = " "
    summaries.append(smry)
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [scores[metric][stat] for metric in ROUGE_METRICS for stat in ROUGE_STATS]
    cosims.append(comparator.compare(smry, row.Lead))

if failed_rows:
    print(len(failed_rows), "document(s) could not be summarised; first failure:", failed_rows[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=8430.0), HTML(value='')))

Loading embeddings for English
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading embeddings for French
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))


Loading embeddings for German
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!





In [11]:
# Attach the generated summaries as a new column (positional, one entry per row).
test_data = test_data.assign(Summary_Fasttext_Mean=summaries)

In [12]:
# Persist the frame, now including the Summary_Fasttext_Mean column.
# NOTE: this overwrites the same file the data was loaded from, so a later re-run of
# the notebook starts from the already-sorted, already-augmented frame.
test_data.to_pickle('./training_data/test_raw.pkl')

In [13]:
# One row per document (keyed by its index label), one column per ROUGE statistic:
# R1_f, R1_p, R1_r, R2_f, R2_p, R2_r, Rl_f, Rl_p, Rl_r.
rouge_columns = [
    prefix + "_" + stat
    for prefix in ("R1", "R2", "Rl")
    for stat in ("f", "p", "r")
]
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=rouge_columns)

In [14]:
# Join the ROUGE columns onto the articles by matching index labels.
test_data = test_data.merge(r_scores, left_index=True, right_index=True)

In [15]:
# Attach the summary-vs-lead similarity scores as a new column (positional, one per row).
test_data = test_data.assign(cosine_sim=cosims)

In [16]:
# Preview the first five rows with the new summary and score columns.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF,Summary_Loc_Baseline,Summary_Fasttext_Mean,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r,cosine_sim
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...,Yukos unit buyer faces loan claim The owners o...,"""The pledged assets are with Rosneft, so it wi...",0.449704,0.904762,0.299213,0.419162,0.853659,0.277778,0.539683,0.894737,0.386364,0.594428
9456,9456,Isaac hit Louisiana as a hurricane and lingere...,"For Urban Treuil , there 's no escaping the mi...",English,9456,False,"Because of Hurricane Isaac , Treuil 's home in...","For Urban Treuil , there 's no escaping the mi...","For Urban Treuil , there 's no escaping the mi...","But all that pales to what Treuil , the fire c...",0.229249,0.149485,0.491525,0.031873,0.020725,0.068966,0.24581,0.171875,0.431373,0.336664
9459,9459,Hundreds of officers resume search hours after...,Hundreds of law enforcement officers searched ...,English,9459,False,Hundreds of law enforcement officers searched ...,"On Friday night , police had surrounded an are...",Hundreds of law enforcement officers searched ...,There have been no credible sightings since la...,0.223404,0.155556,0.396226,0.043011,0.029851,0.076923,0.228571,0.168421,0.355556,0.419369
9462,9462,Martha Burk : A decade of protests opened Augu...,Dividing up the newspapers on a recent weekend...,English,9462,False,This week 's Masters Golf Tournament marks the...,This week 's Masters Golf Tournament marks the...,Dividing up the newspapers on a recent weekend...,Big argument indeed . Female members will be p...,0.277603,0.180328,0.60274,0.069841,0.045267,0.152778,0.251208,0.167742,0.5,0.594572
9471,9471,Photographer John Ferguson documents the lives...,Jason Griffin straps his right arm in bandages...,English,9471,False,Watch online : the video trailer for the docum...,"Life for a black cowboy was tough , explains M...",Jason Griffin straps his right arm in bandages...,Jason Griffin straps his right arm in bandages...,0.221053,0.15,0.42,0.031915,0.021583,0.061224,0.180556,0.127451,0.309524,0.623698


In [17]:
# Distribution of the R2_f column over the test set.
test_data["R2_f"].describe()

count    8430.000000
mean        0.060295
std         0.097910
min         0.000000
25%         0.000000
50%         0.027027
75%         0.067925
max         0.958333
Name: R2_f, dtype: float64

In [18]:
# Distribution of the R2_p column over the test set.
test_data["R2_p"].describe()

count    8430.000000
mean        0.060930
std         0.134376
min         0.000000
25%         0.000000
50%         0.018948
75%         0.050633
max         1.000000
Name: R2_p, dtype: float64

In [19]:
# Distribution of the R2_r column over the test set.
test_data["R2_r"].describe()

count    8430.000000
mean        0.086378
std         0.112140
min         0.000000
25%         0.000000
50%         0.050000
75%         0.121212
max         1.000000
Name: R2_r, dtype: float64

In [20]:
# Distribution of the summary-vs-lead similarity scores over the test set.
test_data["cosine_sim"].describe()

count    8430.000000
mean        0.484538
std         0.153141
min        -0.044249
25%         0.384365
50%         0.490423
75%         0.593652
max         0.993369
Name: cosine_sim, dtype: float64