# Experiment 3: K-Means using Universal Sentence Encoder

In this experiment, summaries are generated by running K-Means clustering on the embedded sentences of a document. The length of the summary is determined by the number of clusters *k*, which equals the desired number of sentences in the summary.
Sentence embeddings are obtained from Google's Universal Sentence Encoder.

In [1]:
import pandas as pd
import tqdm
import tqdm.notebook
from rouge import Rouge

In [2]:
from models.unsupervised import kMeans

In [3]:
from UniversalSentenceEncoder import USEEmbedder
from Preprocessors import PlaceboPreprocessor
from Evaluator import USEevaluator

In [4]:
# Load the evaluation frame from the local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted source.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [5]:
# Order the rows by language; later cells iterate over the frame in this order.
test_data = test_data.sort_values("Language")

In [6]:
# Peek at the first five rows of the sorted frame.
test_data.head(5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_Fasttext_Mean,Summary_Fasttext_SIF
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,"""The pledged assets are with Rosneft, so it wi...",Rosneft officials were unavailable for comment...
9871,9871,"NEW : An opposition leader says crackdown , no...",A Bahrain court sentenced eight Shiite opposit...,English,9871,False,She was arrested . Rights groups have urged Ba...,"Soon after the judge read the verdict , the de..."
9880,9880,"NEW : Death toll could reach 50,000 , accordin...","SICHUAN , China Li Yunxia wipes away tears as ...",English,9880,False,"Watch parents ' anguished vigil "" The death to...",Other parents wail as soldiers in blue masks t...
9881,9881,Isobel Coleman : Obama mainly addressed domest...,President Obama 's State of the Union address ...,English,9881,False,"On North Korea , boilerplate promises to isola...","On North Korea , boilerplate promises to isola..."
9823,9823,Chelsea beat Aston Villa 3-0 to reach final of...,Holders Chelsea stayed on course for the domes...,English,9823,False,Portsmouth will go into that match as massive ...,Terry had been given the chance to shoot after...


In [7]:
# Build the K-Means summarizer; the embedder and preprocessor are passed
# as imported (they are not instantiated in this cell).
# NOTE(review): PlaceboPreprocessor presumably leaves the text unchanged - confirm.
summarizer = kMeans(USEEmbedder, PlaceboPreprocessor)
# Evaluator used later to compare each generated summary with its reference
# lead; configured here with metric="cosine".
comparator = USEevaluator(metric="cosine")

In [8]:
# Accumulators for the scoring loop: one generated summary and one
# summary/lead similarity value per document.
summaries, cosims = [], []

In [9]:
# ROUGE scorer, plus a dict that collects the flattened ROUGE scores per row index.
rouge = Rouge()
flatdict = dict()

In [10]:
# Summarize every document in `test_data`, then score each summary against
# the reference lead with ROUGE and with the embedding-based comparator.

# Explicit flattening order for the ROUGE result. It must match the column
# order used when `flatdict` is turned into a DataFrame
# (R1_f, R1_p, R1_r, R2_f, ...). Relying on the dict iteration order of the
# `rouge` result would silently mislabel columns if the library changed it.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# Start from empty accumulators so that re-running this cell does not append
# a second copy of every result.
summaries.clear()
cosims.clear()
flatdict.clear()
failed_rows = []  # (row index, error repr) for documents that could not be summarized

for i, row in tqdm.notebook.tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        smry = summarizer.summarize(row.Body, row.Language, 0.2, sif=True)
    except Exception as exc:
        # Best effort: fall back to a blank summary so the run continues, but
        # remember the failure instead of hiding it. Catching `Exception`
        # (not a bare `except`) lets a kernel interrupt stop the loop.
        failed_rows.append((i, repr(exc)))
        smry = " "
    summaries.append(smry)

    # get_scores returns a list with one result dict per hypothesis/reference pair.
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [scores[metric][stat] for metric in ROUGE_METRICS for stat in ROUGE_STATS]

    cosims.append(comparator.compare(smry, row.Lead))

if failed_rows:
    print(f"{len(failed_rows)} document(s) could not be summarized; first few: {failed_rows[:5]}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=8430.0), HTML(value='')))

  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))





In [11]:
# Attach the generated summaries as a new column. `summaries` was filled in
# the same order in which the rows of `test_data` were iterated.
test_data["Summary_USE"] = summaries

In [12]:
# Persist the frame, now including the Summary_USE column.
# NOTE(review): this overwrites the same file the data was loaded from.
test_data.to_pickle('./training_data/test_raw.pkl')

In [13]:
# Turn the per-row score lists into a frame: one row per document index and
# one column per ROUGE variant (R1, R2, Rl) and statistic (f, p, r).
rouge_columns = [
    f"{variant}_{stat}"
    for variant in ("R1", "R2", "Rl")
    for stat in ("f", "p", "r")
]
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=rouge_columns)

In [14]:
# Join the ROUGE columns onto the documents by matching row index labels.
test_data = test_data.merge(r_scores, left_index=True, right_index=True)

In [15]:
# Add the summary/lead similarity values collected in the scoring loop.
# NOTE(review): `cosims` is a plain list, so values are matched to rows by
# position - this assumes the preceding merge kept the original row order; confirm.
test_data["cosine_sim"] = cosims

In [16]:
# Inspect the first five rows with the new summary and score columns.
test_data.head(5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_Fasttext_Mean,Summary_Fasttext_SIF,Summary_USE,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r,cosine_sim
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,"""The pledged assets are with Rosneft, so it wi...",Rosneft officials were unavailable for comment...,State-owned Rosneft bought the Yugansk unit fo...,0.523256,1.0,0.354331,0.505882,0.977273,0.34127,0.625,1.0,0.454545,0.720757
9871,9871,"NEW : An opposition leader says crackdown , no...",A Bahrain court sentenced eight Shiite opposit...,English,9871,False,She was arrested . Rights groups have urged Ba...,"Soon after the judge read the verdict , the de...",They were among 21 people who were convicted o...,0.227545,0.171171,0.339286,0.024242,0.018182,0.036364,0.212121,0.16092,0.311111,0.53601
9880,9880,"NEW : Death toll could reach 50,000 , accordin...","SICHUAN , China Li Yunxia wipes away tears as ...",English,9880,False,"Watch parents ' anguished vigil "" The death to...",Other parents wail as soldiers in blue masks t...,China 's population minister recently praised ...,0.314465,0.238095,0.462963,0.050955,0.038462,0.075472,0.34375,0.282051,0.44,0.616351
9881,9881,Isobel Coleman : Obama mainly addressed domest...,President Obama 's State of the Union address ...,English,9881,False,"On North Korea , boilerplate promises to isola...","On North Korea , boilerplate promises to isola...","Notably , China was mentioned only twice -- on...",0.265823,0.214286,0.35,0.064103,0.051546,0.084746,0.268908,0.219178,0.347826,0.512566
9823,9823,Chelsea beat Aston Villa 3-0 to reach final of...,Holders Chelsea stayed on course for the domes...,English,9823,False,Portsmouth will go into that match as massive ...,Terry had been given the chance to shoot after...,Holders Chelsea stayed on course for the domes...,0.27907,0.24,0.333333,0.062992,0.054054,0.075472,0.275229,0.241935,0.319149,0.652152


In [17]:
# Distribution of the ROUGE-2 F-score over all documents.
test_data["R2_f"].describe()

count    8430.000000
mean        0.065149
std         0.099700
min         0.000000
25%         0.000000
50%         0.031447
75%         0.076409
max         1.000000
Name: R2_f, dtype: float64

In [18]:
# Distribution of the ROUGE-2 precision over all documents.
test_data["R2_p"].describe()

count    8430.000000
mean        0.066437
std         0.139268
min         0.000000
25%         0.000000
50%         0.022472
75%         0.058824
max         1.000000
Name: R2_p, dtype: float64

In [19]:
# Distribution of the ROUGE-2 recall over all documents.
test_data["R2_r"].describe()

count    8430.000000
mean        0.091745
std         0.111421
min         0.000000
25%         0.000000
50%         0.057307
75%         0.131148
max         1.000000
Name: R2_r, dtype: float64

In [20]:
# Distribution of the summary/lead similarity over all documents.
test_data["cosine_sim"].describe()

count    8430.000000
mean        0.517712
std         0.150026
min        -0.089866
25%         0.420147
50%         0.525611
75%         0.625512
max         1.000000
Name: cosine_sim, dtype: float64