In [1]:
import pandas as pd
import tqdm
import tqdm.notebook
from rouge import Rouge

In [2]:
from Fasttext import FTEmbedder
from Preprocessors import StandardPreprocessor
from models.unsupervised import kMeans

In [3]:
# Location of the pickled evaluation set, relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
TEST_DATA_PATH = "./training_data/test_raw.pkl"
test_data = pd.read_pickle(TEST_DATA_PATH)

In [4]:
# Order the rows by language so that articles of one language are processed together.
test_data = test_data.sort_values("Language")

In [5]:
# Preview the first five rows of the sorted frame.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...,English,3,False
1465,1465,Chester have won only two of their last 10 Lea...,Rush future at Chester uncertain Ian Rush's f...,English,1465,False
1463,1463,Victory took Real to within six points of lead...,Owen delighted with Real display Michael Owen...,English,1463,False
1456,1456,"""We got our goals early and in the minds of so...",Souness delight at Euro progress Boss Graeme ...,English,1456,False
1454,1454,"""For a fourth-choice striker at the start I'm ...",Owen determined to stay in Madrid England for...,English,1454,False


In [6]:
# Build the summarizer under evaluation. The imported FTEmbedder and
# StandardPreprocessor objects are handed over as-is (they are not called here).
# NOTE(review): presumably kMeans instantiates them itself, per language — confirm
# against models.unsupervised.
summarizer = kMeans(FTEmbedder, StandardPreprocessor)

In [7]:
# Collects one generated summary per row, in row-iteration order.
summaries = list()

In [8]:
# ROUGE scorer with the package's default configuration.
rouge = Rouge()
# Maps a row's index label to its flat list of ROUGE values.
flatdict = dict()

In [9]:
# Summarise every article body and score the result against its reference lead.
#
# The accumulators are (re)initialised here so that re-running this cell does not
# append a second copy of every summary, which would make `summaries` longer than
# `test_data` and break the later column assignment.
summaries = []
flatdict = {}
summarize_errors = {}  # row index label -> repr() of the exception the summarizer raised

# Explicit lookup order for the ROUGE values. It must match the
# R1/R2/Rl x f/p/r column labels used when `flatdict` is turned into a DataFrame,
# and must not depend on the key order of the dict the rouge package returns.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for i, row in tqdm.notebook.tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        # NOTE(review): 0.3 is passed positionally — presumably the summary length
        # ratio; confirm against kMeans.summarize.
        smry = summarizer.summarize(row.Body, row.Language, 0.3, sif=False)
    except Exception as exc:
        # Best-effort: a failing article must not abort the whole evaluation, so a
        # single-space placeholder is scored instead. The failure is recorded so it
        # can be inspected afterwards. KeyboardInterrupt/SystemExit are no longer
        # swallowed, so the run can still be interrupted.
        summarize_errors[i] = repr(exc)
        smry = " "
    summaries.append(smry)

    # get_scores returns one dict per hypothesis/reference pair; there is only one here.
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [scores[metric][stat]
                   for metric in ROUGE_METRICS
                   for stat in ROUGE_STATS]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=6315.0), HTML(value='')))

Loading embeddings for English
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading embeddings for French
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))


Loading embeddings for German
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))
  n_local_trials = 2 + int(np.log(n_clusters))





In [10]:
# Attach the generated summaries as a new column. `summaries` was filled in
# row-iteration order, so it lines up positionally with the rows of test_data.
test_data["Summary_Fasttext"] = summaries

In [11]:
# Turn the per-row score lists into a frame: one row per index label stored in
# flatdict, one column per position in the score list.
score_columns = ['R1_f', 'R1_p', 'R1_r',
                 'R2_f', 'R2_p', 'R2_r',
                 'Rl_f', 'Rl_p', 'Rl_r']
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=score_columns)

In [12]:
# Inner-join the ROUGE scores onto the articles by index label.
test_data = test_data.merge(r_scores, how="inner", left_index=True, right_index=True)

In [13]:
# Preview the first five rows, now including the summary and ROUGE columns.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_Fasttext,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...,English,3,False,Reporting its results for the three months to ...,0.475904,0.626984,0.383495,0.321212,0.424,0.258537,0.463519,0.55102,0.4
1465,1465,Chester have won only two of their last 10 Lea...,Rush future at Chester uncertain Ian Rush's f...,English,1465,False,Chester have won only two of their last 10 Lea...,0.606383,0.850746,0.471074,0.580645,0.818182,0.45,0.652778,0.839286,0.534091
1463,1463,Victory took Real to within six points of lead...,Owen delighted with Real display Michael Owen...,English,1463,False,Owen delighted with Real display Michael Owen...,0.818182,0.96,0.712871,0.770115,0.905405,0.67,0.842105,0.949153,0.756757
1456,1456,"""We got our goals early and in the minds of so...",Souness delight at Euro progress Boss Graeme ...,English,1456,False,An early own goal followed by an Alan Shearer ...,0.40678,0.571429,0.315789,0.331429,0.467742,0.256637,0.40625,0.530612,0.329114
1454,1454,"""For a fourth-choice striker at the start I'm ...",Owen determined to stay in Madrid England for...,English,1454,False,Owen determined to stay in Madrid England for...,0.47482,0.702128,0.358696,0.376812,0.55914,0.284153,0.521739,0.695652,0.417391


In [15]:
# Summary statistics of the R2_f column over all rows.
test_data["R2_f"].describe()

count    6315.000000
mean        0.068935
std         0.122381
min         0.000000
25%         0.000000
50%         0.023881
75%         0.065217
max         0.844444
Name: R2_f, dtype: float64