In [11]:
import pandas as pd
import tqdm
import tqdm.notebook
from rouge import Rouge

In [2]:
from Fasttext import FTEmbedder
from Preprocessors import StandardPreprocessor
from models.unsupervised import kMeans

In [3]:
# Load the raw held-out test split.
# NOTE: read_pickle unpickles arbitrary Python objects -- only load files
# that come from a trusted source.
test_pickle_path = "./training_data/test_raw.pkl"
test_data = pd.read_pickle(test_pickle_path)

In [4]:
# Order the rows by language so that articles of the same language are
# contiguous (presumably so each language's embeddings are loaded only once
# while summarizing -- confirm against the summarizer implementation).
test_data = test_data.sort_values("Language")

In [5]:
# Preview the first five rows of the sorted test set.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...,English,3,False
1465,1465,Chester have won only two of their last 10 Lea...,Rush future at Chester uncertain Ian Rush's f...,English,1465,False
1463,1463,Victory took Real to within six points of lead...,Owen delighted with Real display Michael Owen...,English,1463,False
1456,1456,"""We got our goals early and in the minds of so...",Souness delight at Euro progress Boss Graeme ...,English,1456,False
1454,1454,"""For a fourth-choice striker at the start I'm ...",Owen determined to stay in Madrid England for...,English,1454,False


In [6]:
# Build the k-means based summarizer from the project-local `kMeans` class.
# `FTEmbedder` and `StandardPreprocessor` are passed uncalled, i.e. as the
# imported objects themselves rather than as instances.
# NOTE(review): how kMeans uses these two arguments is not visible in this
# notebook -- see models.unsupervised for the actual contract.
summarizer = kMeans(FTEmbedder, StandardPreprocessor)

In [7]:
# Accumulator for the generated summaries, one entry per test row.
summaries = list()

In [12]:
# ROUGE scorer used to compare each summary with its reference lead.
rouge = Rouge()
# Maps a test_data index label to that row's flattened list of ROUGE values.
flatdict = dict()

In [13]:
# Summarize every test article and score the summary against its reference
# lead with ROUGE.
#
# The accumulators are (re)initialised in this cell so that re-running it
# starts from a clean slate instead of appending to the results of an earlier
# (possibly interrupted) run, which would make `summaries` longer than
# `test_data`.
summaries = []
flatdict = {}
failed_rows = []  # index labels whose summarization raised; inspect after the run

# Read the ROUGE values by explicit key, in exactly the order the score
# columns are named downstream (R1_f, R1_p, R1_r, R2_f, ..., Rl_r). Relying on
# the iteration order of the dicts returned by `rouge.get_scores` would
# silently mislabel the columns if the library returned them in another order.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# `tqdm.notebook.tqdm` replaces the deprecated `tqdm.tqdm_notebook` alias.
for i, row in tqdm.notebook.tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        # 0.3 and sif=True are passed through unchanged; their meaning is
        # defined by the summarizer's `summarize` method (not visible here).
        smry = summarizer.summarize(row.Body, row.Language, 0.3, sif=True)
    except Exception:
        # Best effort: a row that cannot be summarized gets a blank summary so
        # the run continues, but its label is recorded instead of being lost.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit stop the loop.
        failed_rows.append(i)
        smry = " "
    summaries.append(smry)

    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [
        scores[metric][stat]
        for metric in ROUGE_METRICS
        for stat in ROUGE_STATS
    ]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=6315.0), HTML(value='')))

Loading embeddings for English
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading embeddings for French
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full

Loading embeddings for German
Done.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/swrdata/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))
  self.explained_variance_ratio_ = exp_var / full_var
  n_local_trials = 2 + int(np.log(n_clusters))





In [14]:
# Attach the generated summaries as a new column; `summaries` was filled in
# the same row order as `test_data`, so positions line up.
test_data = test_data.assign(Summary_Fasttext=summaries)

In [15]:
# Turn the per-row score lists into a DataFrame indexed like `test_data`.
# Column labels are generated as <metric>_<stat>: R1_f, R1_p, R1_r, R2_f, ...
score_columns = [
    metric + "_" + stat
    for metric in ("R1", "R2", "Rl")
    for stat in ("f", "p", "r")
]
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=score_columns)

In [16]:
# Join the ROUGE scores onto the test set by index label.
# Any score columns left over from a previous execution of this cell are
# dropped first: without that, re-running the cell would merge the scores a
# second time and produce `_x` / `_y` suffixed duplicates, breaking later
# column access such as `test_data.R2_f`.
test_data = (
    test_data
    .drop(columns=r_scores.columns, errors="ignore")
    .merge(r_scores, left_index=True, right_index=True)
)

In [17]:
# Preview the first five rows, now including the summary and ROUGE columns.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_Fasttext,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r
3,3,"Rod Eddington, BA's chief executive, said the ...",High fuel prices hit BA's profits British Air...,English,3,False,Reporting its results for the three months to ...,0.262821,0.386792,0.199029,0.083871,0.12381,0.063415,0.231111,0.288889,0.192593
1465,1465,Chester have won only two of their last 10 Lea...,Rush future at Chester uncertain Ian Rush's f...,English,1465,False,Chester have won only two of their last 10 Lea...,0.369048,0.659574,0.256198,0.325301,0.586957,0.225,0.4,0.619048,0.295455
1463,1463,Victory took Real to within six points of lead...,Owen delighted with Real display Michael Owen...,English,1463,False,Midfielder Edu reduced the deficit after half-...,0.325,0.440678,0.257426,0.227848,0.310345,0.18,0.351145,0.403509,0.310811
1456,1456,"""We got our goals early and in the minds of so...",Souness delight at Euro progress Boss Graeme ...,English,1456,False,"Souness also paid tribute to Laurent Robert, w...",0.173333,0.361111,0.114035,0.027027,0.057143,0.017699,0.135922,0.291667,0.088608
1454,1454,"""For a fourth-choice striker at the start I'm ...",Owen determined to stay in Madrid England for...,English,1454,False,"""England is my country. ""England is my country...",0.173554,0.362069,0.11413,0.058333,0.122807,0.038251,0.173913,0.304348,0.121739


In [19]:
# Summary statistics of the ROUGE-2 F-score across all test rows.
test_data["R2_f"].describe()

count    6315.000000
mean        0.052391
std         0.096106
min         0.000000
25%         0.000000
50%         0.018182
75%         0.051948
max         0.857143
Name: R2_f, dtype: float64