# Experiment 0: Baselines

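In this experiment, summaries are generated using simple heuristics. The baseline below selects the three longest sentences of each article body, keeps them in their original order, and scores the result against the article's lead using ROUGE and cosine similarity.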

In [1]:
import nltk
import pandas as pd
import tqdm
import tqdm.notebook
from nltk.tokenize import sent_tokenize
from rouge import Rouge

In [2]:
from Evaluator import USEevaluator

In [3]:
# Fetch the Punkt sentence-tokenizer data that `sent_tokenize` relies on
# (a no-op when the package is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/swrdata/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the raw test split.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [5]:
# Order the rows by language; the cells below iterate over the frame in this order.
test_data = test_data.sort_values("Language")

In [6]:
# Preview the first rows of the language-sorted frame before any scores are added.
test_data.head()

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF,Summary_Loc_Baseline,Summary_Fasttext_Mean
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...,Yukos unit buyer faces loan claim The owners o...,"""The pledged assets are with Rosneft, so it wi..."
15,15,India's rupee has hit a five-year high after S...,India's rupee hits five-year high India's rupe...,English,15,False,India's rupee hits five-year high India's rupe...,"""The upgrade is positive and basically people ...",India's rupee hits five-year high India's rupe...,"""Money has moved out from India in the first t..."
23,23,The affected vehicles in the product recall ar...,Safety alert as GM recalls cars The world's bi...,English,23,False,Safety alert as GM recalls cars The world's bi...,This is because of possible malfunctions with ...,Safety alert as GM recalls cars The world's bi...,Safety alert as GM recalls cars The world's bi...
27,27,"US growth would reach 4.4% in 2004, but fall t...",Soaring oil 'hits world economy' The soaring c...,English,27,False,Soaring oil 'hits world economy' The soaring c...,The price of crude is about 50% higher than it...,Soaring oil 'hits world economy' The soaring c...,"In its latest bi-annual report, the OECD cut i..."
28,28,"Irish shares have risen to a record high, with...",Irish markets reach all-time high Irish shares...,English,28,False,Irish markets reach all-time high Irish shares...,"Telecoms firm Eircom, which recently revealed ...",Irish markets reach all-time high Irish shares...,The ISEQ index of leading shares closed up 23 ...


In [7]:
# Similarity scorer from the project-local `Evaluator` module, configured with
# the "cosine" metric; used below as `comparator.compare(summary, lead)`.
# NOTE(review): the return type/range of `compare` is not visible here -- confirm.
comparator = USEevaluator(metric="cosine")

In [8]:
# Per-row accumulators, filled in row order by the scoring loop:
# generated summaries and their similarity to the lead.
summaries, cosims = [], []

In [9]:
# Maps each row's index label to a flat list of its ROUGE values.
flatdict = {}
# ROUGE scorer with the library's default settings.
rouge = Rouge()

In [10]:
# Baseline: summarise each article by its N_LONGEST longest sentences (measured
# in whitespace-separated tokens), emitted in original document order, then
# score the summary against the article lead with ROUGE and `comparator`.
N_LONGEST = 3
# NOTE(review): an earlier version also computed round(0.2 * len(sents)) as a
# summary length but never used it; the fixed cut of 3 sentences is kept here.

# Read ROUGE values by explicit key so the flattened order always matches the
# R1_f, R1_p, R1_r, R2_f, ... column labels used when building `r_scores`,
# regardless of the dict ordering returned by the installed `rouge` version.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# Re-initialise the accumulators here so that re-running this cell does not
# append a second copy of the results to those of a previous run.
summaries = []
cosims = []
flatdict = {}
failed_rows = []  # index labels of rows whose body could not be summarised

for idx, row in tqdm.notebook.tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        sents = sent_tokenize(row.Body, row.Language.lower())
        senlens = [len(s.split()) for s in sents]
        # Positions of the longest sentences; `sorted` is stable, so ties are
        # resolved in favour of later sentences, as before.
        longest = sorted(range(len(senlens)), key=lambda pos: senlens[pos])[-N_LONGEST:]
        longest.sort()  # restore document order
        smry = " ".join(sents[pos] for pos in longest)
    except Exception:
        # Best effort: keep going with a blank summary, but remember the row so
        # failures are not silent. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt stop the loop.
        failed_rows.append(idx)
        smry = " "
    if smry == "":
        # A body with no sentences yields an empty join; a single space is used
        # as placeholder (presumably because the scorers reject empty
        # strings -- TODO confirm).
        smry = " "
    summaries.append(smry)
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[idx] = [scores[metric][stat] for metric in ROUGE_METRICS for stat in ROUGE_STATS]
    cosims.append(comparator.compare(smry, row.Lead))

if failed_rows:
    print("Summarisation failed for", len(failed_rows), "rows; a blank summary was used for them.")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=8430.0), HTML(value='')))




In [11]:
# Store the generated summaries; this overwrites the `Summary_Loc_Baseline`
# column already present in the loaded frame. Assignment is positional, so
# `summaries` must be in the same row order as `test_data`.
test_data["Summary_Loc_Baseline"] = summaries

In [12]:
# One row per article (keyed by the article's index label) and one column per
# ROUGE variant/statistic pair: R1_f, R1_p, R1_r, R2_f, ..., Rl_r.
rouge_columns = [
    variant + "_" + stat
    for variant in ("R1", "R2", "Rl")
    for stat in ("f", "p", "r")
]
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=rouge_columns)

In [13]:
# Attach the ROUGE columns to the articles, matching rows on the index.
# - Dropping score columns left over from a previous run keeps this cell
#   idempotent (otherwise a re-run produces `_x` / `_y` suffixed duplicates).
# - `validate="one_to_one"` raises if either index contains duplicate labels,
#   which would otherwise silently duplicate or mis-assign rows.
test_data = pd.merge(
    test_data.drop(columns=r_scores.columns, errors="ignore"),
    r_scores,
    left_index=True,
    right_index=True,
    validate="one_to_one",
)

In [14]:
# Add the similarity scores; assignment is positional, so `cosims` must still be
# in the same row order as `test_data` after the merge above.
test_data["cosine_sim"] = cosims

In [15]:
# Preview the frame with the new summary, ROUGE and similarity columns.
test_data.head()

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF,Summary_Loc_Baseline,Summary_Fasttext_Mean,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r,cosine_sim
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...,Yukos unit buyer faces loan claim The owners o...,"""The pledged assets are with Rosneft, so it wi...",0.45815,0.52,0.409449,0.32,0.363636,0.285714,0.481928,0.512821,0.454545,0.856127
15,15,India's rupee has hit a five-year high after S...,India's rupee hits five-year high India's rupe...,English,15,False,India's rupee hits five-year high India's rupe...,"""The upgrade is positive and basically people ...",India's rupee hits five-year high India's rupe...,"""Money has moved out from India in the first t...",0.382353,0.329114,0.45614,0.268657,0.230769,0.321429,0.414414,0.348485,0.511111,0.783486
23,23,The affected vehicles in the product recall ar...,Safety alert as GM recalls cars The world's bi...,English,23,False,Safety alert as GM recalls cars The world's bi...,This is because of possible malfunctions with ...,Safety alert as GM recalls cars The world's bi...,Safety alert as GM recalls cars The world's bi...,0.744589,0.955556,0.609929,0.707424,0.910112,0.578571,0.788889,0.946667,0.67619,0.799912
27,27,"US growth would reach 4.4% in 2004, but fall t...",Soaring oil 'hits world economy' The soaring c...,English,27,False,Soaring oil 'hits world economy' The soaring c...,The price of crude is about 50% higher than it...,But the outlook was worst for the 12-member eu...,"In its latest bi-annual report, the OECD cut i...",0.543624,0.80198,0.411168,0.472973,0.7,0.357143,0.599034,0.765432,0.492063,0.74305
28,28,"Irish shares have risen to a record high, with...",Irish markets reach all-time high Irish shares...,English,28,False,Irish markets reach all-time high Irish shares...,"Telecoms firm Eircom, which recently revealed ...",Irish markets reach all-time high Irish shares...,The ISEQ index of leading shares closed up 23 ...,0.669565,0.950617,0.516779,0.631579,0.9,0.486486,0.718563,0.923077,0.588235,0.884414


In [16]:
# Distribution of the ROUGE-2 F-score over all test articles.
test_data["R2_f"].describe()

count    8430.000000
mean        0.064370
std         0.106593
min         0.000000
25%         0.000000
50%         0.028423
75%         0.069767
max         0.969325
Name: R2_f, dtype: float64

In [17]:
# Distribution of ROUGE-2 precision over all test articles.
test_data["R2_p"].describe()

count    8430.000000
mean        0.057302
std         0.126776
min         0.000000
25%         0.000000
50%         0.018519
75%         0.048276
max         1.000000
Name: R2_p, dtype: float64

In [18]:
# Distribution of ROUGE-2 recall over all test articles.
test_data["R2_r"].describe()

count    8430.000000
mean        0.105227
std         0.140821
min         0.000000
25%         0.000000
50%         0.057692
75%         0.137931
max         1.000000
Name: R2_r, dtype: float64

In [19]:
# Distribution of the summary-to-lead similarity over all test articles.
test_data["cosine_sim"].describe()

count    8430.000000
mean        0.490970
std         0.152030
min        -0.022299
25%         0.388154
50%         0.498183
75%         0.598949
max         0.952937
Name: cosine_sim, dtype: float64