# Experiment 0: Baselines

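In this experiment, summaries are generated with simple heuristics. The baseline below takes the first 20% of each article's sentences as its summary and scores that summary against the article's lead using ROUGE and cosine similarity.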

In [1]:
import nltk
import pandas as pd
import tqdm
from nltk.tokenize import sent_tokenize
from rouge import Rouge
from tqdm.notebook import tqdm as notebook_tqdm

In [2]:
from Evaluator import USEevaluator

In [3]:
# Fetch the Punkt sentence-tokenizer data that sent_tokenize relies on.
# The call is a no-op when the package is already up to date.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/swrdata/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Load the raw test frame from the project's training_data directory.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
test_data = pd.read_pickle("./training_data/test_raw.pkl")

In [5]:
# Order the rows by their Language value.
test_data = test_data.sort_values("Language")

In [6]:
# Preview the first five rows of the sorted frame.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...
14,14,"On Tuesday, the company's administrator, turna...","Parmalat boasts doubled profits Parmalat, the ...",English,14,False,Less welcome was the news that the firm had be...,But a crucial factor for the company's future ...
15,15,India's rupee has hit a five-year high after S...,India's rupee hits five-year high India's rupe...,English,15,False,India's rupee hits five-year high India's rupe...,"""The upgrade is positive and basically people ..."
23,23,The affected vehicles in the product recall ar...,Safety alert as GM recalls cars The world's bi...,English,23,False,Safety alert as GM recalls cars The world's bi...,This is because of possible malfunctions with ...
27,27,"US growth would reach 4.4% in 2004, but fall t...",Soaring oil 'hits world economy' The soaring c...,English,27,False,Soaring oil 'hits world economy' The soaring c...,The price of crude is about 50% higher than it...


In [7]:
# Evaluator used in the scoring loop to compare each generated summary with
# the article's lead via comparator.compare(summary, lead).
# NOTE(review): metric="cosine" presumably selects cosine similarity between
# sentence embeddings -- confirm against the Evaluator module.
comparator = USEevaluator(metric="cosine")

In [8]:
# Accumulators filled by the scoring loop: one generated summary and one
# similarity value per row, in iteration order.
summaries, cosims = [], []

In [9]:
# ROUGE scorer, plus a mapping from row index to that row's flattened
# ROUGE scores (filled by the scoring loop).
rouge = Rouge()
flatdict = dict()

In [10]:
# Lead baseline: take the first SUMMARY_RATIO of each article's sentences as
# its summary, then score that summary against the article's lead with ROUGE
# and with the comparator.
SUMMARY_RATIO = 0.2

# Read the ROUGE results in an explicit metric/statistic order so the
# flattened list always lines up with the R1_f ... Rl_r column labels used
# when the scores are turned into a DataFrame, instead of depending on the
# dict ordering returned by the installed rouge version.
ROUGE_METRICS = ("rouge-1", "rouge-2", "rouge-l")
ROUGE_STATS = ("f", "p", "r")

# Re-initialise the accumulators here so that re-running this cell does not
# append a second copy of every result (which would break the later
# column assignments with a length mismatch).
summaries = []
cosims = []
flatdict = {}
failed_rows = []  # index labels of rows whose body could not be tokenized

for i, row in notebook_tqdm(test_data.iterrows(), total=len(test_data.index)):
    try:
        sents = sent_tokenize(row.Body, row.Language.lower())
        # round() gives 0 for articles with one or two sentences; those rows
        # end up with the " " placeholder summary below.
        sumlen = round(SUMMARY_RATIO * len(sents))
        smry = " ".join(sents[:sumlen])
    except Exception:
        # Best effort: a row that cannot be tokenized (e.g. a non-string body
        # or a language without a tokenizer model) gets the placeholder
        # summary. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt stop this long-running loop.
        failed_rows.append(i)
        smry = " "
    if smry == "":
        # Placeholder for empty summaries.
        # NOTE(review): presumably needed because the scorer rejects an
        # empty hypothesis -- confirm against the rouge package.
        smry = " "
    summaries.append(smry)
    scores = rouge.get_scores(smry, row.Lead)[0]
    flatdict[i] = [scores[metric][stat]
                   for metric in ROUGE_METRICS
                   for stat in ROUGE_STATS]
    cosims.append(comparator.compare(smry, row.Lead))

if failed_rows:
    print(len(failed_rows), "rows could not be tokenized and received a placeholder summary")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=8430.0), HTML(value='')))




In [11]:
# Attach the generated lead-baseline summaries as a new column.
test_data = test_data.assign(Summary_Loc_Baseline=summaries)

In [12]:
# Persist the frame, now including Summary_Loc_Baseline, to the same pickle
# it was loaded from.
# NOTE(review): this overwrites the input file in place and writes the rows
# in Language-sorted order -- confirm downstream notebooks expect that.
test_data.to_pickle('./training_data/test_raw.pkl')

In [13]:
# One row per article (keyed by the original index label), one column per
# flattened ROUGE value collected in the scoring loop.
rouge_columns = [
    'R1_f', 'R1_p', 'R1_r',
    'R2_f', 'R2_p', 'R2_r',
    'Rl_f', 'Rl_p', 'Rl_r',
]
r_scores = pd.DataFrame.from_dict(flatdict, orient="index", columns=rouge_columns)

In [14]:
# Join the ROUGE columns onto the articles by matching index labels.
test_data = test_data.merge(r_scores, left_index=True, right_index=True)

In [15]:
# Attach the comparator scores collected in the scoring loop as a new column.
test_data = test_data.assign(cosine_sim=cosims)

In [16]:
# Preview the first five rows, now including the summary and score columns.
test_data.head(n=5)

Unnamed: 0,index,Lead,Body,Language,ID,isTrain,Summary_CRSum,Summary_Fasttext_SIF,Summary_Loc_Baseline,R1_f,R1_p,R1_r,R2_f,R2_p,R2_r,Rl_f,Rl_p,Rl_r,cosine_sim
2,2,Yukos' owner Menatep Group says it will ask Ro...,Yukos unit buyer faces loan claim The owners o...,English,2,False,Yukos unit buyer faces loan claim The owners o...,Rosneft officials were unavailable for comment...,Yukos unit buyer faces loan claim The owners o...,0.404372,0.660714,0.291339,0.276243,0.454545,0.198413,0.442748,0.674419,0.329545,0.647187
14,14,"On Tuesday, the company's administrator, turna...","Parmalat boasts doubled profits Parmalat, the ...",English,14,False,Less welcome was the news that the firm had be...,But a crucial factor for the company's future ...,"Parmalat boasts doubled profits Parmalat, the ...",0.161765,0.234043,0.123596,0.0,0.0,0.0,0.127273,0.175,0.1,0.221596
15,15,India's rupee has hit a five-year high after S...,India's rupee hits five-year high India's rupe...,English,15,False,India's rupee hits five-year high India's rupe...,"""The upgrade is positive and basically people ...",India's rupee hits five-year high India's rupe...,0.428571,0.512195,0.368421,0.354167,0.425,0.303571,0.506329,0.588235,0.444444,0.573081
23,23,The affected vehicles in the product recall ar...,Safety alert as GM recalls cars The world's bi...,English,23,False,Safety alert as GM recalls cars The world's bi...,This is because of possible malfunctions with ...,Safety alert as GM recalls cars The world's bi...,0.482412,0.827586,0.340426,0.416244,0.719298,0.292857,0.547771,0.826923,0.409524,0.673235
27,27,"US growth would reach 4.4% in 2004, but fall t...",Soaring oil 'hits world economy' The soaring c...,English,27,False,Soaring oil 'hits world economy' The soaring c...,The price of crude is about 50% higher than it...,Soaring oil 'hits world economy' The soaring c...,0.416357,0.777778,0.284264,0.374532,0.704225,0.255102,0.478261,0.758621,0.349206,0.703876


In [17]:
# Distribution of the R2_f column across all rows.
test_data["R2_f"].describe()

count    8430.000000
mean        0.095220
std         0.128683
min         0.000000
25%         0.017544
50%         0.054323
75%         0.122939
max         1.000000
Name: R2_f, dtype: float64

In [18]:
# Distribution of the R2_p column across all rows.
test_data["R2_p"].describe()

count    8430.000000
mean        0.090179
std         0.157655
min         0.000000
25%         0.011315
50%         0.038462
75%         0.092224
max         1.000000
Name: R2_p, dtype: float64

In [19]:
# Distribution of the R2_r column across all rows.
test_data["R2_r"].describe()

count    8430.000000
mean        0.144460
std         0.157358
min         0.000000
25%         0.034483
50%         0.103448
75%         0.210526
max         1.000000
Name: R2_r, dtype: float64

In [20]:
# Distribution of the cosine_sim column across all rows.
test_data["cosine_sim"].describe()

count    8430.000000
mean        0.547844
std         0.165931
min        -0.044249
25%         0.439824
50%         0.563739
75%         0.669051
max         1.000000
Name: cosine_sim, dtype: float64