# Importing libraries

In [13]:
import datasets
import pandas as pd
import textacy

In [2]:
# Load the three splits of CNN/DailyMail (config "3.0.0"), in the order
# train, test, validation.
train, test, validation = (
    datasets.load_dataset("cnn_dailymail", "3.0.0", split=split_name)
    for split_name in ("train", "test", "validation")
)

In [3]:
# Convert each split to a pandas DataFrame with the dataset's own
# `to_pandas()` method rather than `pd.DataFrame(dataset)`, which makes pandas
# iterate over the dataset one row at a time.
df_train = train.to_pandas()
df_test = test.to_pandas()
df_validation = validation.to_pandas()

In [11]:
# Preview the first rows of the training DataFrame.
df_train.head()

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


In [12]:
# Preview the first rows of the test DataFrame.
df_test.head()

Unnamed: 0,article,highlights,id
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f


# Pre-Processing

In [14]:
from textacy import preprocessing as tprep
from spacy.lang.en.stop_words import STOP_WORDS
import re
from tqdm.autonotebook import tqdm
# Enable the tqdm integration for pandas (`progress_apply` is used further down).
tqdm.pandas()

# Text-cleaning steps, listed in the order they are applied.
_cleaning_steps = (
    # textacy `replace` helpers for e-mails, emojis, URLs, phone numbers,
    # hashtags and currency symbols.
    tprep.replace.emails,
    tprep.replace.emojis,
    tprep.replace.urls,
    tprep.replace.phone_numbers,
    tprep.replace.hashtags,
    tprep.replace.currency_symbols,
    # Turn every newline into a single space.
    lambda text: re.sub(r"\n", " ", text),
    # textacy `remove` helpers for HTML tags and brackets.
    tprep.remove.html_tags,
    tprep.remove.brackets,
    # textacy `normalize` helpers; whitespace normalization runs last.
    tprep.normalize.hyphenated_words,
    tprep.normalize.quotation_marks,
    tprep.normalize.unicode,
    tprep.normalize.bullet_points,
    tprep.normalize.whitespace,
)

# Single callable that chains all the steps above.
process = tprep.make_pipeline(*_cleaning_steps)

In [15]:
def sample_df(df, frac=0.1, random_state=42):
    """Return a seeded random sample of ``df`` with a fresh 0..n-1 index.

    Args:
        df: DataFrame to sample from.
        frac: Fraction of rows to draw, forwarded to ``DataFrame.sample``.
        random_state: Seed forwarded to ``DataFrame.sample``. Defaults to 42,
            the value that used to be hard-coded, so existing calls are
            unaffected.

    Returns:
        A new DataFrame holding the sampled rows, with the original index
        dropped.
    """
    sampled = df.sample(frac=frac, random_state=random_state)
    return sampled.reset_index(drop=True)

def preprocess(df):
    """Apply the ``process`` cleaning pipeline to the text columns of ``df``.

    Args:
        df: DataFrame with ``article`` and ``highlights`` columns.

    Returns:
        A new DataFrame in which both columns hold the cleaned text. The
        input frame is not modified.
    """
    # `progress_apply` is the tqdm-enabled variant of `apply`; the article
    # column is processed first, then the highlights column.
    return df.assign(
        article=df["article"].progress_apply(process),
        highlights=df["highlights"].progress_apply(process),
    )

# Sample 1% of the training frame and 10% of the test frame, then clean the
# text columns of each sample.
ds = preprocess(sample_df(df_train, frac=0.01))
ds_test = preprocess(sample_df(df_test, frac=0.1))

  0%|          | 0/2871 [00:00<?, ?it/s]

  0%|          | 0/2871 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

  0%|          | 0/1149 [00:00<?, ?it/s]

In [19]:
# Take row 0 of the cleaned training sample: the article and its highlights.
article0 = ds.loc[0, 'article']
highlights0 = ds.loc[0, 'highlights']
article0

"Nasa has warned of an impending asteroid pass - and says it will be the closest until 2027. The asteroid, designated 2004 BL86, will safely pass about three times the distance of Earth to the moon on January 26. It will be the closest by any known space rock this large until asteroid 1999 AN10 flies past Earth in 2027. See the Asteroid's route below . At the time of its closest approach on January 26, the asteroid will be approximately 745,000 miles from Earth. Due to its orbit around the sun, the asteroid is currently only visible by astronomers with large telescopes who are located in the southern hemisphere. But by Jan. 26, the space rock's changing position will make it visible to those in the northern hemisphere. From its reflected brightness, astronomers estimate that the asteroid is about a third of a mile in size. At the time of its closest approach on January 26, the asteroid will be approximately 745,000 miles from Earth. 'Monday, January 26 will be the closest asteroid 2004

In [20]:
# Display the highlights text taken from row 0 of `ds`.
highlights0

"2004 BL86 will pass about three times the distance of Earth to the moon . Estimate that the asteroid is about a third of a mile in size . Nasa says it poses no threat to Earth 'for the foreseeable future'"

In [26]:
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sumy.summarizers.lsa import LsaSummarizer
from nltk import tokenize
import nltk
# Download NLTK's 'punkt' resource. Presumably sumy's Tokenizer needs it for
# sentence splitting — TODO confirm.
nltk.download('punkt')

def textrank_summary(text, num_summary_sentence, language='english'):
    """Summarize ``text`` by extracting sentences with sumy's TextRank.

    Args:
        text: Plain-text document to summarize.
        num_summary_sentence: Number of sentences requested from the
            summarizer.
        language: Language name passed to the stemmer, the tokenizer and the
            stop-word lookup.

    Returns:
        A list with the selected sentences converted to strings.
    """
    stemmer = Stemmer(language)
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = TextRankSummarizer(stemmer)
    # English keeps spaCy's stop-word list, as before. Any other language uses
    # sumy's own list, so the stop words match the stemmer and tokenizer
    # instead of silently staying English.
    if language.lower() == 'english':
        summarizer.stop_words = STOP_WORDS
    else:
        summarizer.stop_words = get_stop_words(language)

    return [
        str(sentence)
        for sentence in summarizer(parser.document, num_summary_sentence)
    ]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T2ABIZZ\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [27]:
# Three-sentence TextRank summary of the sample article.
textrank_summary(article0, num_summary_sentence=3)

["'Monday, January 26 will be the closest asteroid 2004 BL86 will get to Earth for at least the next 200 years,' said Don Yeomans, who is retiring as manager of NASA's Near Earth Object Program Office at the Jet Propulsion Laboratory in Pasadena, California, after 16 years in the position.",
 "'And while it poses no threat to Earth for the foreseeable future, it's a relatively close approach by a relatively large asteroid, so it provides us a unique opportunity to observe and learn more.'",
 "Nasa says 'While it poses no threat to Earth for the foreseeable future, it's a relatively close approach by a relatively large asteroid, so it provides us a unique opportunity to observe and learn more.'"]