# **Imports & NLTK downloads**

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
import math
import numpy as np
import pickle
from rouge import Rouge

In [2]:
from summarizer import Summarizer #From bert-extractive-summarizer (pip install bert-extractive-summarizer)

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/elsayed/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/elsayed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elsayed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Get the English stop words
stop_words = set(stopwords.words('english')) 
my_stopwords = stopwords.words()
# Create the stemmer
stemmer = PorterStemmer()

## **Importing dataset**

In [5]:
data = pd.read_csv('./archive/news_summary.csv', encoding='latin-1') #check encodings types
#more_data = pd.read_csv('news_summary_more.csv', encoding='latin-1')

# **Exploring the dataset**

## **news_summary dataset**


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4514 entries, 0 to 4513
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   author     4514 non-null   object
 1   date       4514 non-null   object
 2   headlines  4514 non-null   object
 3   read_more  4514 non-null   object
 4   text       4514 non-null   object
 5   ctext      4396 non-null   object
dtypes: object(6)
memory usage: 211.7+ KB


Nan values is found in the complete text 

In [7]:
data.head()

Unnamed: 0,author,date,headlines,read_more,text,ctext
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo..."
2,Arshiya Chopra,"03 Aug 2017,Thursday",'Virgin' now corrected to 'Unmarried' in IGIMS...,http://www.hindustantimes.com/patna/bihar-igim...,The Indira Gandhi Institute of Medical Science...,The Indira Gandhi Institute of Medical Science...
3,Sumedha Sehra,"03 Aug 2017,Thursday",Aaj aapne pakad liya: LeT man Dujana before be...,http://indiatoday.intoday.in/story/abu-dujana-...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Aarushi Maheshwari,"03 Aug 2017,Thursday",Hotel staff to get training to spot signs of s...,http://indiatoday.intoday.in/story/sex-traffic...,Hotels in Maharashtra will train their staff t...,Hotels in Mumbai and other Indian cities are t...


In [8]:
duplicateRows1 = data[data.duplicated(subset=['ctext'])]
print('complete text duplicates')
print(duplicateRows1)

complete text duplicates
                  author                   date  \
42          Chhavi Tyagi  02 Aug 2017,Wednesday   
190         Chhavi Tyagi     31 Jul 2017,Monday   
231   Niharika Prabhakar     31 Jul 2017,Monday   
286        Saloni Tandon   29 Jul 2017,Saturday   
368         Chhavi Tyagi     28 Jul 2017,Friday   
...                  ...                    ...   
4381        Chhavi Tyagi   04 Mar 2017,Saturday   
4423      Mansha Mahajan  01 Mar 2017,Wednesday   
4454     Abhishek Bansal    28 Feb 2017,Tuesday   
4500      Mansha Mahajan     24 Feb 2017,Friday   
4508        Tarun Khanna     24 Feb 2017,Friday   

                                              headlines  \
42    Rakshabandhan compulsory for employees in Dama...   
190   Sad we are debating lynching and not Digital I...   
231   Delhi woman alleges in-laws set her on fire, dies   
286   People count my failures, I don't, says Virat ...   
368   If JD(U) against corruption, why allied with d...   
...     

In [9]:
print(str(data[4283:4284]['text']))
print(str(data[4285:4286]['text']))

4283    Elections in Goa ended up in a hung Assembly, ...
Name: text, dtype: object
4285    Uttar Pradesh Chief Minister Akhilesh Yadav on...
Name: text, dtype: object


**duplicates and Nans are found in the complete text attribute wich will not be used. The `'ctex'`column will be dropped any way and it is okay to have different summaries and headlines for the same ctext.**

**still need to look at examples of these duplicates to make sure**

In [10]:
duplicateRows2 = data[data.duplicated(subset=['text'])]
print('summary text duplicates',duplicateRows2)


summary text duplicates Empty DataFrame
Columns: [author, date, headlines, read_more, text, ctext]
Index: []


In [11]:
duplicateRows3 = data[data.duplicated(subset=['headlines'])]
print('headlines duplicates',duplicateRows3)

headlines duplicates Empty DataFrame
Columns: [author, date, headlines, read_more, text, ctext]
Index: []


In [12]:
selected_features = data[['headlines','text']]
selected_features.head()

Unnamed: 0,headlines,text
0,Daman & Diu revokes mandatory Rakshabandhan in...,The Administration of Union Territory Daman an...
1,Malaika slams user who trolled her for 'divorc...,Malaika Arora slammed an Instagram user who tr...
2,'Virgin' now corrected to 'Unmarried' in IGIMS...,The Indira Gandhi Institute of Medical Science...
3,Aaj aapne pakad liya: LeT man Dujana before be...,Lashkar-e-Taiba's Kashmir commander Abu Dujana...
4,Hotel staff to get training to spot signs of s...,Hotels in Maharashtra will train their staff t...


In [13]:
selected_features.isnull().values.any()


False

**The small dataset is cleaned**

In [14]:
X = selected_features['text'].values
Y = selected_features['headlines'].values
type(X)

numpy.ndarray

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)
X_train.shape, X_test.shape

((3611,), (903,))

# **Extractive Models**

## TF-IDF based summurization

### Main Functions

In [16]:
def apply_preprocessing(text):
    original_sentences = sent_tokenize(text.lower())
    sentences = []
    # Sentences Pre-processing
    for sent in original_sentences:
        tokens = [item for item in word_tokenize(sent) if item not in my_stopwords and item != '.' and item.isalpha()] # Preprocessing
        sentences.append(' '.join([stemmer.stem(token) for token in tokens])) # Stemming
    return original_sentences, sentences

def word_tf(word,sentence):
    #tf_score =  sentence.count(word)/ len(sentence)
    return sentence.count(word)/ len(sentence)

def word_idf(word, sentences):
    return math.log10(' '.join(sentences).count(word)/ len(sentences))

def word_in_sentence_tfidf(word, sentence, sentences):
    return word_tf(word,sentence)*word_idf(word, sentences)

def sentence_score(sentence, sentences):
    return sum([word_in_sentence_tfidf(word, sentence, sentences) for word in sentence])

def sentences_scores(sentences):
    return [sentence_score(sent, sentences) for sent in sentences]

def get_best_k_sentences_indecies(k, sentences, original_sentences):
    #lista = sorted(list(np.argsort(sentences_scores(sentences))[-k:-1]))
    return sorted(list(np.argsort(sentences_scores(sentences))[-k:]))

def get_best_k_sentences(k, sentences, original_sentences):
    return [original_sentences[idx] for idx in get_best_k_sentences_indecies(k, sentences, original_sentences)]

def summarize(text, k):
    # Apply preprocessing
    original_sentences, sentences = apply_preprocessing(text)
    # Summariez
    return ' '.join(get_best_k_sentences(k, sentences, original_sentences))

### Experiment & Results

In [18]:
ext_summaries = []
n_sentences = 1
for txt in X_train:
    ext_summaries.append(summarize(txt, n_sentences))

In [21]:
# Take a look at the 
ext_summaries[0]

"indian super league side atletico de kolkata has been renamed to 'aamar, tomar kolkata' (atk) after the team's partnership with spanish football club atletico madrid ended."

## BERT Extractive Summarization

In [22]:
model = Summarizer() #Generate the summarizer object
bert_summaries = []
for txt in X_train:
    bert_summaries.append(model(txt, num_sentences = n_sentences))
# Get them in lowercase
bert_summaries = [x.lower() for x in bert_summaries]

In [47]:
# Save the BERT summaries as it took too much time..
with open('bert_summaries', 'wb') as f:
    pickle.dump(bert_summaries, f)

In [48]:
# Load the BERT summaries
with open('bert_summaries', 'rb') as f:
    bert_summaries = pickle.load(f)

In [49]:
# Have a look !
bert_summaries[0]

"indian super league side atletico de kolkata has been renamed to 'aamar, tomar kolkata' (atk) after the team's partnership with spanish football club atletico madrid ended."

## **Comparison with BERT**

In [55]:
rouge_metric = Rouge()
score = rouge_metric.get_scores(ext_summaries, bert_summaries, avg=True)
rouge1 = score.get('rouge-1')
print(rouge1)

{'f': 0.7357794890362269, 'p': 0.8368993562845303, 'r': 0.6861554970380593}


**We thought that our SCORE would be higher! Let's take a look at the sentences that don't match with BERT.**

In [67]:
for (x,y) in zip(bert_summaries, ext_summaries):
    if x != y:
        print('BERT: ', x)
        print('OURS: ', y)
        print('=========')

BERT:  prime minister narendra modi on monday accompanied his visiting australian counterpart malcolm turnbull on a metro ride in new delhi. the two prime ministers boarded the metro at the mandi house station and headed towards the akshardham temple. "
OURS:  prime minister narendra modi on monday accompanied his visiting australian counterpart malcolm turnbull on a metro ride in new delhi.
BERT:  actor randeep hooda visited the kargil war memorial in dras ahead of kargil vijay diwas, which is observed on july 26. "
OURS:  actor randeep hooda visited the kargil war memorial in dras ahead of kargil vijay diwas, which is observed on july 26.
BERT:  the two-day unesco natural heritage festival began on saturday at great himalayan national park in sai ropa, himachal pradesh. the wildlife institute of india said the festival will feature a media workshop on environmental journalism, discussions on natural and cultural heritage of the himalayan region and a heritage walk.
OURS:  the wildlif

BERT:  interpol has circulated a list of 173 islamic state fighters it believes could carry out suicide attacks in europe, in revenge for isis' military defeats. it is based on information captured during the offensive against isis in iraq and syria.
OURS:  interpol has circulated a list of 173 islamic state fighters it believes could carry out suicide attacks in europe, in revenge for isis' military defeats.
BERT:  foreign secretary s jaishankar has asked china not to give a political colour to india's aspiration to join the nuclear suppliers group (nsg). " keeping in mind...solidarity of major developing states, it is important that china views this as a developmental aspiration," he said.
OURS:  foreign secretary s jaishankar has asked china not to give a political colour to india's aspiration to join the nuclear suppliers group (nsg).
BERT:  jude sparks, a 10-year-old kid, accidentally discovered a rare 1.2 million-year-old skull of a stegomastodon, a prehistoric ancestor of elepha

OURS:  taapsee pannu opted out of an event she was supposed to be a part of after she learnt that it was being organised by a fairness cream.
BERT:  odisha agriculture minister pradeep maharathy on sunday said that farmers commit suicides only when the assembly is in session. "
OURS:  odisha agriculture minister pradeep maharathy on sunday said that farmers commit suicides only when the assembly is in session.
BERT:  the supreme court on friday dismissed a plea seeking its nod for terminating the 32-week-old pregnancy of a 10-year-old rape victim. the sc made the decision after taking note of a medical report stating that abortion was neither good for the girl nor the foetus.
OURS:  the sc made the decision after taking note of a medical report stating that abortion was neither good for the girl nor the foetus.
BERT:  a report by an independent bipartisan american body, sponsored by the us commission on international religious freedom (uscirf), has alleged that under modi regime, relig

BERT:  reacting to hollywood actor nicolas cage wearing traditional kazakhstani outfit at the 13th annual eurasia international festival, a user tweeted, "maybe it's just a wax mannequin posing as the real". " in even other news nicholas cage has no idea where he is, how he got there, or why he's wearing that," wrote another user.
OURS:  reacting to hollywood actor nicolas cage wearing traditional kazakhstani outfit at the 13th annual eurasia international festival, a user tweeted, "maybe it's just a wax mannequin posing as the real".
BERT:  congress on monday informed rajya sabha that raj babbar has been injured in police action against villagers protesting demolition of their houses in uttar pradesh. the house's deputy chairman dismissed the issue of babbar's safety and security saying the state government should deal with this.
OURS:  congress on monday informed rajya sabha that raj babbar has been injured in police action against villagers protesting demolition of their houses in u

BERT:  minister of state for home affairs hansraj ahir said on wednesday that the government is not considering a separate law to tackle mob lynching incidents, in response to congress leader digvijaya singh's question in the rajya sabha. "
OURS:  minister of state for home affairs hansraj ahir said on wednesday that the government is not considering a separate law to tackle mob lynching incidents, in response to congress leader digvijaya singh's question in the rajya sabha.
BERT:  sachin tendulkar, sourav ganguly and vvs laxman have written to committee of administrators head vinod rai, asking him to clarify that rahul dravid and zaheer khan were made consultants after discussion with ravi shastri. "
OURS:  sachin tendulkar, sourav ganguly and vvs laxman have written to committee of administrators head vinod rai, asking him to clarify that rahul dravid and zaheer khan were made consultants after discussion with ravi shastri.
BERT:  social media users have criticised the cover of the 1

OURS:  the indian railways is planning to induct about 40,000 coaches with refurbished interiors and upgraded facilities worth ?8,000 crore, to provide world-class comfort to passengers.
BERT:  actor sooraj pancholi has revealed that he is in a relationship and his current girlfriend is not from the film industry.
OURS:  sooraj was earlier dating late actress jiah khan, who committed suicide by hanging herself in the year 2013.
BERT:  zain ali, an ex-contestant on music reality show sa re ga ma pa, was found dead under mysterious circumstances at his friend's home in sheikhupura, pakistan on friday. zain's brother revealed he had a history of drug addiction but denied that an overdose could have been the cause.
OURS:  zain ali, an ex-contestant on music reality show sa re ga ma pa, was found dead under mysterious circumstances at his friend's home in sheikhupura, pakistan on friday.
BERT:  an exit poll conducted by india today-axis my india has predicted 202-220 seats for bjp in the 27

OURS:  over the last 2-3 years, with all the capital coming..., our entire industry, including ourselves, started making mistakes."
BERT:  according to imf's asia and pacific department's assistant director paul cashin, one way to characterise india's demonetisation is as a "vacuum cleaner". " it's sucking in cash, withdrawing it from the economy, and then the vacuum cleaner is going in reverse, slowly replacing cash," he stated.
OURS:  according to imf's asia and pacific department's assistant director paul cashin, one way to characterise india's demonetisation is as a "vacuum cleaner".
BERT:  ranveer singh chopped off his beard and long moustache on a live video on instagram.
OURS:  he did this as he had finished shooting for the older version of alauddin khilji in the film 'padmavati' and will be shooting for the character's younger version.
BERT:  the british parliament has passed a resolution condemning pakistan's move to declare gilgit-baltistan in pok as its fifth province. "
OU

BERT:  filmmaker imtiaz ali has revealed that when he first met shah rukh khan, a drunk girl was hanging on to him. "
OURS:  imtiaz further said, "gauri (srk's wife) was also with him and everybody was trying to deal with this crazy fan".
BERT:  west bengal chief minister mamata banerjee on friday said that prime minister narendra modi must go and be replaced by senior bjp leaders lk advani, arun jaitley or rajnath singh as the head of the government. "
OURS:  west bengal chief minister mamata banerjee on friday said that prime minister narendra modi must go and be replaced by senior bjp leaders lk advani, arun jaitley or rajnath singh as the head of the government.
BERT:  over 60 retired ias and ips officers have written an open letter to the narendra modi government asking it to enforce the rule of law and not allow vigilantism to grow. "
OURS:  over 60 retired ias and ips officers have written an open letter to the narendra modi government asking it to enforce the rule of law and no

BERT:  a us-based study has found brain's cognitive abilities may be significantly reduced in the mere presence of smartphone, even if it's switched off.
OURS:  in a series of tests requiring concentration, participants with their phones in another room outperformed those with their phones on the desk, and also those who had kept their phones in a pocket or bag, researchers said.
BERT:  actress shenaz treasury, while speaking about nepotism, said we live in a very classist society. " further, speaking about kangana ranaut who called filmmaker karan johar 'flagbearer of nepotism', shenaz said, "people like kangana have to work a million times harder than someone from a filmy family."
OURS:  further, speaking about kangana ranaut who called filmmaker karan johar 'flagbearer of nepotism', shenaz said, "people like kangana have to work a million times harder than someone from a filmy family."
BERT:  singer lata mangeshkar has revealed that in early 1950s, playback singers were called 'ghos

BERT:  union home minister rajnath singh on monday said that no one had taken care of the north east even after several decades since independence. "
OURS:  "and now prime minister narendra modi had rechristened the look east policy as act east policy to fast track development in the region after years of negligence," rajnath added.
BERT:  khadi india has threatened to sue ethnic-wear brand fabindia, for indulging in "unfair trade practice" by unauthorisedly selling cotton products under its registered brand name 'khadi'. fabindia has been asked to immediately stop using 'khadi' from all cotton products and remove display banners from showrooms.
OURS:  khadi india has threatened to sue ethnic-wear brand fabindia, for indulging in "unfair trade practice" by unauthorisedly selling cotton products under its registered brand name 'khadi'.
BERT:  a 23-year-old man has been arrested for sending obscene messages to television actress sonarika bhadoria. she revealed that the man used 25 differ

OURS:  the dubai police have officially inducted the world's first operational robot police officer into the force.
BERT:  the unique identification authority of india has launched 'maadhaar', a new app for syncing aadhaar data on mobile phones. the app allows users to carry their aadhaar information including name, date of birth, gender, address, and the linked photograph, on their smartphones.
OURS:  the app allows users to carry their aadhaar information including name, date of birth, gender, address, and the linked photograph, on their smartphones.
BERT:  shiv sena mp anandrao adsul has compared ravindra gaikwad thrashing an air india employee with an in-flight incident involving tv show host kapil sharma. " even kapil sharma misbehaved on the flight after getting drunk, but no ban imposed on him," adsul said.
OURS:  shiv sena mp anandrao adsul has compared ravindra gaikwad thrashing an air india employee with an in-flight incident involving tv show host kapil sharma.
BERT:  mahara

**Well! Our summarizer effecieny/score is much more higher! The differences is maiinoly due to different pre-procssing done for both summarizer's input.**
As an example: In the first mismatched senetences: The pre-processing in BERT didn't separate 2 sentences based on the period, hence it came out with 2 senetences not 1.