In [None]:
# Pin transformers to 3.0.2: bert_features() below calls the tokenizer with
# 3.0.x-era arguments (e.g. pad_to_max_length) -- NOTE(review): re-verify that
# call before bumping this version.
!pip install transformers==3.0.2

Collecting transformers==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |▍                               | 10kB 21.0MB/s eta 0:00:01[K     |▉                               | 20kB 6.2MB/s eta 0:00:01[K     |█▎                              | 30kB 8.7MB/s eta 0:00:01[K     |█▊                              | 40kB 8.5MB/s eta 0:00:01[K     |██▏                             | 51kB 7.1MB/s eta 0:00:01[K     |██▋                             | 61kB 8.0MB/s eta 0:00:01[K     |███                             | 71kB 8.3MB/s eta 0:00:01[K     |███▍                            | 81kB 8.6MB/s eta 0:00:01[K     |███▉                            | 92kB 9.5MB/s eta 0:00:01[K     |████▎                           | 102kB 9.3MB/s eta 0:00:01[K     |████▊                           | 112kB 9.3MB/s eta 0:00:01[K     |█████▏                          | 122

In [None]:
import numpy as np
import pandas as pd
import nltk
import sys
# Fetch the Punkt sentence-tokenizer data (the captured output shows it is
# stored under /root/nltk_data).
nltk.download('punkt')
import transformers
import torch
# Prefer the first CUDA device when one is visible, otherwise run on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load the pretrained uncased BERT base model, move it to `device`, and switch
# it to eval mode; it is only used for inference in this notebook.
model=transformers.BertModel.from_pretrained('bert-base-uncased').to(device).eval()
# Matching tokenizer for the same checkpoint.
tokenizer=transformers.BertTokenizer.from_pretrained('bert-base-uncased')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
def get_tfidf_features(paragraphs):
    """Fit an independent TF-IDF model on every paragraph.

    A fresh ``TfidfVectorizer`` is fitted per paragraph, treating each
    sentence as a document, so the vocabulary (and therefore the number of
    feature columns) differs from one paragraph to the next.

    Parameters
    ----------
    paragraphs : iterable of sequence of str
        Each element is one paragraph given as a list of sentences.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object``; element ``i`` is the dense
        ``(n_sentences, vocabulary_size)`` TF-IDF matrix of paragraph ``i``.

    Side effects
    ------------
    Prints the min / mean / max vocabulary size over all paragraphs
    (skipped when ``paragraphs`` is empty).
    """
    print("Extracting tf-idf features")
    all_paragraph_features=[]
    lengths=[]
    for paragraph in paragraphs:
        # A new vectorizer per paragraph: IDF statistics are local to it.
        vectorizer = TfidfVectorizer()
        curr_features=vectorizer.fit_transform(paragraph)
        # The column count of the fitted matrix is the vocabulary size; this
        # avoids get_feature_names(), which newer scikit-learn releases dropped.
        lengths.append(curr_features.shape[1])
        all_paragraph_features.append(curr_features.toarray())
    if lengths:
        lengths=np.array(lengths)
        print("Min length of dictionary: {}".format(np.amin(lengths)))
        print("Avg length of dictionary: {}".format(np.mean(lengths)))
        print("Max length of dictionary: {}".format(np.amax(lengths)))
    # The per-paragraph matrices are ragged, so build the object array
    # explicitly: np.asarray() on a ragged list is an error on recent NumPy and
    # behaves inconsistently on older versions.
    features=np.empty(len(all_paragraph_features),dtype=object)
    for idx,matrix in enumerate(all_paragraph_features):
        features[idx]=matrix
    return features

In [None]:
# Report which accelerator is in use. torch.cuda.get_device_name(0) raises on
# a machine without CUDA, so fall back to a plain label to keep the notebook
# runnable on CPU (the device selection above already supports that case).
print("GPU/CPU:",torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
def bert_features(paragraph):
    """Encode every sentence of one paragraph with BERT.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    paragraph : iterable of str
        The sentences of a single paragraph.

    Returns
    -------
    The second element of the model output (named ``pooled_output`` in the
    original code), computed under ``torch.no_grad()`` for all sentences in
    one forward pass.
    """
    max_seq_length=500

    # Word-piece tokens per sentence, then their vocabulary ids.
    token_lists=[tokenizer.tokenize(sentence) for sentence in paragraph]
    batch_tokens=[tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]

    # Pad every sentence to the longest one (capped), plus 3 extra positions
    # -- presumably headroom for the special tokens the tokenizer adds.
    longest_seq=max([len(tokens) for tokens in token_lists]+[0])
    pad_seq_length=min(longest_seq,max_seq_length)+3

    # Collect each model input (one entry per sentence) under its name.
    features={}
    for token_ids in batch_tokens:
        encoded=tokenizer.prepare_for_model(token_ids,max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt',truncation=True)
        for feature_name in encoded:
            features.setdefault(feature_name,[]).append(encoded[feature_name])

    # Concatenate the per-sentence tensors into one batch on the target device.
    for feature_name in features:
        features[feature_name]=torch.cat(features[feature_name]).to(device)

    with torch.no_grad():
        model_outputs=model(**features)
    return model_outputs[1]
    
def get_bert_features(paragraphs):
    """Compute BERT sentence features for every paragraph.

    Parameters
    ----------
    paragraphs : sequence of sequence of str
        Each element is one paragraph given as a list of sentences. Any sized
        sequence works (list or NumPy array).

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object``; element ``i`` is whatever
        ``bert_features`` returned for paragraph ``i`` (left untouched, on its
        original device).

    Side effects
    ------------
    Writes a single, continuously overwritten progress line to stdout.
    """
    total=len(paragraphs)
    all_paragraph_features=[]
    for i,paragraph in enumerate(paragraphs):
        curr_features=bert_features(paragraph)
        sys.stdout.write("\rExtracted features for {} of {}".format(i+1,total))
        sys.stdout.flush()
        all_paragraph_features.append(curr_features)
    # Store the per-paragraph results as opaque objects. Letting np.asarray()
    # inspect a list of tensors makes NumPy try to convert them (which fails
    # for CUDA tensors or ragged shapes, depending on the NumPy version).
    features=np.empty(len(all_paragraph_features),dtype=object)
    for idx,paragraph_features in enumerate(all_paragraph_features):
        features[idx]=paragraph_features
    return features

GPU/CPU: Tesla T4


In [None]:
from nltk.cluster.util import cosine_distance
def build_similarity_matrix(paragraph_features,is_tensor):
    """Pairwise cosine similarity between the sentences of one paragraph.

    Parameters
    ----------
    paragraph_features : array-like of shape (n_sentences, n_features)
        One feature vector per sentence. When ``is_tensor`` is true this is an
        object exposing ``.cpu().numpy()`` (e.g. a torch tensor).
    is_tensor : bool
        Whether ``paragraph_features`` must first be moved to the CPU and
        converted to a NumPy array.

    Returns
    -------
    numpy.ndarray
        ``(n_sentences, n_sentences)`` float64 matrix of cosine similarities
        with a zero diagonal (a sentence is not compared with itself).

    Notes
    -----
    A sentence whose feature vector is all zeros (for instance one that
    contains no token the vectorizer keeps) gets similarity 0 to every other
    sentence. Dividing by its zero norm would instead yield NaN, which then
    poisons the PageRank computation for the whole paragraph.
    """
    if is_tensor:
        paragraph_features=paragraph_features.cpu().numpy()
    features=np.asarray(paragraph_features,dtype=np.float64)
    norms=np.linalg.norm(features,axis=1)
    # Replace zero norms by 1 so zero vectors stay zero after normalisation.
    safe_norms=np.where(norms>0,norms,1.0)
    unit_vectors=features/safe_norms[:,np.newaxis]
    # Dot products of unit vectors are the cosine similarities.
    similarity_matrix=unit_vectors@unit_vectors.T
    np.fill_diagonal(similarity_matrix,0.0)
    return similarity_matrix

import networkx as nx
def generate_summaries(paragraphs,features_paragraphs,is_tensor=False,top_n=3):
    """Build an extractive summary of every paragraph with PageRank.

    For each paragraph a sentence-similarity graph is built from its features,
    sentences are scored with PageRank, and the ``top_n`` best sentences are
    emitted in their original order.

    Parameters
    ----------
    paragraphs : sequence of sequence of str
        Each element is one paragraph given as a list of sentences.
    features_paragraphs : iterable
        Per-paragraph sentence features, aligned with ``paragraphs``.
    is_tensor : bool, optional
        Forwarded to ``build_similarity_matrix``.
    top_n : int, optional
        Number of sentences kept per summary (default 3). Paragraphs with
        fewer sentences are summarised by all of their sentences.

    Returns
    -------
    list of tuple
        ``(full_text, summary_text, similarity_matrix)`` per summarised
        paragraph. Paragraphs for which PageRank does not converge are
        skipped, so positions in this list do NOT necessarily match positions
        in ``paragraphs``.

    Side effects
    ------------
    Writes a progress line to stdout and prints the number of skipped
    paragraphs at the end.
    """
    text_summaries=[]
    cnt=0
    total=len(paragraphs)
    for i,paragraph_features in enumerate(features_paragraphs):
        sys.stdout.write("\rGenerating summary for {} of {}".format(i+1,total))
        sys.stdout.flush()
        similarity_matrix=build_similarity_matrix(paragraph_features,is_tensor)
        sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
        # Only a genuine convergence failure is treated as "skip this
        # paragraph"; any other exception is a real bug and must surface
        # instead of being silently counted here.
        try:
            scores = nx.pagerank(sentence_similarity_graph,max_iter=100)
        except nx.PowerIterationFailedConvergence:
            cnt+=1
            continue
        sentences=np.array(paragraphs[i])
        # Highest score first; ties are broken by the higher sentence index.
        ranked_sentence = sorted(((scores[j],j) for j in range(len(sentences))), reverse=True)
        # Slicing (rather than indexing 0..top_n-1) copes with short paragraphs.
        summary_indices=[j for _,j in ranked_sentence[:top_n]]
        # Restore document order so the summary reads naturally.
        out_indices=sorted(summary_indices)
        text_summaries.append(("".join(sentences),"".join(sentences[out_indices]),similarity_matrix))
    print("Pagerank didn't converge for {} paragraphs".format(cnt))
    return text_summaries

In [None]:
#DATASET-1
#Loading and cleaning
def _as_object_array(items):
    """Pack arbitrary Python objects into a 1-D object array, one per slot.

    np.array() on lists of differing lengths is an error on recent NumPy, and
    on equal-length lists it would silently build a 2-D array instead.
    """
    packed=np.empty(len(items),dtype=object)
    for idx,item in enumerate(items):
        packed[idx]=item
    return packed

data=pd.read_csv('news_summary.csv',encoding='iso-8859-1')
data_paragraphs=data['ctext']
all_paragraphs=[]
cnt=0
for paragraph in data_paragraphs:
    # Rows without usable article text (presumably missing values read as NaN)
    # are not strings and cannot be tokenized; count and skip them explicitly
    # rather than hiding every possible tokenizer error behind a bare except.
    if not isinstance(paragraph,str):
        cnt+=1
        continue
    all_paragraphs.append(nltk.sent_tokenize(paragraph))
print("{} paragraphs omitted".format(cnt))
all_paragraphs=_as_object_array(all_paragraphs)
print("{} paragraphs parsed".format(all_paragraphs.shape[0]))
lengths=[]  # not used in this cell; kept in case another cell relies on it
# Keep only paragraphs whose sentence count lies in [min_thres, max_thres].
min_thres=3
max_thres=35
paragraphs=[paragraph for paragraph in all_paragraphs if min_thres<=len(paragraph)<=max_thres]
print("{} paragraphs in dataset after applying length constraint".format(len(paragraphs)))
paragraphs=_as_object_array(paragraphs)

118 paragraphs omitted
4396 paragraphs parsed
3878 paragraphs in dataset after applying length constraint


In [None]:
#Visualizing
# Show the first paragraph with exactly min_thres sentences, a blank
# separator, then the first paragraph with exactly max_thres sentences.
for position,wanted_len in enumerate((min_thres,max_thres)):
    if position:
        print("\n")
    for paragraph in paragraphs:
        if len(paragraph)!=wanted_len:
            continue
        print(paragraph)
        break

['President Donald Trump?s son-in-law says the Trump campaign couldn?t have colluded with Russia because the team was too dysfunctional and disorganised to coordinate with a foreign government.Jared Kushner, a senior adviser to the president, made the comment Monday during a closed-door session with congressional interns.A Democratic congressional aide says Kushner was responding to a question about Special Counsel Robert Mueller?s investigation into whether the Trump campaign colluded with Moscow.ForeignPolicy.com first reported Kushner?s remarks.', 'The aide was knowledgeable of the meeting and confirmed the accuracy of the comments.Last week, Kushner met privately at the Capitol with members of the Senate and House intelligence committees.He acknowledged four meetings with Russians during and after Trump?s victorious White House bid and insisted that he had ?nothing to hide.', '?']


['It took him five years and lot of courage to openly speak about his battle with drug addiction.', 

In [None]:
#TF-IDF FEATURES AND SUMMARIES

In [None]:
# One TF-IDF model is fitted per paragraph, so each entry of features_tfidf
# is that paragraph's own sentence-by-term matrix.
features_tfidf=get_tfidf_features(paragraphs)

Extracting tf-idf features
Min length of dictionary: 34
Avg length of dictionary: 170.2885507993811
Max length of dictionary: 538


In [None]:
# Each entry is (full text, summary, similarity matrix). Paragraphs that fail
# inside generate_summaries are skipped, so indices into tfidf_summaries do
# not necessarily match indices into `paragraphs`.
tfidf_summaries=generate_summaries(paragraphs,features_tfidf)

Generating summary for 18 of 3878

  sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))


Generating summary for 3878 of 3878Pagerank didn't converge for 218 paragraphs


In [None]:
###BERT FEATURES AND SUMMARIES

In [None]:
# Runs BERT once per paragraph (all of its sentences in one batch).
features_bert=get_bert_features(paragraphs)

Extracted features for 3878 of 3878

In [None]:
# is_tensor=True: each paragraph's features are moved to the CPU and converted
# to NumPy before the similarity matrix is built. As with the TF-IDF run,
# skipped paragraphs shift the indices of later entries.
bert_summaries=generate_summaries(paragraphs,features_bert,is_tensor=True)

Generating summary for 3878 of 3878Pagerank didn't converge for 37 paragraphs


In [None]:
###ANALYSIS

In [None]:
# (full text, summary) of the first BERT-summarised paragraph; the slice drops
# the similarity matrix stored as the third tuple element.
bert_summaries[0][:2]

('The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7.In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,?the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ?one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ?were issued by the Daman a

In [None]:
# Same view for the first TF-IDF-summarised paragraph.
tfidf_summaries[0][:2]

('The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.The union territory?s administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.?It has been decided to celebrate the festival of Rakshabandhan on August 7.In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues,?the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.The two notifications ?one mandating the celebration of Rakshabandhan (left) and the other withdrawing the mandate (right) ?were issued by the Daman a

In [None]:
# Print the index (into `paragraphs`) of every paragraph with exactly six
# sentences, to pick short examples for manual comparison.
for i in range(paragraphs.shape[0]):
    if len(paragraphs[i])==6:
        print(i)

22
27
28
86
132
148
181
236
248
256
276
289
302
368
382
383
433
447
462
468
478
516
526
532
555
560
566
569
629
648
683
708
714
750
882
905
926
944
950
956
963
968
974
983
1013
1016
1023
1028
1043
1048
1056
1059
1061
1070
1097
1110
1115
1184
1193
1198
1199
1233
1254
1277
1278
1299
1301
1373
1377
1393
1427
1439
1446
1457
1470
1471
1479
1486
1495
1511
1512
1545
1546
1570
1581
1587
1592
1598
1603
1624
1629
1645
1656
1666
1674
1694
1695
1715
1718
1724
1746
1758
1779
1783
1796
1799
1902
1983
1992
2005
2009
2016
2023
2025
2069
2103
2106
2163
2195
2225
2231
2235
2243
2245
2272
2292
2328
2336
2350
2369
2371
2374
2391
2424
2443
2452
2482
2512
2526
2527
2529
2548
2555
2560
2563
2582
2606
2614
2641
2650
2675
2680
2697
2709
2715
2716
2752
2754
2767
2779
2820
2862
2865
2866
2874
2877
2878
2880
2889
2895
2901
2904
2916
2919
2926
2929
2945
2954
2956
2989
3001
3005
3008
3021
3028
3036
3040
3042
3056
3059
3068
3073
3093
3116
3129
3147
3157
3177
3181
3188
3189
3202
3215
3220
3230
3232
3247
3253
3259
326

In [None]:
# NOTE: this is an index into bert_summaries, not into `paragraphs`; the two
# can differ because paragraphs skipped during summarisation leave no entry.
bert_summaries[27][:2]

('The remains of a German hiker who disappeared while climbing in the Swiss Alps 30 years ago has been found embedded in a glacier, police said on Wednesday.The find was made on July 25 by two people climbing the Lagginhorn mountain, in southern Switzerland, police said in a statement.A few hundred metres before reaching the peak, the hikers spotted ?a hand and two shoes?, area police said.Extracting the remains that day was impossible because of poor weather, but rescue workers arrived by helicopter the following day, removed the remains from the ice and brought them to a hospital in the capital Bern for identification.Test have confirmed the climber was a German national born in 1943 who disappeared on August 11, 1987.The discovery came just two days after a Swiss couple who disappeared while walking in the Alps in 1942 were found in a receding glacier.',
 'The find was made on July 25 by two people climbing the Lagginhorn mountain, in southern Switzerland, police said in a statement

In [None]:
# Index 25 here shows the same source paragraph as bert_summaries[27] (compare
# the printed full texts): the TF-IDF run skipped more of the earlier
# paragraphs, so its list is shifted relative to the BERT one.
tfidf_summaries[25][:2]

('The remains of a German hiker who disappeared while climbing in the Swiss Alps 30 years ago has been found embedded in a glacier, police said on Wednesday.The find was made on July 25 by two people climbing the Lagginhorn mountain, in southern Switzerland, police said in a statement.A few hundred metres before reaching the peak, the hikers spotted ?a hand and two shoes?, area police said.Extracting the remains that day was impossible because of poor weather, but rescue workers arrived by helicopter the following day, removed the remains from the ice and brought them to a hospital in the capital Bern for identification.Test have confirmed the climber was a German national born in 1943 who disappeared on August 11, 1987.The discovery came just two days after a Swiss couple who disappeared while walking in the Alps in 1942 were found in a receding glacier.',
 'The remains of a German hiker who disappeared while climbing in the Swiss Alps 30 years ago has been found embedded in a glacier