https://towardsdatascience.com/text-summarization-in-python-with-jaro-winkler-and-pagerank-72d693da94e8

In [40]:
import networkx as nx
import numpy as np
import pandas as pd
from rouge import Rouge
import spacy

import nltk
import jaro

nlp = spacy.load("en_core_web_md")

try:
    from nltk.corpus import stopwords
except:
    import nltk
    nltk.download('stopwords')
finally:
    from nltk.corpus import stopwords
    
# constants
sw = list(set(stopwords.words('english')))
punct = [
    '!','#','$','%','&','(',')','*',
    '+',',','-','/',':',';','<','=','>','@',
    '[','\\',']','^','_','`','{','|','}','~'
]

# try:
#     book = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')
# except:
#     nltk.download('gutenberg')
# finally:
#     book = nltk.corpus.gutenberg.raw('shakespeare-caesar.txt')

In [17]:
df = pd.read_csv("new_sum_1000_articles.csv")
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           1000 non-null   object
 1   content         1000 non-null   object
 2   summarizedText  999 non-null    object
 3   tf_idf_summary  1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB
None


In [12]:
def clean_text(text, sw = sw, punct = punct):
    '''
    This function will clean the input text by lowering, removing certain punctuations, stopwords and 
    new line tags.
    
    params:
        text (String) : The body of text you want to clean
        sw (List) : The list of stopwords you wish to removed from the input text
        punct (List) : The slist of punctuations you wish to remove from the input text
        
    returns:
        This function will return the input text after it's cleaned (the output will be a string) and 
        a dictionary mapping of the original sentences with its index
    '''
    article = text.lower()
    
    # clean punctuations
    for pun in punct:
        article = article.replace(pun, '')
    
    article = article.replace("[^a-zA-Z]", " ").replace('\r\n', ' ').replace('\n', ' ')
    original_text_mapping = {k:v for k,v in enumerate(article.split('. '))}
    
    article = article.split(' ')
    
    # clean stopwords
    article = [x.lstrip().rstrip() for x in article if x not in sw]
    article = [x for x in article if x]
    article = ' '.join(article)

    return original_text_mapping, article
  

# print(len(sentences))

In [13]:
def create_similarity_matrix(sentences):
    '''
    The purpose of this function will be to create an N x N similarity matrix.
    N represents the number of sentences and the similarity of a pair of sentences
    will be determined through the Jaro-Winkler Score.
    
    params:
        sentences (List -> String) : This is a list of strings you want to create
                                     the similarity matrix with.
     
    returns:
        This function will return a square numpy matrix
    '''
    
    # identify sentence similarity matrix with Jaro Winkler score
    sentence_length = len(sentences)
    sim_mat = np.zeros((sentence_length, sentence_length))

    for i in range(sentence_length):
        for j in range(sentence_length):
            if i != j:
                similarity = jaro.jaro_winkler_metric(sentences[i], sentences[j])
                sim_mat[i][j] = similarity
    return sim_mat



# print(ranked_sentences[0][0])

In [48]:
row = 0
def generate_summary(text, sum_percentage=50):
    global row
    row += 1
    print("row:\t", row)

    '''
    This function will generate the summary given a list of ranked sentences and the
    number of sentences the user wants in their summary.
    
    params:
        ranked_sentences (List -> Tuples) : The list of ranked sentences where each
                                            element is a tuple, the first value in the
                                            tuple is the sentence, the second value is
                                            the rank
        N (Integer) : The number of sentences the user wants in the summary
        
    returns:
        This function will return a string associated to the summarized ranked_sentences
        of a book
    '''
    original_text_mapping, processedText = clean_text(text)

# get sentences
    sentences = [x for x in processedText.split('. ') if x not in ['', ' ', '..', '.', '...']]  
    sim_mat = create_similarity_matrix(sentences)

    # create network
    # G = nx.from_numpy_matrix(sim_mat)
    G = nx.DiGraph(np.array(sim_mat))

    # calculate page rank scores
    pr_sentence_similarity = nx.pagerank(G)

    ranked_sentences = [
        (original_text_mapping[sent], rank) for sent,rank in sorted(pr_sentence_similarity.items(), key=lambda item: item[1], reverse = True)
    ]
    sum_length = round(len(sentences) * sum_percentage/100)
    summary = '. '.join([sent[0] for sent in ranked_sentences[0:sum_length]])
    return summary

  
# N = 2
# text = "As a kid, what did you want to be when you grew up and how does that inform what you do today. What’s one of the most important lessons you’ve learned in your career. What happened in your career that led you to Collective Health. How would you explain what you do to a 5-year-old. What excites you most about where Collective Health is going."
# summary = generate_summary(text)
# print("summary:\t",summary)

In [49]:
df['TextRank_Summary'] = df['content'].apply(generate_summary)

row:	 1
row:	 2
row:	 3
row:	 4
row:	 5
row:	 6
row:	 7
row:	 8
row:	 9
row:	 10
row:	 11
row:	 12
row:	 13
row:	 14
row:	 15
row:	 16
row:	 17
row:	 18
row:	 19
row:	 20
row:	 21
row:	 22
row:	 23
row:	 24
row:	 25
row:	 26
row:	 27
row:	 28
row:	 29
row:	 30
row:	 31
row:	 32
row:	 33
row:	 34
row:	 35
row:	 36
row:	 37
row:	 38
row:	 39
row:	 40
row:	 41
row:	 42
row:	 43
row:	 44
row:	 45
row:	 46
row:	 47
row:	 48
row:	 49
row:	 50
row:	 51
row:	 52
row:	 53
row:	 54
row:	 55
row:	 56
row:	 57
row:	 58
row:	 59
row:	 60
row:	 61
row:	 62
row:	 63
row:	 64
row:	 65
row:	 66
row:	 67
row:	 68
row:	 69
row:	 70
row:	 71
row:	 72
row:	 73
row:	 74
row:	 75
row:	 76
row:	 77
row:	 78
row:	 79
row:	 80
row:	 81
row:	 82
row:	 83
row:	 84
row:	 85
row:	 86
row:	 87
row:	 88
row:	 89
row:	 90
row:	 91
row:	 92
row:	 93
row:	 94
row:	 95
row:	 96
row:	 97
row:	 98
row:	 99
row:	 100
row:	 101
row:	 102
row:	 103
row:	 104
row:	 105
row:	 106
row:	 107
row:	 108
row:	 109
row:	 110
row:	 11

In [50]:
df.to_csv("new_sum_1000_articles_v2.csv",index=False)

In [51]:
df_2 = pd.read_csv("new_sum_1000_articles_v2.csv")
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             1000 non-null   object
 1   content           1000 non-null   object
 2   summarizedText    999 non-null    object
 3   tf_idf_summary    1000 non-null   object
 4   TextRank_Summary  990 non-null    object
dtypes: object(5)
memory usage: 39.2+ KB


In [52]:
df_2.head()

Unnamed: 0,title,content,summarizedText,tf_idf_summary,TextRank_Summary
0,Michael Flynn Was Paid to Represent Turkey’s I...,WASHINGTON — The candidate he was advising ...,WASHINGTON — The candidate he was advising ...,President Trump made the decision to pull the...,flynn positioned himself as someone willing to...
1,Hillary Clinton Crashes Tribeca Film Festival,NEW YORK (AP) — The premiere of a virtual r...,NEW YORK (AP) — The premiere of a virtual r...,President Trump made the decision to pull the...,new york ap — the premiere of a virtual rea...
2,Former PM David Cameron Resigns From Parliament,David Cameron announced he will resign from th...,David Cameron announced he will resign from th...,President Trump made the decision to pull the...,mr cameron also claimed it would be hard not b...
3,Gavin Newsom: Trump’s Election a ’Leap Backwar...,California Lt. Governor Gavin Newsom described...,Governor Gavin Newsom described the election o...,President Trump made the decision to pull the...,” the times admits that california has fewer l...
4,Trump to Louisiana Flood Victims: ’When You Hu...,"BATON ROUGE, Louisiana — In a heartfelt mom...","BATON ROUGE, Louisiana — In a heartfelt mom...",President Trump made the decision to pull the...,“and if the president can interrupt his vacati...


In [53]:
df.loc[0, 'content']

'WASHINGTON  —   The candidate he was advising last fall was running on a platform of America First. The client he was working for last fall was paying him more than $500, 000 to put Turkey first. Michael T. Flynn, who went from the campaign trail to the White House as President Trump’s first national security adviser, filed papers this week acknowledging that he worked as a foreign agent last year representing the interests of the Turkish government in a dispute with the United States. His surprising admission, coming more than four months after the election, raised further questions about the rise and fall of a presidential confidant who was forced to resign after 24 days in office for withholding the full story of his communications with Russia’s ambassador. Even now, out of government and out of favor, Mr. Flynn and his contact with foreign figures presented a new headache for a White House eager to move on. Mr. Flynn, a retired Army lieutenant general, registered as a lobbyist las

In [54]:

df_2['TextRank_Summary'] = df_2['TextRank_Summary'].astype(str)
df_2['summarizedText'] = df_2['summarizedText'].astype(str)

rouge = Rouge()
rouge.get_scores(df_2['TextRank_Summary'], df_2['summarizedText'], avg=True)

{'rouge-1': {'r': 0.5494947678840644,
  'p': 0.46628917299777184,
  'f': 0.4996188044810514},
 'rouge-2': {'r': 0.42100801255368564,
  'p': 0.3223717022963697,
  'f': 0.36021192030043686},
 'rouge-l': {'r': 0.5423149898213075,
  'p': 0.459844843855109,
  'f': 0.4928790169430374}}