In [18]:
import networkx as nx
import numpy as np
import pandas as pd
from rouge import Rouge
import spacy
import itertools
import nltk
import jaro
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hostage/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Load the spaCy model for English language processing
nlp = spacy.load("en_core_web_md")

# Creat lists containing stopWords and punctations to remove remove
stopWords_list = list(set(stopwords.words('english')))
punct_list = [
    '!','#','$','%','&','(',')','*',
    '+',',','-','/',':',';','<','=','>','@',
    '[','\\',']','^','_','`','{','|','}','~'
]

In [20]:
# Performing text preprocessing
def preprocess_text(text, sw = stopWords_list, punct = punct_list):
    processed_text = text.lower()
    for pun in punct:
        processed_text = processed_text.replace(pun, '')
    processed_text = processed_text.replace("[^a-zA-Z]", " ").replace('\r\n', ' ').replace('\n', ' ')
    original_text_mapping = dict(enumerate(processed_text.split('. ')))
    processed_text = processed_text.split(' ')
    processed_text = [x.lstrip().rstrip() for x in processed_text if x not in sw]
    processed_text = [x for x in processed_text if x]
    processed_text = ' '.join(processed_text)

    return original_text_mapping, processed_text

In [21]:
# Create the similarity matrix based on the Jaro-Winkler score 
def create_similarity_matrix(sentences):
    sentence_length = len(sentences)
    similarity_matrix = np.zeros((sentence_length, sentence_length))

    for i, j in itertools.product(range(sentence_length), range(sentence_length)):
        if i != j:
            similarity = jaro.jaro_winkler_metric(sentences[i], sentences[j])
            similarity_matrix[i][j] = similarity
    return similarity_matrix

In [22]:
# Create the summary for the given text
def generate_summary(text, sum_percentage=50):
    original_text_mapping, processedText = preprocess_text(text)
    sentences = [x for x in processedText.split('. ') if x not in ['', ' ', '..', '.', '...']]
    sim_mat = create_similarity_matrix(sentences)
    GRAPH = nx.DiGraph(np.array(sim_mat))
    
    # calculate page rank scores
    pr_sentence_similarity = nx.pagerank(GRAPH)
    ranked_sentences = [(original_text_mapping[sent], rank) for sent,rank in sorted(pr_sentence_similarity.items(), key=lambda item: item[1], reverse = True)]
    sum_length = round(len(sentences) * sum_percentage/100)

    return '. '.join([sent[0] for sent in ranked_sentences[:sum_length]])


text = "WASHINGTON - The Trump administration has ordered the military to start withdrawing roughly 7,000 troops from Afghanistan in the coming months, two defense officials said Thursday, an abrupt shift in the 17-year-old war there and a decision that stunned Afghan officials, who said they had not been briefed on the plans. President Trump made the decision to pull the troops - about half the number the United States has in Afghanistan now - at the same time he decided to pull American forces out of Syria, one official said. The announcement came hours after Jim Mattis, the secretary of defense, said that he would resign from his position at the end of February after disagreeing with the president over his approach to policy in the Middle East. The whirlwind of troop withdrawals and the resignation of Mr. Mattis leave a murky picture for what is next in the United States’ longest war, and they come as Afghanistan has been troubled by spasms of violence afflicting the capital, Kabul, and other important areas.  The United States has also been conducting talks with representatives of the Taliban, in what officials have described as discussions that could lead to formal talks to end the conflict. Senior Afghan officials and Western diplomats in Kabul woke up to the shock of the news on Friday morning, and many of them braced for chaos ahead.  Several Afghan officials, often in the loop on security planning and decision-making, said they had received no indication in recent days that the Americans would pull troops out.  The fear that Mr. Trump might take impulsive actions, however, often loomed in the background of discussions with the United States, they said. They saw the abrupt decision as a further sign that voices from the ground were lacking in the debate over the war and that with Mr. Mattis’s resignation, Afghanistan had lost one of the last influential voices in Washington who channeled the reality of the conflict into the White House’s deliberations. The president long campaigned on bringing troops home, but in 2017, at the request of Mr. Mattis, he begrudgingly pledged an additional 4,000 troops to the Afghan campaign to try to hasten an end to the conflict. Though Pentagon officials have said the influx of forces - coupled with a more aggressive air campaign - was helping the war effort, Afghan forces continued to take nearly unsustainable levels of casualties and lose ground to the Taliban. The renewed American effort in 2017 was the first step in ensuring Afghan forces could become more independent without a set timeline for a withdrawal.  But with plans to quickly reduce the number of American troops in the country, it is unclear if the Afghans can hold their own against an increasingly aggressive Taliban. Currently, American airstrikes are at levels not seen since the height of the war, when tens of thousands of American troops were spread throughout the country.  That air support, officials say, consists mostly of propping up Afghan troops while they try to hold territory from a resurgent Taliban."
summary = generate_summary(text, 60)
print("summary:\t",summary)

summary:	 president trump made the decision to pull the troops  about half the number the united states has in afghanistan now  at the same time he decided to pull american forces out of syria one official said.  several afghan officials often in the loop on security planning and decisionmaking said they had received no indication in recent days that the americans would pull troops out. mattis leave a murky picture for what is next in the united states’ longest war and they come as afghanistan has been troubled by spasms of violence afflicting the capital kabul and other important areas. the announcement came hours after jim mattis the secretary of defense said that he would resign from his position at the end of february after disagreeing with the president over his approach to policy in the middle east. senior afghan officials and western diplomats in kabul woke up to the shock of the news on friday morning and many of them braced for chaos ahead. currently american airstrikes are at

## Evaluation of the approach

In [2]:
df = pd.read_csv("final_evaluation_dataset.csv")
print(df.info())

In [49]:
df['TextRank_Summary'] = df['content'].apply(generate_summary)

row:	 1
row:	 2
row:	 3
row:	 4
row:	 5
row:	 6
row:	 7
row:	 8
row:	 9
row:	 10
row:	 11
row:	 12
row:	 13
row:	 14
row:	 15
row:	 16
row:	 17
row:	 18
row:	 19
row:	 20
row:	 21
row:	 22
row:	 23
row:	 24
row:	 25
row:	 26
row:	 27
row:	 28
row:	 29
row:	 30
row:	 31
row:	 32
row:	 33
row:	 34
row:	 35
row:	 36
row:	 37
row:	 38
row:	 39
row:	 40
row:	 41
row:	 42
row:	 43
row:	 44
row:	 45
row:	 46
row:	 47
row:	 48
row:	 49
row:	 50
row:	 51
row:	 52
row:	 53
row:	 54
row:	 55
row:	 56
row:	 57
row:	 58
row:	 59
row:	 60
row:	 61
row:	 62
row:	 63
row:	 64
row:	 65
row:	 66
row:	 67
row:	 68
row:	 69
row:	 70
row:	 71
row:	 72
row:	 73
row:	 74
row:	 75
row:	 76
row:	 77
row:	 78
row:	 79
row:	 80
row:	 81
row:	 82
row:	 83
row:	 84
row:	 85
row:	 86
row:	 87
row:	 88
row:	 89
row:	 90
row:	 91
row:	 92
row:	 93
row:	 94
row:	 95
row:	 96
row:	 97
row:	 98
row:	 99
row:	 100
row:	 101
row:	 102
row:	 103
row:	 104
row:	 105
row:	 106
row:	 107
row:	 108
row:	 109
row:	 110
row:	 11

In [50]:
df.to_csv("new_sum_1000_articles_v2.csv",index=False)

In [51]:
df_2 = pd.read_csv("new_sum_1000_articles_v2.csv")
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   title             1000 non-null   object
 1   content           1000 non-null   object
 2   summarizedText    999 non-null    object
 3   tf_idf_summary    1000 non-null   object
 4   TextRank_Summary  990 non-null    object
dtypes: object(5)
memory usage: 39.2+ KB


In [52]:
df_2.head()

Unnamed: 0,title,content,summarizedText,tf_idf_summary,TextRank_Summary
0,Michael Flynn Was Paid to Represent Turkey’s I...,WASHINGTON — The candidate he was advising ...,WASHINGTON — The candidate he was advising ...,President Trump made the decision to pull the...,flynn positioned himself as someone willing to...
1,Hillary Clinton Crashes Tribeca Film Festival,NEW YORK (AP) — The premiere of a virtual r...,NEW YORK (AP) — The premiere of a virtual r...,President Trump made the decision to pull the...,new york ap — the premiere of a virtual rea...
2,Former PM David Cameron Resigns From Parliament,David Cameron announced he will resign from th...,David Cameron announced he will resign from th...,President Trump made the decision to pull the...,mr cameron also claimed it would be hard not b...
3,Gavin Newsom: Trump’s Election a ’Leap Backwar...,California Lt. Governor Gavin Newsom described...,Governor Gavin Newsom described the election o...,President Trump made the decision to pull the...,” the times admits that california has fewer l...
4,Trump to Louisiana Flood Victims: ’When You Hu...,"BATON ROUGE, Louisiana — In a heartfelt mom...","BATON ROUGE, Louisiana — In a heartfelt mom...",President Trump made the decision to pull the...,“and if the president can interrupt his vacati...


In [54]:

df_2['TextRank_Summary'] = df_2['TextRank_Summary'].astype(str)
df_2['summarizedText'] = df_2['summarizedText'].astype(str)

rouge = Rouge()
rouge.get_scores(df_2['TextRank_Summary'], df_2['summarizedText'], avg=True)

{'rouge-1': {'r': 0.5494947678840644,
  'p': 0.46628917299777184,
  'f': 0.4996188044810514},
 'rouge-2': {'r': 0.42100801255368564,
  'p': 0.3223717022963697,
  'f': 0.36021192030043686},
 'rouge-l': {'r': 0.5423149898213075,
  'p': 0.459844843855109,
  'f': 0.4928790169430374}}