In [1]:
# Import Required Libraries
# Standard library
import re

# Third-party packages
import nltk
import numpy as np
import pandas as pd

# Fetch the Punkt sentence tokenizer data (splits a text into a list of sentences)
nltk.download('punkt')  # only needs to be executed once

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the tennis articles from the CSV file (stored in Windows-1252 encoding)
# and preview the first rows.
df = pd.read_csv(
    "tennis_articles.csv",
    encoding='cp1252',
)
df.head()

Unnamed: 0,article_id,article_title,article_text,source
0,1,"I do not have friends in tennis, says Maria Sh...",Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,Federer defeats Medvedev to advance to 14th Sw...,"BASEL, Switzerland (AP) — Roger Federer advanc...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Tennis: Roger Federer ignored deadline set by ...,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Nishikori to face off against Anderson in Vien...,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,Roger Federer has made this huge change to ten...,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [3]:
# Show the full text of the article stored under index label 0
df.loc[0, 'article_text']

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net. So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same

In [4]:
# We now have two options: summarize each article individually, or generate a single
# summary covering all the articles.
# For our purpose, we will go ahead with the latter.

In [5]:
import numpy as np
# Split Text into Sentences
from nltk.tokenize import sent_tokenize  # returns a sentence-tokenized copy of a text

# Tokenize each article and collect every sentence into one flat list,
# preserving article order and sentence order within each article.
sentences = [
    sentence
    for article_text in df['article_text']
    for sentence in sent_tokenize(article_text)
]
sentences

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.",
 "So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players.",
 "I have not a lot of friends away from the courts.'",
 'When she said she is not really close to a lot of players, is that something strategic that she is doing?',
 "Is it different on the men's tour than the women's tour?",
 "'No, not at

In [6]:
# Text Preprocessing

# Replace every character that is not an ASCII letter (punctuation, digits and
# special characters) with a space, then lowercase the result.
# `regex=True` must be passed explicitly: since pandas 2.0, `Series.str.replace`
# treats the pattern as a literal string by default, so without it the
# character class would never match and the sentences would stay uncleaned.
clean_sentences = (
    pd.Series(sentences)
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .str.lower()
    .tolist()  # downstream cells expect a plain Python list of strings
)

clean_sentences

['maria sharapova has basically no friends as tennis players on the wta tour ',
 'the russian player has no problems in openly speaking about it and in a recent interview she said   i don t really hide any feelings too much ',
 'i think everyone knows this is my job here ',
 'when i m on the courts or when i m on the court playing  i m a competitor and i want to beat every single person whether they re in the locker room or across the net ',
 'so i m not the one to strike up a conversation about the weather and know that in the next few minutes i have to go and try to win a tennis match ',
 'i m a pretty competitive girl ',
 'i say my hellos  but i m not sending any players flowers as well ',
 'uhm  i m not really friendly or close to many players ',
 'i have not a lot of friends away from the courts  ',
 'when she said she is not really close to a lot of players  is that something strategic that she is doing ',
 'is it different on the men s tour than the women s tour ',
 ' no  not at

In [7]:
# Remove stop words

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


def remove_stopwords(sen):
    """Return the tokens of `sen` that are not in `stop_words`, joined by single spaces.

    `sen` is an iterable of word tokens; the result is one string.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


# Split each cleaned sentence on whitespace and drop its stop words
clean_sentences = [remove_stopwords(text.split()) for text in clean_sentences]

clean_sentences

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['maria sharapova basically friends tennis players wta tour',
 'russian player problems openly speaking recent interview said really hide feelings much',
 'think everyone knows job',
 'courts court playing competitor want beat every single person whether locker room across net',
 'one strike conversation weather know next minutes go try win tennis match',
 'pretty competitive girl',
 'say hellos sending players flowers well',
 'uhm really friendly close many players',
 'lot friends away courts',
 'said really close lot players something strategic',
 'different men tour women tour',
 '',
 'think sport mean friends everyone categorized tennis player going get along tennis players',
 'think every person different interests',
 'friends completely different jobs interests met different parts life',
 'think everyone thinks tennis players greatest friends',
 'ultimately tennis small part',
 'many things interested',
 'also read maria sharapova reveals tennis keeps motivated',
 'basel switzerl

In [16]:
# Vector Representation of Sentences

# Turn each cleaned sentence into a TF-IDF feature vector
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=784,  # upper bound on the vocabulary size kept by the vectorizer
)
# Learn the vocabulary/IDF weights and transform the sentences in one step
sentence_vectors = vectorizer.fit_transform(clean_sentences)

In [17]:
# Display the dimensions of the TF-IDF matrix returned by the vectorizer
sentence_vectors.shape

(130, 784)

In [18]:
# Build a table of the learned IDF weight of every vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back to the old name only on releases
# that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: the column holds `vectorizer.idf_` (inverse document frequencies); the
# column name "tfidf_weights" is kept unchanged for compatibility.
df_idf = pd.DataFrame(vectorizer.idf_, index=feature_names, columns=["tfidf_weights"])

# Show the five terms with the highest weight
df_idf.sort_values(by=['tfidf_weights'], ascending=False).head()

Unnamed: 0,tfidf_weights
lingering,5.18205
start,5.18205
spending,5.18205
sponsors,5.18205
sport,5.18205


In [19]:
# Number of rows in df_idf (it is indexed by the vectorizer's feature names)
len(df_idf)

784

In [20]:
# Similarity Matrix Preparation

# Allocate a square, zero-filled matrix with one row and one column per sentence
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

sim_mat

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# We use cosine similarity to measure how similar each pair of sentences is,
# and fill the similarity matrix with those scores.

from sklearn.metrics.pairwise import cosine_similarity

# Compute all pairwise similarities in a single vectorized call. This avoids
# one Python-level call per sentence pair and no longer hard-codes the feature
# count (784), which broke whenever the vocabulary had fewer terms than
# `max_features`.
sim_mat = cosine_similarity(sentence_vectors)

# A sentence must not be linked to itself in the graph, so clear the diagonal
# (the original loop skipped the i == j entries, leaving them at 0).
np.fill_diagonal(sim_mat, 0.0)

sim_mat

array([[0.        , 0.        , 0.        , ..., 0.04411288, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.04411288, 0.        , 0.        , ..., 0.        , 0.05873896,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05873896, 0.        ,
        0.0955938 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.0955938 ,
        0.        ]])

In [27]:
# Display the dimensions of the similarity matrix
sim_mat.shape

(130, 130)

In [23]:
# Applying PageRank Algorithm
# The similarity matrix sim_mat is first converted into a graph:
#   - each node stands for one sentence,
#   - each edge carries the similarity score between two sentences.
# Running PageRank on that graph yields the sentence rankings.

import networkx as nx
import matplotlib.pyplot as plt

# Build the graph from the matrix
nx_graph = nx.from_numpy_array(sim_mat)

# Score every node with PageRank
scores = nx.pagerank(nx_graph)



In [24]:
# Display the PageRank result computed for the sentence graph
scores

{0: 0.012120926272708476,
 1: 0.005466021217777897,
 2: 0.006192250205206941,
 3: 0.0048476527393588095,
 4: 0.009667733801509013,
 5: 0.0021069915019457375,
 6: 0.005982755919406462,
 7: 0.009197398880376463,
 8: 0.006445728346635655,
 9: 0.010992054956563633,
 10: 0.0048797880039023215,
 11: 0.0011691348402182388,
 12: 0.014122061474072066,
 13: 0.007807825903908312,
 14: 0.005717913023231399,
 15: 0.013399330413629304,
 16: 0.004784924261700597,
 17: 0.0028175029296035256,
 18: 0.006402062139390285,
 19: 0.01125257293510506,
 20: 0.009611897965242398,
 21: 0.012953279592834785,
 22: 0.007446877579837339,
 23: 0.0030009384455129346,
 24: 0.009028715471624182,
 25: 0.009428433789784649,
 26: 0.007788275722783704,
 27: 0.007639933030492206,
 28: 0.009523087107360554,
 29: 0.007853447222207394,
 30: 0.011199270161777629,
 31: 0.008999549788121002,
 32: 0.011143740325213591,
 33: 0.01036269953848649,
 34: 0.003369935733106482,
 35: 0.010853427429650949,
 36: 0.00808249600107831,
 37: 0.0

In [25]:
# Summary Extraction
# Pair every sentence with its PageRank score, then order the pairs from the
# highest score to the lowest so the top N can be taken as the summary.

ranked_sentences = [(scores[idx], sentence) for idx, sentence in enumerate(sentences)]
ranked_sentences.sort(reverse=True)
ranked_sentences

[(0.015654536358444597,
  "“I didn't serve very well [against first-round opponent Filip Kranjovic,” Federer said."),
 (0.014122061474072066,
  "I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players."),
 (0.013473666101305874,
  'He used his first break point to close out the first set before going up 3-0 in the second and wrapping up the win on his first match point.'),
 (0.013399330413629304,
  "I think everyone just thinks because we're tennis players we should be the greatest of friends."),
 (0.012953279592834785,
  'Federer dominated the 20th-ranked Medvedev and had his first match-point chance to break serve again at 5-1.'),
 (0.012944925448437495,
  'Two players, Stefanos Tsitsipas and Kyle Edmund, won their first career ATP titles last week (13:26).'),
 (0.012834510049476918,
  'The epic victory set-up his showdown with Federer

In [26]:
# Extract the top 10 sentences as the summary.
# Slicing (rather than indexing 0..9) avoids an IndexError when the corpus
# yields fewer than 10 sentences; in that case every ranked sentence is printed.
for _score, sentence in ranked_sentences[:10]:
    print(sentence)

“I didn't serve very well [against first-round opponent Filip Kranjovic,” Federer said.
I think just because you're in the same sport doesn't mean that you have to be friends with everyone just because you're categorized, you're a tennis player, so you're going to get along with tennis players.
He used his first break point to close out the first set before going up 3-0 in the second and wrapping up the win on his first match point.
I think everyone just thinks because we're tennis players we should be the greatest of friends.
Federer dominated the 20th-ranked Medvedev and had his first match-point chance to break serve again at 5-1.
Two players, Stefanos Tsitsipas and Kyle Edmund, won their first career ATP titles last week (13:26).
The epic victory set-up his showdown with Federer, who would go on to win his first Australian Open.
“I was on a nice trajectorythen,” Reid recalled.“If I hadn’t got sick, I think I could have started pushing towards the second week at the slams and then w