In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
# Fetch the 'punkt' data package for NLTK; the log below shows this is a no-op
# when the package is already present locally.
# NOTE(review): presumably needed by sent_tokenize further down; newer NLTK
# releases may look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')
import re

# importing the libraries we'll be using (`re` is not referenced by any cell visible here)

[nltk_data] Downloading package punkt to /Users/apple/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Reading the dataset: a CSV expected in the current working directory.
# The preview in the next cell shows the columns article_id, article_text and source.
df = pd.read_csv("tennis_articles.csv")

In [4]:
# Preview the first rows to check that the CSV was parsed into the expected columns.
df.head()


Unnamed: 0,article_id,article_text,source
0,1,Maria Sharapova has basically no friends as te...,https://www.tennisworldusa.org/tennis/news/Mar...
1,2,"BASEL, Switzerland (AP), Roger Federer advance...",http://www.tennis.com/pro-game/2018/10/copil-s...
2,3,Roger Federer has revealed that organisers of ...,https://scroll.in/field/899938/tennis-roger-fe...
3,4,Kei Nishikori will try to end his long losing ...,http://www.tennis.com/pro-game/2018/10/nishiko...
4,5,"Federer, 37, first broke through on tour over ...",https://www.express.co.uk/sport/tennis/1036101...


In [5]:
# Display the text of the article stored under row label 0.
df.loc[0, 'article_text']

"Maria Sharapova has basically no friends as tennis players on the WTA Tour. The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much. I think everyone knows this is my job here. When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match. I'm a pretty competitive girl. I say my hellos, but I'm not sending any players flowers as well. Uhm, I'm not really friendly or close to many players. I have not a lot of friends away from the courts.' When she said she is not really close to a lot of players, is that something strategic that she is doing? Is it different on the men's tour than the women's tour? 'No, not at all. I think just because you're in the same 

In [6]:
# Display the text of the article stored under row label 2.
df.loc[2, 'article_text']

'Roger Federer has revealed that organisers of the re-launched and condensed Davis Cup gave him three days to decide if he would commit to the controversial competition. Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment. "They only left me three days to decide", Federer said. "I didn\'t to have time to consult with all the people I had to consult. "I could not make a decision in that time, so I told them to do what they wanted." The 20-time Grand Slam champion has voiced doubts about the wisdom of the one-week format to be introduced by organisers Kosmos, who have promised the International Tennis Federation up to $3 billion in prize money over the next quarter-century. The competition is set to feature 18 countries in the November 18-24 finals in Madrid next year, and will replace the classic home-

In [7]:
# Split every article into sentences with nltk's sent_tokenize, keeping one
# list of sentences per article.
from nltk.tokenize import sent_tokenize
tokenised_article = [sent_tokenize(article) for article in df['article_text']]

# Flatten the per-article lists into one list of sentences, preserving order.
sentences = [
    sentence
    for article_sentences in tokenised_article
    for sentence in article_sentences
]

In [9]:
# Peek at the first seven tokenised sentences.
sentences[:7] 

['Maria Sharapova has basically no friends as tennis players on the WTA Tour.',
 "The Russian player has no problems in openly speaking about it and in a recent interview she said: 'I don't really hide any feelings too much.",
 'I think everyone knows this is my job here.',
 "When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.",
 "I'm a pretty competitive girl.",
 "I say my hellos, but I'm not sending any players flowers as well.",
 "Uhm, I'm not really friendly or close to many players."]

In [10]:
# We will be using the pre-trained Wikipedia 2014 + Gigaword 5 GloVe as our collection of word-embeddings

# Extract word vectors: every line holds a token followed by its coefficients,
# separated by whitespace. The result maps token -> float32 numpy array.
word_embeddings = {}
# The context manager closes the file even if a line fails to parse.
with open('glove.6B/glove.6B.100d.txt', encoding='utf-8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefficients

In [11]:
# Sanity check: number of distinct tokens loaded into word_embeddings.
len(word_embeddings)

400000

In [12]:
# TEXT Pre-Processing

# Replace every character that is not an ASCII letter (digits, punctuation, ...)
# with a space. regex=True is required: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the sentences unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# Lowercase every sentence; the result is a plain Python list of strings.
clean_sentences = [sentence.lower() for sentence in clean_sentences]

In [13]:
# Removing Stop-Words (commonly used words of a language – is, am, the, of, in, etc.)

# Downloading the stop words (the log below shows nltk skips the download when
# the package is already up to date).
nltk.download('stopwords')

# Now we can import the stopwords.
from nltk.corpus import stopwords
# English stop words; consulted by remove_stopwords below.
stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(sentence, stopword_list=None):
    """Join the words of ``sentence`` that are not stop words with single spaces.

    Parameters
    ----------
    sentence : iterable of str
        The words of one sentence, already split.
    stopword_list : iterable of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` is used,
        which is the original behaviour.

    Returns
    -------
    str
        The remaining words joined by a single space ("" when nothing remains).
    """
    if stopword_list is None:
        stopword_list = stop_words
    # A set gives constant-time membership tests instead of scanning a list per word.
    excluded = set(stopword_list)
    return " ".join(word for word in sentence if word not in excluded)

# Split each cleaned sentence on whitespace and drop its stop words.
clean_sentences = [remove_stopwords(words) for words in map(str.split, clean_sentences)]

[nltk_data] Downloading package stopwords to /Users/apple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# VECTOR REPRESENTATION OF SENTENCES

# One vector per cleaned sentence: the sum of its word vectors divided by
# (word count + 0.001). Words missing from word_embeddings contribute a zero
# vector, and an empty sentence maps to a zero vector.
sentence_vectors = []
for sentence in clean_sentences:
    if not sentence:
        vector = np.zeros((100,))
    else:
        tokens = sentence.split()
        token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
        vector = sum(token_vectors) / (len(tokens) + 0.001)
    sentence_vectors.append(vector)

In [17]:
# PREPARATION OF SIMILARITY MATRIX

# We will use Cosine Similarity to compute the similarity between a pair of sentences.
from sklearn.metrics.pairwise import cosine_similarity

# Create the matrix in this cell so it exists on a fresh kernel (no earlier
# visible cell defines it). Entry [i, j] is the cosine similarity between
# sentence i and sentence j.
if sentence_vectors:
    # One call on the stacked (n_sentences, dim) array yields every pairwise
    # similarity, replacing n^2 separate calls on reshaped single vectors.
    similarity_matrix = cosine_similarity(np.vstack(sentence_vectors))
    # The diagonal is zeroed so a sentence is not linked to itself.
    np.fill_diagonal(similarity_matrix, 0.0)
else:
    # No sentences: keep an empty square matrix for the downstream cells.
    similarity_matrix = np.zeros((0, 0))

In [19]:
# APPLYING PAGE RANK ALGORITHM
# Before going further, let us convert the similarity matrix into a graph.
# The nodes of this graph will represent the sentences and the edges will represent
# the similarity scores between the sentences.
# On this graph, we will apply the PageRank algorithm to arrive at the sentence rankings.
import networkx as nx

# Build the graph directly from the square similarity matrix.
nx_graph = nx.from_numpy_array(similarity_matrix)
# PageRank yields a dict keyed by integer node id (see the output below); the
# summary cell looks these ids up by sentence position.
scores = nx.pagerank(nx_graph)
# Display the full node -> score mapping.
scores

{0: 0.008072651850222512,
 1: 0.008501993234642567,
 2: 0.007811931807453844,
 3: 0.009293791275448712,
 4: 0.007500319318214275,
 5: 0.008146814792723995,
 6: 0.008477413386742535,
 7: 0.008251000814613184,
 8: 0.008596957752663233,
 9: 0.008257144255188685,
 10: 0.0012695751770095795,
 11: 0.008860552417038809,
 12: 0.00808354329685214,
 13: 0.008156804667616403,
 14: 0.008443316914856017,
 15: 0.008556893026564389,
 16: 0.00781282665344099,
 17: 0.008071958040219474,
 18: 0.008406020966624999,
 19: 0.008847892278310357,
 20: 0.00886086521110968,
 21: 0.007421917078736729,
 22: 0.008223434005176018,
 23: 0.008991766437337142,
 24: 0.00846397038200988,
 25: 0.006701898132655766,
 26: 0.008232471647009004,
 27: 0.008913135590780042,
 28: 0.009061682960248691,
 29: 0.009093905738349194,
 30: 0.009244521561084428,
 31: 0.008994323924050843,
 32: 0.007236869098405197,
 33: 0.00870909310685239,
 34: 0.008919130532277221,
 35: 0.00909742140395915,
 36: 0.007715970734839171,
 37: 0.008883452

In [21]:
# SUMMARY EXTRACTION

# Extracting the top N sentences based on their rankings for summary generation.
TOP_N = 10  # number of sentences to print in the summary

# Pair every sentence with its PageRank score and order from highest to lowest.
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

# Slicing instead of indexing over range(10) avoids an IndexError when there
# are fewer than TOP_N sentences.
for score, sentence in ranked_sentences[:TOP_N]:
    print(sentence, end="\n\n\n")

When I'm on the courts or when I'm on the court playing, I'm a competitor and I want to beat every single person whether they're in the locker room or across the net.So I'm not the one to strike up a conversation about the weather and know that in the next few minutes I have to go and try to win a tennis match.


Major players feel that a big event in late November combined with one in January before the Australian Open will mean too much tennis and too little rest.


Speaking at the Swiss Indoors tournament where he will play in Sundays final against Romanian qualifier Marius Copil, the world number three said that given the impossibly short time frame to make a decision, he opted out of any commitment.


"I felt like the best weeks that I had to get to know players when I was playing were the Fed Cup weeks or the Olympic weeks, not necessarily during the tournaments.


Currently in ninth place, Nishikori with a win could move to within 125 points of the cut for the eight-man event in