In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.porter import *
import re

In [2]:
passage = """
If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him?

The question of how much these two other-worldly players inspire each other is an interesting one,
and it's tempting to imagine Messi sitting at home on Tuesday night, watching Ronaldo destroying Atletico, 
angrily glaring at the TV screen and growling: "Right, I'll show him!"

As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least.

He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and 
personal glory, however, the Argentine's personal drug is trophies.

Ronaldo, it can be said, never looks happy on the field of play unless he's just scored a goal - and even 
then he's not happy for long, because he just wants to score another one. And that relentless obsession with 
finding the back of the net has undoubtedly played a major role in his stunning career achievements.

Messi, though, is a different animal, shown by the generosity with which he sets up team-mates even if he has 
a chance to shoot, regularly hands over penalty-taking duties to others and invariably celebrates a goal by turning 
straight to the player who passed him the ball with an appreciative smile.

Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close to
him - are being the best possible version of Lionel Messi, and winning as many trophies as possible.

That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi.

Do Messi and Ronaldo inspire each other? "Maybe subconsciously in some way they've driven each other on," said Rodgers.
"But I think both those players inherently have that hunger to be the best players they can be. With the very elite 
performers, that drive comes from within."

Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate 
and the media to spread, but it's probably not particularly true.
"""

### Text Standardization

In [3]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [4]:
contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))
def expand_contractions(s, contractions_dict=contractions):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, s)
 
sentences = sent_tokenize(passage)    
sentences = [expand_contractions(i) for i in sentences]
sentences = [re.sub('\n', '', i) for i in sentences]

The goal is to compare the results of a few novel approaches:
1. Sentence Scoring based on Word Frequency
2. TextRank using Universal Sentence Encoder
3. Unsupervised Learning using Skip-Thought Vectors

### Sentence Scoring based on Word Freqency (Python 2.7/3.5)

The first approach we will explore is the simplest of the three. Here we assign weights to each word based on the frequency of the word in the passage. For example, if "Soccer" occurs 4 times within the passage, it will have a weight of 4. 

In [39]:
def create_freq_table(text_string):
    stopwords_list = set(stopwords.words('english'))
    
    words = word_tokenize(text_string)
    
    ps = PorterStemmer()
    
    freq_table = {}
    
    for word in words:
        #stem word 
        word = ps.stem(word)
        
        #remove stopwords
        if word in stopwords_list: 
            continue
        elif word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
            
    return freq_table

freq_table = create_freq_table(" ".join(sentences))

Using the weights assigned to each word above, we will create a score for each sentence. At the end of the day, we will be taking the score of the top `N` for the summary. As you'd imagine, just by leveraging the raw score of each sentence, the length of certain sentences will skew the results. This is why will normalize the scores by dividing by the length of each sentence. 

In [5]:
def score_sentences(sentences, freq_table):
    
    sentence_value = {}
    
    for sentence in sentences:
        word_count_in_sentence = len(word_tokenize(sentence))
        
        for wordValue in freq_table:
            
            if wordValue.lower() in sentence.lower():                
                if sentence in sentence_value:
                    sentence_value[sentence] += freq_table[wordValue]
                else:
                    sentence_value[sentence] = freq_table[wordValue]

        sentence_value[sentence] = sentence_value[sentence] // word_count_in_sentence
    return sentence_value

def find_average_score(sentence_value):
    sum_values = 0
    
    for entry in sentence_value:
        sum_values += sentence_value[entry]
        
    average = int(sum_values/len(sentence_value))
    
    return average

Now, to create the summary, we will take any sentence that has a score that exceeds a threshold. In this case, the threshold will be the average score for for all of the sentences. 

In [7]:
def generate_summary(sentences, sentence_value, threshold):
    sentence_count = 0
    
    summary = ''
    
    for sentence in sentences:
        if sentence in sentence_value and sentence_value[sentence] > threshold:
            summary += " " + sentence
            sentence_count += 1
            
    return summary 
            
            

In [41]:
#End to End Run
freq_table = create_freq_table(" ".join(sentences))

sentence_scores = score_sentences(sentences, freq_table)

threshold = find_average_score(sentence_scores)

summary = generate_summary(sentences, sentence_scores, 1.0 * threshold)

print(re.sub('\n','',summary))

 If Cristiano Ronaldo didn't exist, would Lionel Messi have to invent him? As appealing as that picture might be, however, it is probably a false one - from Messi's perspective, at least. He might show it in a different way, but Messi is just as competitive as Ronaldo. Rather than goals and personal glory, however, the Argentine's personal drug is trophies. Do Messi and Ronaldo inspire each other? "Maybe subconsciously in some way they've driven each other on," said Rodgers. With the very elite performers, that drive comes from within."


### Text Rank using Universal Sentence Embeddings (Python 3.7)

Next, we evaluate the results generated when using universal sentence embeddings and text rank to generate summaries.

**Text Rank**
This is essentially a derivative of the famous PageRank created by the Google cofounders. In PageRank, they generated a matrix that calculates the probability that a user will move from one page to another. In the case of TextRank, we generate a cosine similarity matrix where we have the similarity of each sentence to each other.

A graph is then generated from this cosine similarity matrix and the PageRank algorithm is applied to this graph and scores are then calculated for each sentence.

**Universal Sentence Embeddings**
Universal sentence embeddings encode words, sentences and paragraphs into semantic vectors. They are trained in Deep Averaging Networks. More details can be found here:

In [12]:
import tensorflow_hub as hub
import tensorflow as tf

module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"

embed = hub.Module(module_url)

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    message_embeddings = session.run(embed(sentences))

INFO:tensorflow:Using C:\Temp\tfhub_modules to cache modules.
INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_0:0 from checkpoint b'C:\\Temp\\tfhub_modules\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\variables\\variables' with Embeddings_en/sharded_0
INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_1:0 from checkpoint b'C:\\Temp\\tfhub_modules\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\variables\\variables' with Embeddings_en/sharded_1
INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_10:0 from checkpoint b'C:\\Temp\\tfhub_modules\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\variables\\variables' with Embeddings_en/sharded_10
INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_11:0 from checkpoint b'C:\\Temp\\tfhub_modules\\1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47\\variables\\variables' with Embeddings_en/sharded_11
INFO:tensorflow:Initialize variable module/Embeddings_en/sharded_12:0 from checkpoint b'C:\\Temp\\tfhub_modules\\1

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

#generate cosine similarity matrix
sim_matrix = cosine_similarity(message_embeddings)

#create graph and generate scores from pagerank algorithms
nx_graph = nx.from_numpy_array(sim_matrix)
scores = nx.pagerank(nx_graph)

ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
   
num_of_sentences = 5
    
summary = " ".join([i[1] for i in ranked_sentences[:num_of_sentences]])
summary

"Rather than being a better player than Ronaldo, Messi's main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. He might show it in a different way, but Messi is just as competitive as Ronaldo. Messi and Ronaldo ferociously competing with each other for everyone else's acclaim is a nice story for fans to debate and the media to spread, but it has / it is probably not particularly true. Do Messi and Ronaldo inspire each other? Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one."

### Unsupervised Learning using Skip Thought Vectors (Python 2.7)

The high level approach =  Text Cleaning -> Encoder/Decoder -> K Means Clustering -> Extract Sentences Closest to Cluster Center

**Skip Thought Vectors**

Here, we use a encoder/decoder framework to generate feature vectors.
1. Encoder Network: The encoder is typically a GRU-RNN which generates a fixed length vector representation h(i) for each sentence S(i) in the input. The encoded representation h(i) is obtained by passing the final hidden state of the GRU cell (i.e. after it has seen the entire sentence) to multiple dense layers.
2. Decoder Network: The decoder network takes this vector representation h(i) as input and tries to generate two sentences - S(i-1) and S(i+1), which could occur before and after the input sentence respectively. Separate decoders are implemented for the generation of previous and next sentences, both being GRU-RNNs. The vector representation h(i) acts as the initial hidden state for the GRUs of the decoder networks.

Similar to how Word2Vec embeddings are trained by predicting the surrounding words, the Skip Thought Vectors are trained by predicting the sentence at a time, t-1 and t+1. As this model is trained, the learned representation (hidden layer) will now place similar sentences closer together which enables higher performance clustering.

**K-Means Clustering**

When trying to create a summary, we should only need the sentence which is the closest to the centre of that cluster. The key here is choosing the correct number of clusters to do a good job of summarizing the content. Kushal's post recommends that we calculate the cluster size by taking 30% of the number of sentences.

In [5]:
import skipthoughts

# You would need to download pre-trained models first
model = skipthoughts.load_model()

encoder = skipthoughts.Encoder(model)
encoded =  encoder.encode(sentences)

Loading model parameters...
Compiling encoders...
Loading tables...
Packing up...
38
8
41
13
15
48
17
18
20
22
23
56
25
60


We used a cluster size of 7.

In [36]:
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=n_clusters)
kmeans = kmeans.fit(encoded)

n_clusters = int(np.ceil(len(encoded)**0.6))
print(n_clusters)

avg = []
for j in range(n_clusters):
    idx = np.where(kmeans.labels_ == j)[0]
    avg.append(np.mean(idx))
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
ordering = sorted(range(n_clusters), key=lambda k: avg[k])
summary = ' '.join([sentences[closest[idx]] for idx in ordering])

In [37]:
summary

'Do Messi and Ronaldo inspire each other? Ronaldo, it can be said, never looks happy on the field of play unless he has / he is just scored a goal - and even then he has / he is not happy for long, because he just wants to score another one. Rather than being a better player than Ronaldo, Messi\'s main motivations - according to the people who are close tohim - are being the best possible version of Lionel Messi, and winning as many trophies as possible. That theory was supported by Leicester boss Brendan Rodgers when I interviewed him for a book I recently wrote about Messi. With the very elite performers, that drive comes from within." "But I think both those players inherently have that hunger to be the best players they can be.'