In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import English
import numpy as np

In [2]:
# Create the English language object used as the text-processing pipeline.
nlp = English()
# Attach the 'sentencizer' component; later cells iterate `doc.sents` to get
# the individual sentences of the corpus.
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [4]:
# Sample text that the rest of the notebook splits into sentences and summarises.
text_corpus = """
Become a consumer and spend 20% of your time familiarizing yourself within your chosen field.
Painters show up at numerous art exhibits. Chefs eat at cutting-edge restaurants, visit farms, and travel to food shows. Songwriters are constantly listening to music, new and old.
Imitate the style of great creatives in your field before you start adding your personal spin to your art.
You may think that breakout success comes from breaking the pattern. In reality, it is only by following a pattern that you tap into the right level of novelty.
Develop a creative community that consists of a mentor, collaborator, modern muse and prominent promoter.
Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.
Iterate your work by talking to customers and using data-drive processes to refine ideas and turn them into great pieces of work.
The biggest secret to creating something your audience will love? Listen to them.
"""

In [5]:
# Split the corpus into sentences. Line breaks are turned into spaces (not
# removed) so that words on adjacent lines are never fused together.
doc = nlp(text_corpus.replace("\n", " "))
# `Span.text` is used instead of the deprecated `Span.string`; stripping
# removes the whitespace left over from the original line breaks.
sentences = [sent.text.strip() for sent in doc.sents]

In [6]:
# Show the list of sentences produced by the sentence splitter.
print("Senetence are: \n", sentences)

Senetence are: 
 ['Become a consumer and spend 20% of your time familiarizing yourself within your chosen field.', 'Painters show up at numerous art exhibits.', 'Chefs eat at cutting-edge restaurants, visit farms, and travel to food shows.', 'Songwriters are constantly listening to music, new and old.', 'Imitate the style of great creatives in your field before you start adding your personal spin to your art.', 'You may think that breakout success comes from breaking the pattern.', 'In reality, it is only by following a pattern that you tap into the right level of novelty.', 'Develop a creative community that consists of a mentor, collaborator, modern muse and prominent promoter.', 'Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.', 'Iterate your work by talking to customers and using data-drive processe

In [7]:

# Remember the position of every sentence in the text so that the top-scoring
# sentences can be put back into reading order later on.
# The sentence text is the key, so a repeated sentence keeps only its last position.
sentence_organizer = dict(zip(sentences, range(len(sentences))))

In [8]:

# Show the sentence -> position mapping built in the previous cell.
print("Our sentence organizer: \n", sentence_organizer)

Our sentence organizer: 
 {'Become a consumer and spend 20% of your time familiarizing yourself within your chosen field.': 0, 'Painters show up at numerous art exhibits.': 1, 'Chefs eat at cutting-edge restaurants, visit farms, and travel to food shows.': 2, 'Songwriters are constantly listening to music, new and old.': 3, 'Imitate the style of great creatives in your field before you start adding your personal spin to your art.': 4, 'You may think that breakout success comes from breaking the pattern.': 5, 'In reality, it is only by following a pattern that you tap into the right level of novelty.': 6, 'Develop a creative community that consists of a mentor, collaborator, modern muse and prominent promoter.': 7, 'Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.': 8, 'Iterate your work by talking to cus

In [9]:
# Build a TF-IDF (Term Frequency - Inverse Document Frequency) model.
# - min_df=2 keeps only terms that occur in at least two documents (sentences).
# - ngram_range=(1, 3) scores unigrams, bigrams and trigrams.
# - The on/off options are real booleans: recent scikit-learn releases no
#   longer accept integers such as 1 for boolean parameters.
tf_idf_vectorizer = TfidfVectorizer(min_df=2,
                                    max_features=None,
                                    strip_accents='unicode',
                                    analyzer='word',
                                    token_pattern=r'\w{1,}',
                                    ngram_range=(1, 3),
                                    use_idf=True,
                                    smooth_idf=True,
                                    sublinear_tf=True,
                                    stop_words='english')

In [10]:
# Fit the TF-IDF vectorizer on the sentences, treating each sentence as one document.
# As the last expression of the cell, the fitted vectorizer is also displayed.
tf_idf_vectorizer.fit(sentences)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=2, ngram_range=(1, 3), norm='l2', preprocessor=None,
                smooth_idf=1, stop_words='english', strip_accents='unicode',
                sublinear_tf=1, token_pattern='\\w{1,}', tokenizer=None,
                use_idf=1, vocabulary=None)

In [11]:
# Transform the sentences into TF-IDF vectors with the fitted vectorizer
# (the next cell sums along axis=1 to obtain one score per sentence).
sentence_vectors = tf_idf_vectorizer.transform(sentences)

In [12]:
# Score every sentence by adding up the TF-IDF weights of its row,
# then flatten the column of totals into a 1-D array.
sentence_scores = np.asarray(sentence_vectors.sum(axis=1)).reshape(-1)

# Sanity check: there must be exactly one score per sentence
print(len(sentence_scores) == len(sentences))

True


In [13]:
# Pick the N best-scoring sentences: argsort ranks ascending, so the ranking
# is reversed before the first N positions are taken.
N = 3
top_n_sentences = [
    sentences[position]
    for position in np.argsort(sentence_scores)[::-1][:N]
]

In [14]:
# Pair every selected sentence with its original position, looked up in
# the sentence_organizer mapping built earlier.
mapped_top_n_sentences = [
    (chosen, sentence_organizer[chosen]) for chosen in top_n_sentences
]
print("Our top_n_sentence with their index: \n")
for element in mapped_top_n_sentences:
    print(element)

# Put the selected sentences back into the order they have in the text
mapped_top_n_sentences.sort(key=lambda pair: pair[1])
ordered_scored_sentences = [chosen for chosen, _ in mapped_top_n_sentences]

# The final summary is the re-ordered sentences joined by single spaces
summary = " ".join(ordered_scored_sentences)

Our top_n_sentence with their index: 

('Imitate the style of great creatives in your field before you start adding your personal spin to your art.', 4)
('Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.', 8)
('You may think that breakout success comes from breaking the pattern.', 5)


In [15]:
# Show the final summary assembled in the previous cell.
print("Summary: \n", summary)

Summary: 
 Imitate the style of great creatives in your field before you start adding your personal spin to your art. You may think that breakout success comes from breaking the pattern. Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.


In [16]:
def summarizer(text, tokenizer, max_sent_in_summary=3):
    """Build an extractive summary of ``text`` from its best-scoring sentences.

    The text is split into sentences with ``tokenizer``. Each sentence is
    treated as one document of a TF-IDF model and is scored by the sum of its
    TF-IDF weights. The highest-scoring sentences are returned in the order
    in which they occur in ``text``.

    Parameters
    ----------
    text : str
        The raw text to summarise. Line breaks are treated as spaces.
    tokenizer : callable
        Called as ``tokenizer(text)``; the returned object must expose
        ``.sents``, an iterable of spans that each have a ``.text`` attribute
        (for example a spaCy pipeline with a sentencizer).
    max_sent_in_summary : int, default 3
        Maximum number of sentences in the summary.

    Returns
    -------
    str
        The selected sentences joined by single spaces. An empty string is
        returned when ``max_sent_in_summary`` is not positive or when no
        sentence is found. When the text has no more sentences than
        requested, all of them are returned unchanged.

    Notes
    -----
    Errors raised by ``TfidfVectorizer`` are propagated. With ``min_df=2``
    this presumably includes a ``ValueError`` when no term occurs in at least
    two sentences -- verify against the installed scikit-learn version.
    """
    if max_sent_in_summary <= 0:
        return ""

    # Use the caller's text and tokenizer (not notebook globals). Newlines
    # become spaces so that words on adjacent lines are not fused together.
    doc = tokenizer(text.replace("\n", " "))
    # `Span.text` replaces the deprecated `Span.string`; blank spans are dropped.
    sentences = [sent.text.strip() for sent in doc.sents]
    sentences = [sentence for sentence in sentences if sentence]

    # Nothing to rank: every sentence fits into the requested summary length.
    if len(sentences) <= max_sent_in_summary:
        return " ".join(sentences)

    # TF-IDF (Term Frequency - Inverse Document Frequency) model where each
    # sentence is one document. Boolean options are passed as real booleans
    # because recent scikit-learn releases reject integers for them.
    tf_idf_vectorizer = TfidfVectorizer(min_df=2,
                                        max_features=None,
                                        strip_accents='unicode',
                                        analyzer='word',
                                        token_pattern=r'\w{1,}',
                                        ngram_range=(1, 3),
                                        use_idf=True,
                                        smooth_idf=True,
                                        sublinear_tf=True,
                                        stop_words='english')
    sentence_vectors = tf_idf_vectorizer.fit_transform(sentences)

    # One score per sentence: the sum of the TF-IDF weights in its row.
    sentence_scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Positions of the best-scoring sentences (argsort is ascending, hence the reversal).
    top_positions = np.argsort(sentence_scores)[::-1][:max_sent_in_summary]

    # Sorting the positions themselves restores reading order and, unlike a
    # sentence -> position dictionary, stays correct when a sentence repeats.
    return " ".join(sentences[position] for position in sorted(top_positions))

In [17]:
# Run the packaged summarizer on the sample corpus and print a three-sentence summary.
print("Summarizer Result: \n", summarizer(text=text_corpus, tokenizer=nlp, max_sent_in_summary=3))

Summarizer Result: 
 Imitate the style of great creatives in your field before you start adding your personal spin to your art. You may think that breakout success comes from breaking the pattern. Studies show that building a community of people around us is essential to achieving world-class success...research demonstrated that an innovator’s network could predict prominence, productivity, and even the length of their career.
