In [1]:
### This is text summarization.
### Objective, to summarize an article and make sense off of it.

In [2]:
from nltk.corpus import stopwords
import nltk
import re
import collections

In [3]:
def calculate_sentence_frequency(sentence, average_sentence_word_count):
    """
    Calculates the weighted frequency of a single sentence.
    Parameters:
    1. sentence. A string containing multiple words.
    Returns : word_frequencies (type = dict) list of words and associative weights.
    """
    word_frequencies = {}
    if len(sentence.split(" ")) < average_sentence_word_count:
        for word in nltk.word_tokenize(sentence):
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
        max_word_frequency = max(word_frequencies.values()) if len(word_frequencies.values()) > 0 else 1
        for word in word_frequencies.keys():
            word_frequencies[word] /= max_word_frequency
    return word_frequencies

In [4]:
def get_text_weighted_score(paragraph, average_word_count):
    """
    Generates the weighted score of the entire text.
    Uses calculate_sentence_frequency(paragraph[i]).
    Parameters:
    1. paragraph. A list of sentences.
    Returns:
    1. sentence_scores (type = dict) list of sentence and associative weights.
    """
    sentence_scores = {}
    for i, sent in enumerate(paragraph):
        word_frequencies = calculate_sentence_frequency(paragraph[i], average_word_count)
        for word in word_frequencies.keys():
            if sent not in sentence_scores.keys():
                sentence_scores[sent] =  word_frequencies[word]
            else:
                sentence_scores[sent] += word_frequencies[word]
    return sentence_scores

In [5]:
def main():
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    
    ORIGINAL_TEXT = """
French startup BlaBlaCar has announced that the company’s revenue grew by 71 percent in 2019 compared to 2018. The big difference between 2019 and 2018 is that BlaBlaCar diversified its activity by offering bus rides as well as bus ticketing in some markets.
BlaBlaCar is still mostly known for its long-distance ride-sharing marketplace. If you’re going from one city to another, you can find a car with an empty seat and book a ride in that car. On the other side of the marketplace, if you plan on driving across the country, you can list your ride on the platform to find passengers so that you don’t have to pay for gas and highway tolls by yourself.
In November 2018, the company acquired Ouibus to become a marketplace for road travel, whether it’s by bus or by car. Ouibus is now called BlaBlaBus. BlaBlaCar also offers a carpooling marketplace for daily commutes between your home and your workplace called BlaBlaLines.
BlaBlaBus covers 400 cities in Europe while BlaBlaLines has managed to attract 1.5 million users.
The bottom line is that BlaBlaCar has built a huge community. The company now has 87 million users, with 17 million people signing up in 2019 alone. BlaBlaCar carried 70 million passengers across all its services last year.
In France, the long-distance carpooling service reached a record of 135,000 passengers in a single day. I’d bet that the railway company strike may have helped.
When it comes to the company itself, BlaBlaCar has hired a Chief Operating Officer, Béatrice Dumurgier. While BlaBlaCar faced some growing pains a couple of years ago, the company now plans to expand its team again by doubling the size of its engineering team in 2020.
"""
    
    TESLA_TEXT = ORIGINAL_TEXT.lower().replace(". ", " qwertyuiop")
    TESLA_TEXT = re.sub('[^a-zA-Z]', ' ', TESLA_TEXT )
    TESLA_TEXT = re.sub(r'\s+', ' ', TESLA_TEXT)
    TESLA_TEXT = TESLA_TEXT.split(" qwertyuiop")

    average_sentence_word_count = len(TESLA_TEXT)
    sum_word_count = 0
    for c,text in enumerate(TESLA_TEXT):
        TESLA_TEXT[c] = ' '.join([word for word in text.split() if word not in STOPWORDS])
        sum_word_count += len(TESLA_TEXT[c].split(" "))

    average_sentence_word_count = sum_word_count / average_sentence_word_count
    
    sentence_scores = get_text_weighted_score(TESLA_TEXT, average_sentence_word_count)
    original_dict = {}
    ORIGINAL_TEXT = ORIGINAL_TEXT.split(". ")
    for i, sentences in enumerate(sentence_scores.items()):
        original_dict[ORIGINAL_TEXT[i]] = sentences[1]
    sorted_sentences = sorted(original_dict.items(), key=lambda x: x[1], reverse=True)
    return sorted_sentences

In [6]:
if __name__ == "__main__":
    sorted_sentences = main()
    final_list = []
    for i, s in enumerate(sorted_sentences):
        final_list.append(s[0])

In [7]:
final_list

['\nFrench startup BlaBlaCar has announced that the company’s revenue grew by 71 percent in 2019 compared to 2018',
 'Ouibus is now called BlaBlaBus',
 'The big difference between 2019 and 2018 is that BlaBlaCar diversified its activity by offering bus rides as well as bus ticketing in some markets.\nBlaBlaCar is still mostly known for its long-distance ride-sharing marketplace',
 'On the other side of the marketplace, if you plan on driving across the country, you can list your ride on the platform to find passengers so that you don’t have to pay for gas and highway tolls by yourself.\nIn November 2018, the company acquired Ouibus to become a marketplace for road travel, whether it’s by bus or by car',
 'If you’re going from one city to another, you can find a car with an empty seat and book a ride in that car']