# Text Summarization

In [1]:
# Load libraries
import re
import numpy as np
from nltk.corpus import stopwords
import math

## Get data from files

In [2]:
def get_sentences_from_file(filename, encoding="utf-8"):
    """Returns the list of sentences extracted from a text file.

    Every non-empty line is split on the end-of-sentence marks ``.``, ``!``
    and ``?`` (an ellipsis ``...`` counts as a single full stop). Sentences
    are returned without their terminating punctuation and without
    surrounding whitespace; empty fragments are dropped.

    @params
        filename name of the file to load
        encoding text encoding used to decode the file (default "utf-8")
    @return
        list of sentences, in the order they appear in the file
    """
    sentences = []

    # The context manager closes the file even when reading or decoding fails.
    # The encoding is explicit so accented characters do not depend on the
    # platform default.
    with open(filename, "r", encoding=encoding) as lines:
        for line in lines:
            # Delete whitespace (including the newline) at both ends of the line
            line = line.strip()

            # Don't take care of empty lines
            if not line:
                continue

            # Special case of "...": treat it as one end-of-sentence mark
            line = line.replace("...", ".")

            # Split the line on the end-of-sentence signs . ! ?
            for sentence in re.split(r'[.!?]', line):
                # A fragment keeps the spaces that surrounded the punctuation
                # ("A. B" -> "A", " B"); strip them and skip empty fragments.
                sentence = sentence.strip()
                if sentence:
                    sentences.append(sentence)

    return sentences

## Remove non-pertinent words

In [3]:
def get_filtered_sentences(sentences, stop_words=None):
    """Returns, for each sentence, the list of its words once punctuation,
       contracted forms and stop words have been removed.

    @params
        sentences  List of sentences (strings)
        stop_words Optional iterable of lower-case words to drop; when omitted
                   the NLTK French stop word list is used
    @return
        list of filtered sentences, each one being a list of words.
        Empty input sentences are skipped, so they have no entry in the result.
    """
    # Load the stop words once and keep them in a set: the membership test
    # below runs for every word of every sentence.
    if stop_words is None:
        # Only french stop words
        stop_words = stopwords.words('french')
    stop_words = set(stop_words)

    # Unwanted characters (punctuation and quotes), removed in a single pass
    unwanted_characters = str.maketrans("", "", "«»,;:\"()")

    # Contracted forms and their expansion, applied in this order
    contracted_forms = (
        ("n'", "ne "),
        ("l'", "le "),
        ("d'", "de "),
        ("s'", "si "),
        ("qu'", "que "),
        ("c'", "ce "),
    )

    filtered_sentences = []

    for sentence in sentences:
        if not sentence:
            continue

        # Sentence in lower case, without the unwanted characters
        sentence = sentence.lower().translate(unwanted_characters)

        # Normalise the typographic apostrophe so every contracted form is
        # expanded whichever apostrophe the text uses
        sentence = sentence.replace("’", "'")

        # Replace contracted forms
        for contracted, expanded in contracted_forms:
            sentence = sentence.replace(contracted, expanded)

        # Split on any run of whitespace (no empty-string tokens) and remove
        # the stop words
        filtered_words = [word for word in sentence.split() if word not in stop_words]
        filtered_sentences.append(filtered_words)

    return filtered_sentences

In [4]:
# Quick check of the loading and cleaning steps on one article
sentences = get_sentences_from_file( "data/article_002.txt")
# First sentence read from the file
print(sentences[0])
filtered_sentences = get_filtered_sentences(sentences)
# Word lists produced for every sentence
print(filtered_sentences)

Le constat d'échec de la justice dans la prévention des homicides conjugaux
[['constat', 'échec', 'justice', 'prévention', 'homicides', 'conjugaux'], ['rapport', 'inspection', 'générale', 'justice', 'homicides', 'conjugaux', '88', 'cas', 'définitivement', 'jugés', 'pointe', 'graves', 'dysfonctionnements', 'chaîne', 'pénale'], ['décidant', 'rendre', 'public', 'dimanche', '17', 'novembre', 'rapport', 'inspection', 'générale', 'justice', 'homicides', 'conjugaux', 'nicole', 'belloubet', 'garde', 'sceaux', 'dévoile', 'sans', 'fard', 'cloche', 'détection', 'signes', 'annonciateurs', 'crimes'], ['constat', 'alarmant', 'tant', 'côté', 'services', 'police', 'gendarmerie', 'côté', 'magistrats', 'services', 'pénitentiaires', 'services', 'sociaux', 'médicaux'], ['très', 'clairement', 'ça', 'va'], ['chaîne', 'pénale', 'satisfaisante', 'reconnaît', 'ministre', 'justice', 'entretien', 'publié', 'jour', 'journal', 'dimanche'], ['mission', 'inspection', 'a', 'examiné', '88', 'dossiers', 'homicides', 'c

## Cosine similarity

In [5]:
def get_cosine_similarity(a, b):
    """Returns the cosine similarity of 2 vectors
    @params
        a vector
        b vector (same length as a)
    @return
        cosine similarity, or 0.0 when at least one of the vectors has a
        zero norm (the cosine is undefined in that case)
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)

    # An all-zero vector would give 0/0 = NaN, and a NaN would then
    # contaminate every sum computed over the similarity values.
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product

## Vectorize sentences

### TF-IDF

In [6]:
def tfidf_vector(filtered_sentences):
    """Returns vectorized sentences with the TF-IDF of the words

    Each sentence is treated as one document:
      TF  = occurrences of the word in the sentence / number of words in
            the sentence
      IDF = log(number of sentences / number of sentences containing the word)

    @params:
        filtered_sentences list of sentences with list of words
    @return:
        List of the same sentences where the words are replaced by
          the TF-IDF of the word. Every vector is padded with 0 up to the
          length of the longest sentence.
    """
    # NOTE(review): the vectors are positional (slot i holds the score of the
    # i-th word of the sentence), so slot i of two sentences generally refers
    # to different words -- confirm this is the intended input for the
    # similarity step.
    term_frequency = []
    sentence_max_len = 0

    # Get the occurrence of each word in a sentence
    for sentence in filtered_sentences:
        sentence_max_len = max(sentence_max_len, len(sentence))

        occurrences = {}
        for word in sentence:
            occurrences[word] = occurrences.get(word, 0) + 1

        term_frequency.append(occurrences)

    # Document-frequency table, local to the call: the function no longer
    # needs a pre-existing global and repeated calls do not accumulate counts.
    words_idf = {}

    # Term Frequency: word occurrence / total count of words in the sentence
    # Prepare IDF: number of sentences in which each word occurs
    for sentence, terms in zip(filtered_sentences, term_frequency):
        word_count = len(sentence)
        for term in terms:
            terms[term] = terms[term] / word_count
            words_idf[term] = words_idf.get(term, 0) + 1

    # IDF
    document_count = len(filtered_sentences)
    for word in words_idf:
        words_idf[word] = math.log(document_count / words_idf[word])

    # TF-IDF for each word and vectorize sentences
    tf_idfs = []

    for idx, sentence in enumerate(filtered_sentences):
        # The vectors must have the same length
        vector = [0] * sentence_max_len
        for word_index, word in enumerate(sentence):
            vector[word_index] = term_frequency[idx][word] * words_idf[word]
        tf_idfs.append(vector)
    return tf_idfs

### Word index

In [7]:
def word_vector(filtered_sentences):
    """Returns vectorized sentences where each word is replaced by its ID
    @params:
        filtered_sentences list of sentences with list of words
    @return:
        One vector per sentence holding the ID of each word, in sentence
          order, padded with 0 up to the length of the longest sentence.
          IDs are assigned from 0 in order of first appearance.
    """
    word_ids = {}
    longest = 0

    # First pass: build the vocabulary and find the longest sentence
    for tokens in filtered_sentences:
        longest = max(longest, len(tokens))
        for token in tokens:
            # A new word receives the next free ID (the current vocabulary size)
            word_ids.setdefault(token, len(word_ids))

    # Second pass: translate the words and pad every vector to the same length
    return [
        [word_ids[token] for token in tokens] + [0] * (longest - len(tokens))
        for tokens in filtered_sentences
    ]

### Word count

In [8]:
def wordcount_vector(filtered_sentences):
    """Returns vectorized sentences where each word is replaced by its count
    @params:
        filtered_sentences list of sentences with list of words
    @return:
        One vector per sentence holding, for each word in sentence order,
          the number of times that word occurs across all the sentences,
          padded with 0 up to the length of the longest sentence.
    """
    # Occurrences of every word over the whole list of sentences
    totals = {}
    for tokens in filtered_sentences:
        for token in tokens:
            totals[token] = totals.get(token, 0) + 1

    # All vectors share the length of the longest sentence
    longest = max((len(tokens) for tokens in filtered_sentences), default=0)

    return [
        [totals[token] for token in tokens] + [0] * (longest - len(tokens))
        for tokens in filtered_sentences
    ]

## Do summarization

In [17]:
words = []

# Fresh module-level table, kept for code that reads `words_idf` as a global
words_idf = {}

# Number of best-ranked sentences printed in the summary
top_n = 3

# Load sentences
sentences = get_sentences_from_file( "data/article_002.txt")

# Clean sentences
filtered_sentences = get_filtered_sentences(sentences)

# Convert sentences to vectors
vectors = tfidf_vector(filtered_sentences)
#vectors = word_vector(filtered_sentences)
#vectors = wordcount_vector(filtered_sentences)

# Get the cosine similarity between all the pairs of distinct sentences
# (the diagonal stays at 0 so a sentence does not score against itself)
similarity_matrix = np.zeros((len(vectors), len(vectors)))

for idx1, sent1 in enumerate(vectors):
    for idx2, sent2 in enumerate(vectors):
        if idx1 == idx2:
            continue
        similarity_matrix[idx1][idx2] = get_cosine_similarity(np.array(sent1), np.array(sent2))

# Rank sentences to get the best cosine similarity first:
# the score of a sentence is the sum of its similarities with all the others
rank = [(idx, sum(sim)) for idx, sim in enumerate(similarity_matrix)]

sorted_rank = sorted(rank, key=lambda sim: sim[1], reverse=True)

# Print result
print(f"Title : {sentences[0]}")
print ("---------------")
# Slice instead of counting `top_n` down: the setting keeps its value and
# a value of 0 (or less) prints nothing rather than every sentence
for idx, _score in sorted_rank[:max(top_n, 0)]:
    print(sentences[idx] + '.')

Title : Le constat d'échec de la justice dans la prévention des homicides conjugaux
---------------
L'absence de dénonciation par les médecins est également déplorée, alors qu'une dizaine de victimes de violences conjugales avaient auparavant consulté à l'hôpital ou en cabinet.
 "Cette absence de dénonciation ou de signalement a empêché la mise en place de mesures susceptibles de prévenir l'homicide ultérieur", note l’inspection.
 Dans 35% des cas où des violences préexistaient, elles n’avaient pas été dénoncées à la police, mais étaient le plus souvent connues de la famille, des voisins ou de services sociaux.
