In [None]:
import re    
import sys
import math
import nltk
import spacy
import gensim
import random
import numpy as np
import pandas as pd
from statistics import mean
from sklearn import preprocessing
from nltk.corpus import stopwords
from numpy.linalg import svd as svd
from scipy.sparse.linalg import svds
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import normalize
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Load spaCy's English pipeline through the 'en' shortcut name. The returned
# object is not bound to a name, so this call only checks that the model loads.
# NOTE(review): newer spaCy releases may no longer resolve the 'en' shortcut -
# confirm against the installed version.
spacy.load('en')
# NOTE(review): `English` is not referenced in the visible cells - confirm it is still needed.
from spacy.lang.en import English
# Download the NLTK resources used by this notebook; 'punkt' is presumably the
# model behind nltk.sent_tokenize (used in tokenize_sentences) - confirm.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def tokenize_sentences(text):
    '''
    Split a text into sentences.

    param - text: the raw document text.

    return - the result of nltk.sent_tokenize(text), i.e. the sentences of the text in document order.
    '''
    return nltk.sent_tokenize(text)
    

def _compute_matrix(sentences, weighting, norm):
    '''
    SECTION - 4.2.3 - Input Matrix Creation

    Build the sentence-by-term matrix for a list of sentences, using a binary,
    term-frequency or tf-idf representation.

    :param sentences - Iterable of sentence strings; every sentence is vectorized as one document (one matrix row).

    :param weighting - 'binary', 'frequency' or 'tfidf' (case-insensitive).

    :param norm -      'l1', 'l2' or None. Only applied for 'binary' and 'frequency'.
                       For 'tfidf' this argument is ignored: the TfidfVectorizer is always created with norm='l2'.

    :return -          (frequency_matrix, terms): the float matrix produced by the vectorizer and the list of
                       feature names (unigrams and bigrams) in column order.

    :raises ValueError - if weighting is not one of the three supported values, or if norm is not
                         'l1', 'l2' or None while weighting is 'binary' or 'frequency'.

    Vectorizer settings used below:
        ngram_range=(1, 2)   - unigrams and bigrams are extracted.
        min_df=1             - terms with a document frequency below 1 are ignored (i.e. nothing is dropped).
        binary=True          - all non-zero counts are set to 1 ('binary' weighting only).
        stop_words='english' - English stop words are removed for 'frequency' and 'tfidf';
                               the 'binary' vectorizer keeps them (stop_words=None).
    '''
    scheme = weighting.lower()
    if scheme not in ('binary', 'frequency', 'tfidf'):
        raise ValueError('Parameter "weighting" must take one of the values "binary", "frequency" or "tfidf".')

    # tf-idf is normalized inside the vectorizer, so `norm` only matters for the two count-based schemes.
    uses_explicit_norm = scheme != 'tfidf'
    if uses_explicit_norm and norm not in ('l1', 'l2', None):
        raise ValueError('Parameter "norm" can only take values "l1", "l2" or None')

    if scheme == 'binary':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2), binary=True, stop_words=None)
    elif scheme == 'frequency':
        vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 2), binary=False, decode_error = 'ignore', stop_words='english')
    else:
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 2), stop_words='english', norm = 'l2', decode_error = 'ignore')

    frequency_matrix = vectorizer.fit_transform(sentences).astype(float)

    # get_feature_names() is not available in recent scikit-learn releases; prefer its
    # replacement and fall back to the old accessor on older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        terms = vectorizer.get_feature_names_out().tolist()
    else:
        terms = vectorizer.get_feature_names()

    # Scale the count-based matrix to unit l1/l2 norm.
    # NOTE(review): axis=0 normalizes each column (one column per term) of the sentence-by-term matrix.
    # An earlier comment here described row-wise (axis=1) normalization - confirm which one is intended.
    if uses_explicit_norm and norm is not None:
        frequency_matrix = normalize(frequency_matrix, norm=norm, axis=0)

    return frequency_matrix, terms
    
    
def SVD_cal(raw_text, threshold_value, num_of_sentences,  weighting, norm='l2'):
    '''
    SECTION - 4.2.4 - Singular Value Decomposition

    Build an extractive summary of raw_text: the text is split into sentences, a term-by-sentence
    matrix is decomposed with a truncated SVD, every sentence receives a saliency score and the
    highest scoring sentences are returned in their original document order.

    param - raw_text: the document to summarize.

    param - threshold_value: fraction between 0 and 1 (inclusive). Singular values smaller than
    threshold_value * (largest singular value) are set to zero before scoring.

    param - num_of_sentences: number of sentences to select for the summary (at least 1). If the
    document has fewer sentences, all of them are returned. Note that inside this function the
    name shadows the module-level function num_of_sentences.

    param - weighting: mode of sentence matrix creation, either tf-idf, tf or binary representation
    (forwarded to _compute_matrix).

    param - norm: normalization forwarded to _compute_matrix.

    return - the selected sentences joined with single spaces.

    raises - ValueError if threshold_value is outside [0, 1] or num_of_sentences is smaller than 1.
    Errors raised by the vectorizer or by svds are not handled here and propagate to the caller.
    '''
    # Validate the arguments before any expensive work is done.
    if not 0 <= threshold_value <= 1:
        raise ValueError('Parameter "threshold_value" must take a value between 0 and 1')
    if num_of_sentences < 1:
        # With 0 the slice [-0:] below would silently select every sentence.
        raise ValueError('Parameter "num_of_sentences" must be at least 1')

    token_sentences = tokenize_sentences(raw_text)
    sentence_matrix, _ = _compute_matrix(token_sentences, weighting, norm=norm)
    # After the transpose, rows are terms and columns are sentences.
    sentence_matrix = sentence_matrix.transpose()
    # Keep only the strictly positive entries.
    sentence_matrix = sentence_matrix.multiply(sentence_matrix > 0)
    # NOTE(review): for a text with a single sentence (or a single term) this is 0, which svds is
    # expected to reject - confirm how such documents should be handled.
    num_of_topics = min(sentence_matrix.shape) - 1

    # svds starts its iteration from a random vector. Fix that vector so the output does not vary
    # between runs. A private RandomState seeded with 0 draws the same numbers as
    # np.random.seed(0) followed by np.random.rand(...), without resetting the global NumPy RNG.
    v0 = np.random.RandomState(0).rand(min(sentence_matrix.shape))

    # :param sentence_matrix - Array to compute the SVD on, of shape (M, N)
    # :param k - Number of singular values to compute. Must be smaller than min(sentence_matrix.shape)
    # :param v0 - Starting vector for the iteration, fixed above to obtain constant results
    # :param which - 'LM' selects the largest singular values ('SM' would select the smallest)
    _, s, v = svds(sentence_matrix, k=num_of_topics, v0=v0, which='LM')

    # Pre-processing between the SVD and the sentence selection: every row of v represents one
    # topic (concept). The average score of each row is computed first, and every cell that does
    # not exceed the average of its own row is set to zero.
    topic_averages = v.mean(axis=1)
    v[v <= topic_averages[:, np.newaxis]] = 0

    # Drop weak topics: S_i = 0 iff S_i < threshold_value * max(S).
    sigma_threshold = max(s) * threshold_value
    s[s < sigma_threshold] = 0

    # Saliency of each sentence: square the sentence weights in v and the singular values in s,
    # sum the products over the topics and take the square root.
    saliency_vec = np.sqrt(np.dot(np.square(s), np.square(v)))

    # Take the indices of the num_of_sentences highest scores, then sort the indices themselves
    # so that the sentences are returned in the order in which they appear in the document.
    top_sentences = np.sort(saliency_vec.argsort()[-num_of_sentences:])

    return " ".join(str(token_sentences[i]) for i in top_sentences)
    

def num_of_sentences(text_input, target_word_count=500):
    '''
    SECTION 5.2, EQUATION 5.1
    Function - Defines the number of sentences to be chosen for the document summarization:
    ceil(target_word_count / average number of whitespace-separated words per sentence).

    param - text_input: the document text; it is split into sentences with tokenize_sentences.

    param - target_word_count: word budget that the selected sentences should roughly add up to.
    Defaults to 500, the value that was previously hard-coded.

    return - the number of sentences to select, as an int.

    raises - ValueError if the sentences contain no words at all. When the text yields no
    sentences, statistics.mean raises StatisticsError, which is a subclass of ValueError.
    '''
    tokenized_sentences = tokenize_sentences(text_input)
    avg_words_per_sentence = mean(len(sentence.split()) for sentence in tokenized_sentences)
    if avg_words_per_sentence == 0:
        # Raise ValueError instead of ZeroDivisionError so that callers which skip
        # un-summarizable documents on ValueError can handle this case as well.
        raise ValueError('Cannot derive the number of sentences: the text contains no words')
    return math.ceil(target_word_count/avg_words_per_sentence)
    
    
def summarization_process(Structured_dataset, weighting, threshold_sigma=0.8):
    '''
    Summarize every document of a dataset with SVD_cal and collect the results in a DataFrame.

    param - Structured_dataset: DataFrame providing the columns 'Preprocessed_Text', 'Case_Label',
    'DocID', 'Legal_Details' and 'File_Name'.

    param - weighting: mode of sentence matrix creation, forwarded to SVD_cal.

    param - threshold_sigma: singular value threshold forwarded to SVD_cal as threshold_value.
    Defaults to 0.8, the value that was previously hard-coded.

    return - DataFrame with the columns 'Summarized_content', 'Labels', 'DocID', 'Legal_Details'
    and 'Filename', holding one row per successfully summarized document.

    Documents for which the summarization raises ValueError are skipped; their index labels are
    collected and reported once at the end of the run.
    '''
    output_columns = ['Summarized_content', 'Labels', 'DocID', 'Legal_Details', 'Filename']
    summarized_rows, failed_indices = [], []

    for idx, each_doc in Structured_dataset.iterrows():
        text = each_doc['Preprocessed_Text']
        try:
            sentences_to_select = num_of_sentences(text)
            print('Executing {} file and Number of sentences to select {} out of {}'
                  .format(idx, sentences_to_select, len(tokenize_sentences(text))))
            summary = SVD_cal(text, threshold_sigma, sentences_to_select, weighting)
        except ValueError:
            # Best effort: remember the document and carry on with the next one.
            failed_indices.append(idx)
            continue

        # One record per document keeps all output columns aligned.
        summarized_rows.append({'Summarized_content': summary,
                                'Labels': each_doc['Case_Label'],
                                'DocID': each_doc['DocID'],
                                'Legal_Details': each_doc['Legal_Details'],
                                'Filename': each_doc['File_Name']})

    if failed_indices:
        print('Skipped {} file(s) that could not be summarized: {}'
              .format(len(failed_indices), failed_indices))

    # Passing the column list keeps the expected columns even when no document succeeded.
    return pd.DataFrame(summarized_rows, columns=output_columns)

In [None]:
if __name__=='__main__':
    # Weighting scheme of the sentence matrix: 'binary', 'frequency' or 'tfidf'.
    weighting = 'binary'

    input_csv = 'Thesis - Dataset and Transformations/transform - post legal data extraction/fully_preprocessed_with_legal_entites.csv'
    # The output file is named after the weighting scheme (LSA_binary.csv, LSA_frequency.csv or
    # LSA_tfidf.csv), so changing `weighting` above can no longer overwrite the results of
    # another scheme by accident.
    output_csv = 'Thesis - Dataset and Transformations/transform - post summarization/LSA_{}.csv'.format(weighting)

    Structured_dataset = pd.read_csv(input_csv)
    summarized_documents_df = summarization_process(Structured_dataset, weighting=weighting)
    summarized_documents_df.to_csv(output_csv, index=False, header=True)