In [1]:
import re
import pickle
import networkx
import numpy as np
from utils import build_feature_matrix, low_rank_svd
from gensim.summarization import summarize, keywords
from nltk.tokenize import sent_tokenize, word_tokenize

# Locations of the two pickled inputs read with load_info(): the "no" file feeds
# the sections labelled 'Negative' in the output and the "yes" file feeds the
# sections labelled 'Positive'.
# NOTE(review): these are absolute paths on one specific machine; point them at
# your own copy of the data before running the notebook.
path_to_no_sentences = '/home/ricardo/Escritorio/Machine-Learning-Algorithms-master/Summarization/no_sentences'
path_to_yes_sentences = '/home/ricardo/Escritorio/Machine-Learning-Algorithms-master/Summarization/yes_sentences'
# Aspect terms (Spanish) looked up as substrings in the sentences; one
# '<aspect>.txt' report is written per term.
aspects = ('disco', 'canción', 'grupo', 'voz', 'músico', 'guitarra', 'sonido')

In [2]:
# Function to load data
def load_info(path):
    """Load and return the object pickled in the file at ``path``.

    Args:
        path: filesystem path of a pickle file; it is opened in binary mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: if the file cannot be opened.
        Exception: anything ``pickle.load`` raises for empty or corrupt data.
    """
    # SECURITY: unpickling can execute arbitrary code, so only load files that
    # come from a trusted source.
    # The context manager closes the file even when pickle.load raises.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [3]:
# This function takes a word(aspect) and searches it in sentences
def search_aspect(sentences, aspect):
    """Return the sentences that contain ``aspect``.

    The check is the plain ``in`` operator, so for string sentences it is a
    case-sensitive substring match (``'disco'`` also matches ``'discos'``).

    Args:
        sentences: iterable of sentences to scan.
        aspect: the term to look for.

    Returns:
        A new list with the matching sentences, in their original order.
    """
    return [candidate for candidate in sentences if aspect in candidate]

In [4]:
# Function from Dipanjan Sarkar - Text Analytics with Python
from gensim.summarization import summarize, keywords
def text_summarization_gensim(text, summary_ratio=0.5):
    """Summarise ``text`` with gensim's ``summarize``.

    Args:
        text: the text to summarise.
        summary_ratio: forwarded to ``summarize`` as its ``ratio`` argument.

    Returns:
        The value returned by ``summarize(..., split=True)`` -- presumably a
        list of the selected sentences (callers join it with spaces).
    """
    return summarize(text, ratio=summary_ratio, split=True)

In [5]:
def parse_document(document):
    """Split a document into a list of stripped sentences.

    Newlines are replaced by spaces and the text is stripped before it is
    handed to ``sent_tokenize``; each resulting sentence is stripped as well.

    Args:
        document: the text to split; must be a ``str``.

    Returns:
        A list of sentence strings.

    Raises:
        ValueError: if ``document`` is not a string.
    """
    # Validate before touching the value, so a wrong type surfaces as the
    # documented ValueError instead of a TypeError from the regex engine.
    if not isinstance(document, str):
        raise ValueError('Document is not string or unicode!')

    document = re.sub('\n', ' ', document).strip()

    return [sentence.strip() for sentence in sent_tokenize(document)]

In [9]:
# Load each pickled corpus and split it into sentences. The generator is
# consumed in order, so the "no" corpus is loaded and tokenised first, then
# the "yes" corpus.
no_sentences, yes_sentences = (
    sent_tokenize(load_info(corpus_path))
    for corpus_path in (path_to_no_sentences, path_to_yes_sentences)
)

# num_sentences: how many sentences the LSA and TextRank summaries keep.
# num_topics: how many singular values are requested for the LSA step.
num_sentences = num_topics = 3

def _lsa_summary(sentences, num_sentences, num_topics, sv_threshold=0.5):
    """Select the most salient sentences using an SVD-based (LSA) ranking.

    Args:
        sentences: list of sentence strings to rank.
        num_sentences: maximum number of sentences to keep.
        num_topics: passed to ``low_rank_svd`` as ``singular_count``.
        sv_threshold: singular values smaller than this fraction of the
            largest one are zeroed before the salience scores are computed.

    Returns:
        The selected sentences, in their original order, joined by single
        spaces ('' when ``sentences`` is empty).
    """
    if not sentences:
        return ''

    _, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')
    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)

    # NOTE(review): low_rank_svd lives in utils; it may reject inputs with
    # fewer sentences/terms than `num_topics` -- confirm against its source.
    _, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)

    # Drop the weak singular values so they do not contribute to the scores.
    s[s < max(s) * sv_threshold] = 0

    # One score per sentence (td_matrix has one column per sentence).
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))

    # Keep the best-scoring sentences but emit them in document order.
    top_indices = sorted(salience_scores.argsort()[-num_sentences:])

    return ' '.join(sentences[index] for index in top_indices)


def _textrank_summary(sentences, num_sentences):
    """Select the top sentences by PageRank over a TF-IDF similarity graph.

    Args:
        sentences: list of sentence strings to rank.
        num_sentences: maximum number of sentences to keep.

    Returns:
        The selected sentences, in their original order, joined by single
        spaces ('' when ``sentences`` is empty).
    """
    if not sentences:
        return ''

    _, dt_matrix = build_feature_matrix(sentences, feature_type='tfidf')

    similarity_matrix = dt_matrix * dt_matrix.T
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)

    scores = networkx.pagerank(similarity_graph)

    ranked_sentences = sorted(((score, index) for index, score in scores.items()), reverse=True)
    # Slicing (rather than indexing up to num_sentences) copes with inputs
    # that contain fewer sentences than requested.
    top_indices = sorted(index for _, index in ranked_sentences[:num_sentences])

    return ' '.join(sentences[index] for index in top_indices)


# For every aspect: gather the positive ("yes") and negative ("no") sentences
# that mention it, summarise each set with three methods and write the results
# to '<aspect>.txt' in the current working directory.
for aspect in aspects:
    yes_text = ' '.join(search_aspect(yes_sentences, aspect))
    no_text = ' '.join(search_aspect(no_sentences, aspect))

    # Gensim
    gensim_yes = ' '.join(text_summarization_gensim(yes_text, summary_ratio=0.2))
    gensim_no = ' '.join(text_summarization_gensim(no_text, summary_ratio=0.2))

    # LSA and TextRank rank individual sentences, so re-split the text of the
    # *current* aspect (not a fixed one) into a clean sentence list.
    hit_yes_sentences = parse_document(yes_text)
    hit_no_sentences = parse_document(no_text)

    # LSA
    lsa_yes = _lsa_summary(hit_yes_sentences, num_sentences, num_topics)
    lsa_no = _lsa_summary(hit_no_sentences, num_sentences, num_topics)

    # TextRank
    tr_yes = _textrank_summary(hit_yes_sentences, num_sentences)
    tr_no = _textrank_summary(hit_no_sentences, num_sentences)

    report_sections = (
        ('Gensim', gensim_yes, gensim_no),
        ('LSA', lsa_yes, lsa_no),
        ('TextRank', tr_yes, tr_no),
    )

    # The context manager guarantees the report is flushed and closed even if
    # one of the writes fails.
    with open(aspect + '.txt', 'w') as report:
        for title, positive, negative in report_sections:
            report.write(title + '\n')
            report.write('-------------------------------------------------\n')
            report.write('Positive\n')
            report.write(positive + '\n')
            report.write('Negative\n')
            report.write(negative + '\n')