In [2]:
#only need once
import nltk
# Fetch the NLTK corpora/models the notebook relies on (stop-word list and
# the Punkt sentence tokenizer). Only needs to run once per environment.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

In [11]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import functools
import os
import re
from gensim.summarization import keywords
import numpy as np
import pandas as pd
import glob
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import codecs

# Notebook-wide NLP resources, built once and shared by the cells below.
# STEMMER reduces English words to their Snowball stem; STOPWORDS is the
# set of English stop words from the NLTK corpus.
STEMMER = SnowballStemmer("english")
STOPWORDS = {*stopwords.words("english")}



In [6]:
# Read every article under the corpus folder into a one-column DataFrame
# (`article_text`), one row per file.
path = './Disruptive Innovation Text/*.txt'
# glob returns paths in arbitrary, OS-dependent order; sorting makes the row
# order (and therefore every downstream index) reproducible across runs.
files = sorted(glob.glob(path))
articles_list = []
# TODO: global-redundancy removal was planned for this loop but is not
# implemented here; the raw text is stored as-is.
for fle in files:
    # Undecodable bytes are dropped (errors='ignore') instead of raising.
    with codecs.open(fle, "r", encoding='utf-8', errors='ignore') as f:
        # Read the whole file at once rather than concatenating line by line.
        article_content = f.read()
        articles_list.append(article_content)
articles_dict = {'article_text': articles_list}
df = pd.DataFrame.from_dict(articles_dict)

In [16]:
# Global-redundancy removal: intended to process each row of df (each row is a string) one by one
# and replace the raw text in the df with the new text (global redundancies removed).
# NOTE(review): `remove_global_redundancies` is not defined in any cell visible here, and the
# bare name below does not call anything -- TODO confirm where it is defined and how it
# should be applied to df['article_text'].
remove_global_redundancies

'\r\nInnovating in an Evolving \r\nWorld: \r\nSlow Change \r\nwith \r\nLong-Term Impact \r\n\r\nHumans are hard-wired to respond to instantaneous \r\nchange: Our fight or flight \r\nresponse evolved to make snap decisions based on \r\nimmediate danger. It is not in \r\nour nature to identify and react to challenges that arise slowly, even those with great \r\nlong-term \r\nimpact on \r\nour lives. \r\n\r\n\r\nBusinesses face a \r\nsimilar problem when dealing with \r\nslow change \r\nin \r\ntheir \r\nindustries. Large-scale societal, economic, and technological trends that emerge \r\ngradually and continuously over time can be \r\nall too easily \r\noverlooked. \r\nThat is \r\nbecause \r\nslow change \r\noften occurs outside \r\nof an organizations line-of-sight and \r\ncan \r\narise from the merger of many disparate \r\ndevelopments, each of which may be \r\nlost in the day-to-day \r\nnoise. \r\n\r\n\r\nEven the \r\nmost forward-thinking companies can \r\nmiss \r\nslow change \r\noccu

In [None]:
# helper function to remove stopwords
def remove_stopwords(sen, stop_words=None):
    """Drop stop words from a tokenized sentence and re-join it.

    Args:
        sen: iterable of word strings (one sentence, already split).
        stop_words: optional container of words to remove. Defaults to the
            notebook-level ``STOPWORDS`` set.

    Returns:
        A single string with the remaining words separated by one space
        (the empty string when nothing is left).
    """
    # The original body referenced an undefined name `stop_words`; the global
    # defined by this notebook is `STOPWORDS`.
    if stop_words is None:
        stop_words = STOPWORDS
    return " ".join(token for token in sen if token not in stop_words)

In [None]:
# Extract word vectors: map each token in the GloVe file to its embedding
# (a float32 numpy array built from the remaining fields on the line).
word_embeddings = {}
# `with` guarantees the file handle is closed even if a line fails to parse.
with open('./gloves.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Tolerate blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

In [None]:
#######################ONLY DO ONCE FOR THE FIRST 12 ARTICLES TO GET THE FIRST FREQ TABLE!##############

#get frequency table from text.
def get_freq_table(txt):
    """Count stemmed, non-stop-word tokens in ``txt``.

    The text is split with ``word_tokenize``; every token is stemmed with
    ``STEMMER`` and skipped when the stem is in ``STOPWORDS``.

    Returns:
        dict mapping each remaining stem to its number of occurrences.
    """
    counts = {}
    for token in word_tokenize(txt):
        stem = STEMMER.stem(token)
        if stem not in STOPWORDS:
            counts[stem] = counts.get(stem, 0) + 1
    return counts

#create giant string which contains all articles (which had global redundancies removed prior)
# Each article is followed by a single space so words of adjacent articles
# do not fuse. (The original loop used `=` instead of `+=`, so only the last
# article ever reached the frequency table.)
overall_string = "".join(s + " " for s in df['article_text'])

#global word frequency table
FREQ_TABLE = get_freq_table(overall_string)

In [None]:
#load freq table from pkl file
def load_freq_table(pkl_dir):
    """Load the pickled word-frequency table.

    Args:
        pkl_dir: path of the pickle file holding the table.
            NOTE(review): the name says "dir" but it is treated as the path
            of the file itself -- confirm against callers.

    Returns:
        dict mapping stemmed word -> count.

    Raises:
        FileNotFoundError: if ``pkl_dir`` does not exist.
        TypeError: if the pickle does not contain a dict.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import pickle

    # SECURITY: unpickling executes arbitrary code; only load files this
    # notebook wrote itself, never files from an untrusted source.
    with open(pkl_dir, "rb") as handle:
        freq_table = pickle.load(handle)

    if not isinstance(freq_table, dict):
        raise TypeError(
            "expected a dict frequency table in %r, got %s"
            % (pkl_dir, type(freq_table).__name__)
        )
    return freq_table

In [None]:
def update_freq_table(pkl_dir, txt):
    """Add the word counts of ``txt`` to the pickled frequency table.

    Pre-condition: ``pkl_dir`` is the location of the frequency-table pickle
    and ``txt`` is a text with global redundancies already removed.

    Every token of ``txt`` is stemmed; stems that are stop words are skipped
    and all others have their count incremented. The updated table is written
    back to ``pkl_dir`` and also returned.

    Args:
        pkl_dir: path of the frequency-table pickle (read, then overwritten).
        txt: text whose words should be added to the table.

    Returns:
        The updated dict mapping stemmed word -> count.
    """
    # Local import so this cell works without touching the notebook's import cell.
    import pickle

    freq_table = load_freq_table(pkl_dir)

    for word in word_tokenize(txt):
        stemmed_word = STEMMER.stem(word)
        if stemmed_word in STOPWORDS:
            continue
        # Look up the *stemmed* key. The original tested the raw word, which
        # either raised KeyError or reset an existing stem's count to 1.
        freq_table[stemmed_word] = freq_table.get(stemmed_word, 0) + 1

    # Persist the table itself (the original draft passed the token list).
    with open(pkl_dir, "wb") as handle:
        pickle.dump(freq_table, handle)

    return freq_table

In [None]:
#pre-cond: txt is a text with global redundancies removed. pkl_dir is the directory of the freq_table pickle.

def summarize_alg(txt, pkl_dir, max_chars=500):
    """Build an extractive summary of ``txt``.

    Each sentence gets two scores that are multiplied together:

    1. a PageRank score over a graph whose edge weights are the cosine
       similarities between averaged word-embedding vectors of the sentences;
    2. the summed corpus frequency of the sentence's stemmed words, divided
       by the character length of the cleaned sentence and scaled by the
       maximum over all sentences.

    Sentences are taken in decreasing combined score until the next one would
    push the summary past ``max_chars`` characters, then returned in their
    original order.

    Args:
        txt: text with global redundancies removed.
        pkl_dir: location of the frequency-table pickle, passed to
            ``load_freq_table``.
        max_chars: maximum length of the returned summary, counting the
            single spaces that join sentences. Defaults to 500.

    Returns:
        The selected sentences joined by single spaces; the empty string when
        ``txt`` has no sentences or the best sentence alone exceeds
        ``max_chars``.
    """
    # Use the tokenizer imported at the top of the notebook; the `nltk`
    # module itself is only imported in the optional "run once" cell.
    sentences = sent_tokenize(txt)  # list of strings, one per sentence
    if not sentences:
        return ""

    #-------------clean sentences (non-global redundants)---------------------
    # Remove punctuation, numbers and special characters, collapse runs of
    # spaces, then lowercase. `re` is used so the patterns are always treated
    # as regular expressions regardless of the pandas version.
    stripped = [
        re.sub(" +", " ", re.sub("[^a-zA-Z -]", "", s)).lower()
        for s in sentences
    ]
    # remove stopwords from the sentences
    clean_sentences = [remove_stopwords(s.split()) for s in stripped]

    #-------------------------1st algo------------------------
    # One 100-dimensional vector per sentence: the (smoothed) mean of its
    # word embeddings; unknown words and empty sentences contribute zeros.
    sentence_vectors = []
    for cleaned in clean_sentences:
        words = cleaned.split()
        if words:
            v = sum(word_embeddings.get(w, np.zeros((100,))) for w in words) / (len(words) + 0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)

    # Pairwise similarity in one vectorized call; the diagonal is cleared so
    # a sentence is never linked to itself.
    sim_mat = cosine_similarity(np.vstack(sentence_vectors)).astype(float)
    np.fill_diagonal(sim_mat, 0.0)

    # graphs
    # networkx 3.0 removed `from_numpy_matrix` and `pagerank_numpy`; prefer
    # whichever spelling the installed version provides.
    build_graph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
    pagerank = getattr(nx, "pagerank_numpy", None) or nx.pagerank
    nx_graph = build_graph(sim_mat)
    first_algo_scores = pagerank(nx_graph, alpha=0.85, personalization=None, weight='weight', dangling=None)

    # squash every PageRank score with arctan(v)/pi + 1/2
    first_algo_scores = {k: (np.arctan(v)/np.pi + 1/2) for k, v in first_algo_scores.items()}

    #--------------------2nd algo-----------------------------------------------
    freq_table = load_freq_table(pkl_dir)

    second_algo_scores = dict()
    for i, sentence in enumerate(clean_sentences):
        # Only words present in freq_table contribute. Starting from 0 means
        # a sentence with no known words scores 0 instead of raising KeyError.
        total = 0
        for word in word_tokenize(sentence):
            total += freq_table.get(STEMMER.stem(word), 0)
        # Divide by the (character) length of the cleaned sentence; an empty
        # cleaned sentence scores 0 rather than dividing by zero.
        second_algo_scores[i] = total / len(sentence) if sentence else 0.0

    # normalize across second_algo_scores such that every score runs from [0,1]
    max_score = max(second_algo_scores.values())
    if max_score > 0:
        second_algo_scores = {k: v/max_score for k, v in second_algo_scores.items()}

    #----------------------combine scores----------------------------
    combined_scores = {i: first_algo_scores[i]*second_algo_scores[i] for i in range(len(sentences))}

    #----------------------extract the sentences we need-----------------------------------
    # list of tuples (index, score), highest score first. `key` must be
    # passed by keyword in Python 3.
    scores_sorted = sorted(combined_scores.items(), key=lambda x: -x[1])

    extracted_sentences = list()
    character_count = 0
    for (index, score) in scores_sorted:
        candidate = sentences[index]
        # Every sentence after the first costs one extra character for the
        # joining space; count it *before* deciding so the limit is exact.
        separator = 1 if extracted_sentences else 0
        if character_count + separator + len(candidate) > max_chars:
            break
        extracted_sentences.append((index, candidate))
        character_count += separator + len(candidate)

    #-----------return the sentences in the order they appeared as a single string--------
    # str.join also handles the case where nothing was selected.
    final_text = " ".join(sentence for _, sentence in sorted(extracted_sentences))

    return final_text
    