In [2]:
#only need to do once
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import functools
import os
import re
from gensim.summarization import keywords

#global variable
STOPWORDS = set(stopwords.words("english"))
STEMMER = SnowballStemmer("english")

In [6]:
len(STOPWORDS)

179

In [9]:
#pre-con: txtfile is string of textfile directory name.
#post-con: single string of raw text.
def read_txt(txtfile):
    with open(txtfile, 'r') as myfile:
        text=myfile.read()
    myfile.close()
    return text

In [4]:
a = read_txt("Disruptive Innovation Text\\1- Innovating in an Evolving World.txt")
a

'\nInnovating in an Evolving \nWorld: \nSlow Change \nwith \nLong-Term Impact \n\nHumans are hard-wired to respond to instantaneous \nchange: Our ‘fight or flight’ \nresponse evolved to make snap decisions based on \nimmediate danger. It is not in \nour nature to identify and react to challenges that arise slowly, even those with great \nlong-term \nimpact on \nour lives. \n\n\nBusinesses face a \nsimilar problem when dealing with \n‘slow change’ \nin \ntheir \nindustries. Large-scale societal, economic, and technological trends that emerge \ngradually and continuously over time can be \nall too easily \noverlooked. \nThat is \nbecause \n‘slow change’ \noften occurs outside \nof an organization’s line-of-sight and \ncan \narise from the merger of many disparate \ndevelopments, each of which may be \nlost in the day-to-day \n‘noise.’ \n\n\nEven the \nmost forward-thinking companies can \nmiss \nslow change \noccurring around \nthem: \nIn 2007, Microsoft CEO Steve Ballmer said, “There \n

In [10]:
#pre-con: single string of raw text.
#post-con: dictionary of keywords
def find_keywords(txt):
    alpha_num = re.sub(" +", " ", re.sub("[^a-zA-Z -]", "", txt.replace("\n"," ")).strip())
    stemmer_map = list(map(lambda x: STEMMER.stem(x),word_tokenize(alpha_num)))
    keyword = keywords(functools.reduce(lambda x, y: x+" "+y, stemmer_map)).split("\n")
    
    return set(keyword)

In [25]:
a = find_keywords(txt1)
"busi" in a
print(a)
max(a)

{'decis', 'slow chang', 'microsoft', 'continu', 'steadi', 'capit', 'impact', 'transform', 'dispar', 'compani', 'large societ', 'busi', 'market', 'technolog'}


'transform'

In [11]:
#pre-con: txt is a single string of raw text with \n removed.
#post-con: returns a dictionary where each key is a stemmed word,
#          value is the total number of occurences of that particular word.


#OUR CURRENT SCORING IS BASED ON FREQUENCY OF WORDS BUT CAN BE CHANGED!
def get_score_table(txt):
    words = word_tokenize(txt) #list of strings where each string is a word.
    score_table = dict()
    
    for word in words:
        word = STEMMER.stem(word)
        if word in STOPWORDS:
            continue
        if word in score_table:
            score_table[word] += 1
        else:
            score_table[word] = 1
    
    keyword_dict = find_keywords(txt)
    max_score = max(score_table.values())
    
    for key, value in score_table.items():
        if key in keyword_dict:
            score_table[key] = max_score + value
       
    return score_table

In [12]:
#pre-con: txt is a single string of raw text with \n removed.
#post-con: list of tuples, each inner tuple contains 3 values: 1) the sentence,
#          2) index of sentence (based on the order they appeared in the text), 3) score of sentence
def first_alg(txt):
    score_table = get_score_table(txt)
    sentences = sent_tokenize(txt) #list of strings where each string is a sentence.
    sentence_score = dict()
    
    for i in range(len(sentences)):
        sentence = sentences[i]
        words_in_sentence = word_tokenize(sentence)
        for word in words_in_sentence:
            if STEMMER.stem(word) in score_table: #we only consider words which appear in score_table.
                score_of_word = score_table[STEMMER.stem(word)]
                if sentence in sentence_score:
                    sentence_score[sentence][1] += score_of_word
                else:
                    sentence_score[sentence] = [i, score_of_word]
        
    for (key,value) in sentence_score.items(): #normalize sentence scores
        sentence_score[key][1] /= len(key)
        
    return list(map(lambda x: (x[1][0], x[0], x[1][1]), list(sentence_score.items())))

In [13]:
#pre-con: alg_output is a list of tuples, each inner tuple contains 3 values: 1) the sentence,
#          2) index of sentence (based on the order they appeared in the text), 3) score of sentence
#post-con: the final text which satisfies the condition of less than 500 characters.

def extract_from_alg_output(alg_output):
    alg_output_sorted = sorted(alg_output, key = lambda x: -x[2])
    
    extracted_alg_output = list()
    character_count = 0
    for elem in alg_output_sorted:
        if character_count + len(elem[1]) > 500:
            break
        else:
            extracted_alg_output.append(elem)
            character_count += len(elem[1])

    final_text = functools.reduce(lambda x,y : x+y, map(lambda x: x[1], sorted(extracted_alg_output)))
    
    return final_text  

In [14]:
txt1 = read_txt("Disruptive Innovation Text\\1- Innovating in an Evolving World.txt").replace("\n", "")

first_alg_output = first_alg(txt1)
output1 = extract_from_alg_output(first_alg_output)

output1

'Businesses face a similar problem when dealing with ‘slow change’ in their industries.By seeking out such patterns in our work at Citi, we have identified three types of change that often cause the biggest impact: (1) Behavioral changes, (2) Technological changes, and (3) Industry or sector changes.Such is the case with three trends we are currently exploring: (1) changing social structures, (2) the changing nature of transactions, and (3) the changing nature of industries.'

In [12]:
output_lst = list()

for txt in os.listdir("Disruptive Innovation Text"):
    output_lst.append(
        extract_from_alg_output(first_alg(read_txt("Disruptive Innovation Text\\"+txt).replace("\n", ""))))

In [19]:
lst = [1,10,11,12,2,3,4,5,6,7,8,9]

i = 0
for txt in output_lst:
    print(str(lst[i])+")\n"+txt+"\n")
    i+=1

1)
Businesses face a similar problem when dealing with ‘slow change’ in their industries.By seeking out such patterns in our work at Citi, we have identified three types of change that often cause the biggest impact: (1) Behavioral changes, (2) Technological changes, and (3) Industry or sector changes.Such is the case with three trends we are currently exploring: (1) changing social structures, (2) the changing nature of transactions, and (3) the changing nature of industries.

10)
9.In May of this year, the peak of the selling season, the median amount of time homes had been listed for sale was 55 days.Most of this democratization, however, has been driven by broker advertising revenue.U.S. Home Sales Continue to Rise, Figure 38.Construction Lagged in Recovery, Figure 39.Approx.Commissions on U.S.For buyers, the vacant state of the home allows for flexible move-in dates.First, a significant amount of capital is required to purchase, carry, repair, and sell homes.

11)
10.21 Gartner In

In [38]:
output_lst = list()

for txt in os.listdir("Disruptive Innovation Text"):
    output_lst.append(
        extract_from_alg_output(first_alg(read_txt("Disruptive Innovation Text\\"+txt).replace("\n", ""))))

lst = [1,10,11,12,2,3,4,5,6,7,8,9]

i = 0
for txt in output_lst:
    print(str(lst[i])+")\n"+txt+"\n")
    i+=1

1)
Businesses face a similar problem when dealing with ‘slow change’ in their industries.By seeking out such patterns in our work at Citi, we have identified three types of change that often cause the biggest impact: (1) Behavioral changes, (2) Technological changes, and (3) Industry or sector changes.Such is the case with three trends we are currently exploring: (1) changing social structures, (2) the changing nature of transactions, and (3) the changing nature of industries.

10)
9.Stable, slowly rising home prices benefit these strategies because there is reasonable confidence asset prices won’t fall.U.S. Home Sales Continue to Rise, Figure 38.Construction Lagged in Recovery, Figure 39.Approx.Commissions on U.S.Additionally, many iBuyer firms offer trial periods and warranties on core infrastructure that offer peace of mind.First, a significant amount of capital is required to purchase, carry, repair, and sell homes.

11)
10.Figure 45.Global Smart Speaker Shipments Expected to Grow 