In [18]:
#only need once
import nltk
#nltk.download('stopwords')
#nltk.download('punkt')

In [1]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import functools
import os
import re
from gensim.summarization import keywords
import numpy as np
import pandas as pd
import glob
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import codecs
import pickle

#global variable
STOPWORDS = set(stopwords.words("english"))
STEMMER = SnowballStemmer("english")



In [2]:
def read_txt(txtfile):
    with open(txtfile, 'r') as myfile:
        text=myfile.read()
    myfile.close()
    return text

#pre-con: txt is a raw text with \n
#post-con: returns text with global redundant words removed
def remove_global_redundancies(txt):
    txt = re.sub("Figure [0-9]+\..*?Source:.*?\n", "", txt, re.MULTILINE, re.DOTALL)
    txt = re.sub("Figure [0-9]+\..*|figure [0-9]+\..*|Source:.*|source:.*", "", txt)
    txt = re.sub("[0-9]+, Citi Research", "", txt)
    txt = re.sub(".*[0-9]+.*?\nCitigroup.*", "", txt)
    txt = re.sub(".*Citi GPS.*", "", txt)
    txt = re.sub("Citi|Citibank|Citigroup|citi|citibank|citigroup", "", txt)
    #commented code removes "hello. hello. bye?"
    #want to remove "hello \n there?"
    #txt = re.sub("[\.\?!](.*?\?)", "", txt, re.MULTILINE, re.DOTALL)
    #naive solution
    txt = re.sub(".*\?", "", txt)
    txt = re.sub("!", ".", txt)
    txt = re.sub("\r|\n", " ", txt)
    
    #Remove all letters that are not from ASCII 0-127
    #However, would remove things like trademark and registered
    txt = re.sub("[^\x00-\x7f]", "", txt)
    
    txt = re.sub(" +", " ", txt)
    txt = txt.strip()
    
    return txt

In [62]:
path = './Actual - Bank of the Future Text/*.txt' 
files = glob.glob(path)
# iterate over the list getting each file 
articles_list = []

#inside for loop we remove the global redundants
for fle in files:
    with codecs.open(fle, "r",encoding='utf-8', errors='ignore') as f:
        article_content = ''
        for line in f:
            article_content += line
        article_content = remove_global_redundancies(article_content)
        articles_list.append(article_content)
        
        
articles_dict = {'article_text': articles_list}
df = pd.DataFrame.from_dict(articles_dict)
 

In [71]:
output_lst = list()

#rudimentary way to check if is txt file
for txt in os.listdir("Disruptive Innovation Text"):
    if re.match(".*\.txt$", txt) != None:
        output_lst.append(
            remove_global_redundancies(read_txt("Disruptive Innovation Text/" + txt)))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 1164: invalid start byte

In [4]:
# helper function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in STOPWORDS])
    return sen_new

In [61]:
# Extract word vectors
word_embeddings = {}
f = open('..\\Glove\\glove.6B.100d.txt', encoding='utf-8')
#f = open('./glove/glove.6B.100d.txt', encoding='utf-8') for Mac
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [65]:
#######################ONLY DO ONCE FOR THE FIRST 12 ARTICLES TO GET THE FIRST FREQ TABLE!##############

#get frequency table from text.
def get_freq_table(txt):
    words = word_tokenize(txt) #list of strings where each string is a word.
    freq_table = dict()
    
    for word in words:
        word = STEMMER.stem(word)
        if word in STOPWORDS:
            continue
        if word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
       
    return freq_table

#create giant string which contains all articles (which had global redundancies removed prior)
overall_string = ""
for s in df['article_text']:
    overall_string += s + " "

#global word frequency table
FREQ_TABLE = get_freq_table(overall_string)

#save global word frequency table to a pickle file
pickle_out = open("FREQ_TABLE.pickle","wb")
pickle.dump(FREQ_TABLE, pickle_out)
pickle_out.close()

In [11]:
print(FREQ_TABLE)

{'innov': 37, 'evolv': 5, 'world': 22, ':': 147, 'slow': 9, 'chang': 51, 'long-term': 9, 'impact': 20, 'human': 23, 'hard-wir': 1, 'respond': 8, 'instantan': 1, 'fight': 2, 'flight': 1, 'respons': 6, 'make': 28, 'snap': 1, 'decis': 12, 'base': 42, 'immedi': 1, 'danger': 1, '.': 1064, 'natur': 30, 'identifi': 7, 'react': 2, 'challeng': 34, 'aris': 2, 'slowli': 6, ',': 1600, 'even': 32, 'great': 6, 'live': 8, 'busi': 37, 'face': 7, 'similar': 25, 'problem': 10, 'deal': 6, 'industri': 98, 'large-scal': 2, 'societ': 3, 'econom': 24, 'technolog': 107, 'trend': 14, 'emerg': 28, 'gradual': 1, 'continu': 9, 'time': 60, 'easili': 5, 'overlook': 1, 'becaus': 21, 'often': 20, 'occur': 9, 'outsid': 6, 'organ': 4, 'line-of-sight': 1, 'merger': 1, 'mani': 39, 'dispar': 1, 'develop': 68, 'may': 70, 'lost': 2, 'day-to-day': 1, 'nois': 1, 'forward-think': 1, 'compani': 82, 'miss': 5, 'around': 25, '2007': 2, 'microsoft': 6, 'ceo': 8, 'steve': 1, 'ballmer': 1, 'said': 8, 'chanc': 1, 'iphon': 1, 'go': 19

In [13]:
#load freq table from pkl file
def load_freq_table(pkl_dir): #wishful thinking
    pickle_off = open(pkl_dir,"rb")
    unpickled_file = pickle.load(pickle_off)
    return unpickled_file


In [14]:
load_freq_table('FREQ_TABLE.pickle')

{'innov': 37,
 'evolv': 5,
 'world': 22,
 ':': 147,
 'slow': 9,
 'chang': 51,
 'long-term': 9,
 'impact': 20,
 'human': 23,
 'hard-wir': 1,
 'respond': 8,
 'instantan': 1,
 'fight': 2,
 'flight': 1,
 'respons': 6,
 'make': 28,
 'snap': 1,
 'decis': 12,
 'base': 42,
 'immedi': 1,
 'danger': 1,
 '.': 1064,
 'natur': 30,
 'identifi': 7,
 'react': 2,
 'challeng': 34,
 'aris': 2,
 'slowli': 6,
 ',': 1600,
 'even': 32,
 'great': 6,
 'live': 8,
 'busi': 37,
 'face': 7,
 'similar': 25,
 'problem': 10,
 'deal': 6,
 'industri': 98,
 'large-scal': 2,
 'societ': 3,
 'econom': 24,
 'technolog': 107,
 'trend': 14,
 'emerg': 28,
 'gradual': 1,
 'continu': 9,
 'time': 60,
 'easili': 5,
 'overlook': 1,
 'becaus': 21,
 'often': 20,
 'occur': 9,
 'outsid': 6,
 'organ': 4,
 'line-of-sight': 1,
 'merger': 1,
 'mani': 39,
 'dispar': 1,
 'develop': 68,
 'may': 70,
 'lost': 2,
 'day-to-day': 1,
 'nois': 1,
 'forward-think': 1,
 'compani': 82,
 'miss': 5,
 'around': 25,
 '2007': 2,
 'microsoft': 6,
 'ceo': 8,


In [None]:
#update freq table pkl file
#pre-con: pkl_dir is directory of pkl file, txt is a text file with global redundancies removed, whereby
#         we update the freq table counts by the words in txt
def update_freq_table(pkl_dir, txt):
    #read pkl_dir wishful thinking
    freq_table = load_freq_table(pkl_dir)
    #list of words
    words_in_txt = word_tokenize(txt)
    
    for word in words_in_txt:
        stemmed_word = STEMMER.stem(word)
        if stemmed_word in STOPWORDS:
            continue
        if word in freq_table:
            freq_table[stemmed_word] += 1
        else:
            freq_table[stemmed_word] = 1
       
    #write_freq_table wishful thinking
    #write_freq_table(pkl_dir, words_in_txt)

In [53]:
#pre-cond: txt is a text with global redundancies removed. pkl_dir is the directory of the freq_table pickle.

def summarize_alg(txt, pkl_dir):
    sentences = nltk.sent_tokenize(txt) #list of strings where each string is a sentence
    
    #-------------clean sentences (non-global redundants)---------------------
    
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z -]", "").str.replace(" +", " ")
    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]
    # remove stopwords from the sentences
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
    
    #-------------------------1st algo------------------------
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
        
    # similarity matrix initialization
    sim_mat = np.zeros([len(sentences), len(sentences)])
    
    # updating matrix coeffs
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100),
                                                  sentence_vectors[j].reshape(1,100))[0,0]
    
    # graphs
    nx_graph = nx.from_numpy_matrix(sim_mat)
    first_algo_scores = nx.pagerank_numpy(nx_graph, alpha=0.85, personalization=None, weight='weight', dangling=None)
    
    #normalize across first_algo_scores such that every score runs from [0,1].
    first_algo_scores = {k: (np.arctan(v)/np.pi + 1/2) for k, v in first_algo_scores.items()}
    
    #--------------------2nd algo-----------------------------------------------
    #load freq table
    freq_table = load_freq_table(pkl_dir) #wishful thinking
    
    second_algo_scores = dict()
    
    for i in range(len(clean_sentences)):
        sentence = clean_sentences[i]
        words_in_sentence = word_tokenize(sentence)
        second_algo_scores[i] = 0
        for word in words_in_sentence:
            if STEMMER.stem(word) in freq_table: #we only consider words which appear in freq_table.
                score_of_word = freq_table[STEMMER.stem(word)]
                if i in second_algo_scores.keys():
                    second_algo_scores[i] += score_of_word
                else:
                    second_algo_scores[i] = score_of_word
        
        #divide by length of sentence
        if len(sentence)!=0:
            second_algo_scores[i]/len(sentence)
        
    #normalize across second_algo_scores such that every score runs from [0,1].
    max_score = max(second_algo_scores.values())
    second_algo_scores = {k: v/max_score for k, v in second_algo_scores.items()}
    
    #----------------------combine scores----------------------------
    combined_scores = {i: first_algo_scores[i]*second_algo_scores[i] for i in range(len(sentences))}
    
    #----------------------extract the sentences we need-----------------------------------
    #list of tuples (index, score) sorted by score
    scores_sorted = list(sorted(combined_scores.items(), key=lambda x:-x[1]))
    
    extracted_sentences = list()
    #first article no trailing " ". so initialize to -1
    character_count = -1
    
    #################
    for (index, score) in scores_sorted:
        if len(sentences[index])>=500:
            continue
        elif character_count + len(sentences[index]) > 500:
            break
        else:
            extracted_sentences.append((index, sentences[index]))
            character_count += len(sentences[index])+1 #+1 to take into account the trailing " "

    #-----------return the sentences in the order they appeared as a single string--------
    if len(extracted_sentences) == 0:
        return "0 length"
    
    final_text = functools.reduce(lambda x,y : x+" "+y, map(lambda x: x[1], sorted(extracted_sentences)))
    
    return final_text
    

In [55]:
#pre-cond: txt is a text with global redundancies removed. pkl_dir is the directory of the freq_table pickle.

def summarize_alg(txt, pkl_dir):
    sentences = nltk.sent_tokenize(txt) #list of strings where each string is a sentence
    
    #-------------clean sentences (non-global redundants)---------------------
    
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z -]", "").str.replace(" +", " ")
    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]
    # remove stopwords from the sentences
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
    
    #-------------------------1st algo------------------------
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
        
    # similarity matrix initialization
    sim_mat = np.zeros([len(sentences), len(sentences)])
    
    # updating matrix coeffs
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100),
                                                  sentence_vectors[j].reshape(1,100))[0,0]
    
    # graphs
    nx_graph = nx.from_numpy_matrix(sim_mat)
    first_algo_scores = nx.pagerank_numpy(nx_graph, alpha=0.85, personalization=None, weight='weight', dangling=None)
    
    #normalize across first_algo_scores such that every score runs from [0,1].
    first_algo_scores = {k: (np.arctan(v)/np.pi + 1/2) for k, v in first_algo_scores.items()}
    
    #--------------------2nd algo-----------------------------------------------
    #load freq table
    freq_table = load_freq_table(pkl_dir) #wishful thinking
    
    second_algo_scores = dict()
    
    for i in range(len(clean_sentences)):
        sentence = clean_sentences[i]
        words_in_sentence = word_tokenize(sentence)
        second_algo_scores[i] = 0
        for word in words_in_sentence:
            if STEMMER.stem(word) in freq_table: #we only consider words which appear in freq_table.
                score_of_word = freq_table[STEMMER.stem(word)]
                if i in second_algo_scores.keys():
                    second_algo_scores[i] += score_of_word
                else:
                    second_algo_scores[i] = score_of_word
        
        #divide by length of sentence
        if len(sentence)!=0:
            second_algo_scores[i]/len(sentence)
        
    #normalize across second_algo_scores such that every score runs from [0,1].
    max_score = max(second_algo_scores.values())
    second_algo_scores = {k: v/max_score for k, v in second_algo_scores.items()}
    
    #----------------------combine scores----------------------------
    combined_scores = {i: first_algo_scores[i]*second_algo_scores[i] for i in range(len(sentences))}
    
    #----------------------extract the sentences we need-----------------------------------
    #list of tuples (index, score) sorted by score
    scores_sorted = list(sorted(combined_scores.items(), key=lambda x:-x[1]))
    
    extracted_sentences = list()
    #first article no trailing " ". so initialize to -1
    sentence_count = 0
    
    #################
    for (index, score) in scores_sorted:
        if len(sentences[index])>=500:
            continue
        else:
            if sentence_count ==3:
                break
            extracted_sentences.append((index, sentences[index]))
            sentence_count += 1 #+1 to take into account the trailing " "

    #-----------return the sentences in the order they appeared as a single string--------
    if len(extracted_sentences) == 0:
        return "0 length"
    
    final_text = functools.reduce(lambda x,y : x+" "+y, map(lambda x: x[1], sorted(extracted_sentences)))
    
    return final_text
    

In [66]:
print(summarize_alg(df.iloc[0,0],'FREQ_TABLE.pickle'))

Among other factors, the banks strategic game plan will have to include the greater use of artificial intelligence (AI) and automation and the related overhaul of Core Banking systems and increased adoption of Cloud-based services. Banks are exploring AI uses in consumer and wholesale banking with the help of robotics (automation of routine tasks), analytics (big data mining), chat bots (digital dialogue with customers), and cognitive (changing rules and adapting). Outside the tech sector itself, financial services are one of the leading early adopters of AI in terms of spending, and in our chapter on artificial intelligence, we do a deep dive into some of the use cases of AI in banking and finance today.


In [68]:
print(summarize_alg(df.iloc[1,0],'FREQ_TABLE.pickle'))

Advances in computing power, data volume, and connectivity are core components of the industrialization of AI, and together they are leading an explosion in AI applications, including in financial services. 2018 group  March 2018 Industrialization of AI Spending and Investing More Banking & Securities Is the Largest Non-Tech Industry for AI Outside the tech sector itself, financial services is one of the leading early adopters of AI. 2013 2014 2015 2016 2017 2013 2014 2015 2016 2017 2018 group  22 22 AI in the U.S. is dominated by tech companies such as Amazon (predicting and analyzing customer shopping patterns); Google (strong AI push, acquired 50+ AI startups in 2015-16); IBM (high-profile Watson AI service); Apple (acquired four AI startups in 2015-16 for its digital assistants, facial/voice recognition); and Facebook (analyzing big data on social media).


In [69]:
print(summarize_alg(df.iloc[2,0],'FREQ_TABLE.pickle'))

2018 group  39 Chinese BigTech and Financial Services The presence of Chinese Internet giants Baidu, Alibaba, Tencent, and JD.com (BATJ) in finance is striking if we consider that (1) Chinas largest e-commerce platform also runs Chinas largest mutual fund (Yue Bao by Ant Financial), and (2) the Chinese social messaging app facilitates large volumes of money transfers / payments (WeChat by Tencent). 2018 group  41 Driving Platform Effects with comprehensive products Sesame Credit launched January 2015, in 11 months to >100m cumulative users Yue Bao launched June 2013, in 20 months to >100m cumulative users; Insurance launched November 2010, in 31 months to >100m cumulative users Yue Bao: convenient cash management service This money market fund provides convenient cash management service for users and provides a foundation for other wealth management services. However, Ant Financial suspended the Jie Bei service for certain users in early 2018 as a result of regulatory tightening on c

In [70]:
df

Unnamed: 0,article_text
0,The Bank of the Future 7 Interview with Expone...
1,Chapter A: Artificial Intelligence - the Finan...
2,Chapter B: BigTech or the ANTification of Fina...
