In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem.snowball import SnowballStemmer
import functools
import os
import re
from gensim.summarization import keywords
import numpy as np
import pandas as pd
import glob
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import codecs
import pickle
import time

#timer
start = time.time()

#global variable
STOPWORDS = set(stopwords.words("english"))
STEMMER = SnowballStemmer("english")



In [2]:
def read_txt(txtfile):
    with open(txtfile, 'r') as myfile:
        text=myfile.read()
    myfile.close()
    return text

#pre-con: txt is a raw text with \n
#post-con: returns text with global redundant words removed
def remove_global_redundancies(txt):
    txt = re.sub("\\n[0-9 ,]+\\r", "", txt)
    txt = re.sub("Figure [0-9]+\..*? [0-9]+.*?Citigroup.*?\n", "", txt, re.MULTILINE, re.DOTALL)
    txt = re.sub("Figure [0-9]+\..*?Source:.*?\n", "", txt, re.MULTILINE, re.DOTALL)
    txt = re.sub("Figure [0-9]+\..*|figure [0-9]+\..*|Source:.*|source:.*", "", txt)
    txt = re.sub("[0-9]+, Citi Research", "", txt)
    txt = re.sub(".*[0-9]+.*?\nCitigroup.*", "", txt)
    txt = re.sub(".*[0-9]+.*?Citigroup.*", "", txt)
    txt = re.sub(".*Citi GPS.*", "", txt)
    txt = re.sub("Citi|Citibank|Citigroup|citi|citibank|citigroup", "", txt)
    txt = re.sub("\n(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec).*? [0-9]+ *\r", "", txt)
    txt = re.sub(".*\?", "", txt)
    txt = re.sub("!", ".", txt)
    txt = re.sub("\r|\n", " ", txt)
    txt = re.sub("[^\x00-\x7f]", "", txt)
    txt = re.sub(" +", " ", txt)
    txt = txt.strip()
    
    return txt

In [3]:
path = './Actual - Bank of the Future Text/*.txt' 
files = glob.glob(path)
# iterate over the list getting each file 
articles_list = []
#inside for loop we remove the global redundants
for fle in files:
    with codecs.open(fle, "r",encoding='utf-8', errors='ignore') as f:
        article_content = ''
        for line in f:
            article_content += line
        article_content = remove_global_redundancies(article_content)
        articles_list.append(article_content)
        
articles_dict = {'article_text': articles_list}
df = pd.DataFrame.from_dict(articles_dict)

In [4]:
# helper function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in STOPWORDS])
    return sen_new

In [5]:
# Extract word vectors
word_embeddings = {}
f = open('..\\Glove\\glove.6B.100d.txt', encoding='utf-8')
#f = open('./glove/glove.6B.100d.txt', encoding='utf-8') for Mac
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [6]:
#get frequency table from text.
def get_freq_table(txt):
    words = word_tokenize(txt) #list of strings where each string is a word.
    freq_table = dict()
    
    for word in words:
        word = STEMMER.stem(word)
        if word in STOPWORDS:
            continue
        if word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
       
    return freq_table

#create giant string which contains all articles (which had global redundancies removed prior)
overall_string = ""
for s in df['article_text']:
    overall_string += s + " "

#global word frequency table
FREQ_TABLE = get_freq_table(overall_string)

In [7]:
#pre-cond: txt is a text with global redundancies removed. pkl_dir is the directory of the FREQ_TABLE pickle.

def summarize_alg(txt, pkl_dir):
    sentences = nltk.sent_tokenize(txt) #list of strings where each string is a sentence
    
    #-------------clean sentences (non-global redundants)---------------------
    
    # remove punctuations, numbers and special characters
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z -]", "").str.replace(" +", " ")
    # make alphabets lowercase
    clean_sentences = [s.lower() for s in clean_sentences]
    # remove stopwords from the sentences
    clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]
    
    #-------------------------1st algo------------------------
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
        
    # similarity matrix initialization
    sim_mat = np.zeros([len(sentences), len(sentences)])
    
    # updating matrix coeffs
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100),
                                                  sentence_vectors[j].reshape(1,100))[0,0]
    
    # graphs
    nx_graph = nx.from_numpy_matrix(sim_mat)
    first_algo_scores = nx.pagerank_numpy(nx_graph, alpha=0.85, personalization=None, weight='weight', dangling=None)
    
    #normalize across first_algo_scores such that every score runs from [0,1].
    max_score_first = max(first_algo_scores.values())
    first_algo_scores = {k: ((v/max_score_first) + 1)/2 for k, v in first_algo_scores.items()}
    
    #--------------------2nd algo-----------------------------------------------
    second_algo_scores = dict()
    
    for i in range(len(clean_sentences)):
        sentence = clean_sentences[i]
        words_in_sentence = word_tokenize(sentence)
        second_algo_scores[i] = 0
        for word in words_in_sentence:
            if STEMMER.stem(word) in FREQ_TABLE: #we only consider words which appear in FREQ_TABLE.
                score_of_word = FREQ_TABLE[STEMMER.stem(word)]
                if i in second_algo_scores.keys():
                    second_algo_scores[i] += score_of_word
                else:
                    second_algo_scores[i] = score_of_word
        
        #divide by length of sentence
        if len(sentence)!=0:
            second_algo_scores[i]/len(sentence)
        
    #normalize across second_algo_scores such that every score runs from [0,1].
    max_score_second = max(second_algo_scores.values())
    second_algo_scores = {k: v/max_score_second for k, v in second_algo_scores.items()}
    
    #----------------------combine scores----------------------------
    combined_scores = {i: (first_algo_scores[i])*(second_algo_scores[i]) for i in range(len(sentences))}
    
    #----------------------extract the sentences we need-----------------------------------
    #list of tuples (index, score) sorted by score
    scores_sorted = list(sorted(combined_scores.items(), key=lambda x:-x[1]))
    
    extracted_sentences = list()
    sentence_count = 0
    
    for (index, score) in scores_sorted:
        if len(sentences[index])>=300:
            continue
        else:
            if sentence_count==3:
                break
            extracted_sentences.append((index, sentences[index]))
            sentence_count += 1

    #-----------return the sentences in the order they appeared as a single string--------
    final_text = functools.reduce(lambda x,y : x+" "+y, map(lambda x: x[1], sorted(extracted_sentences)))
    
    return final_text
    

In [8]:
print(summarize_alg(df.iloc[0,0],'FREQ_TABLE.pickle'))


Among other factors, the banks strategic game plan will have to include the greater use of artificial intelligence (AI) and automation and the related overhaul of Core Banking systems and increased adoption of Cloud-based services. Banks are exploring AI uses in consumer and wholesale banking with the help of robotics (automation of routine tasks), analytics (big data mining), chat bots (digital dialogue with customers), and cognitive (changing rules and adapting). Outside the tech sector itself, financial services are one of the leading early adopters of AI in terms of spending, and in our chapter on artificial intelligence, we do a deep dive into some of the use cases of AI in banking and finance today.


In [9]:
print(summarize_alg(df.iloc[1,0],'FREQ_TABLE.pickle'))

Advances in computing power, data volume, and connectivity are core components of the industrialization of AI, and together they are leading an explosion in AI applications, including in financial services. Industrialization of AI Spending and Investing More Banking & Securities Is the Largest Non-Tech Industry for AI Outside the tech sector itself, financial services is one of the leading early adopters of AI. According to the IDC, the banking and securities sector is not only the biggest spender on external AI services but is also expected see fast spending growth over the next five years.


In [10]:
print(summarize_alg(df.iloc[2,0],'FREQ_TABLE.pickle'))

Traditional banking is being challenged not by small FinTech startups but rather by established tech giants (particularly in emerging markets) leveraging their strong customer bases, vast user data pools, agile technology platforms, and deep funding pockets. The bank offers no physical branches and operates entirely on a cloud  computing platform using Big Data to compute loan amounts and terms, thus saving significantly on operational costs. With 980 million monthly active user accounts as of September 2017 (+16% YoY), Tencents Weixin mobile messaging app has become a powerful multi-function platform, including for integrating third-party services, including payments and financial services.


In [11]:
end = time.time()
end - start

19.2243390083313