# Imports

In [1]:
#Script to extract important topics from content
#originally written by: vipul-sharma20
#modifications made by: jadekhiev

# imports
import os
import sys
from pathlib import Path

# imports required utility functions
import string
from collections import Counter

# Data packages
import math
import pandas as pd
import numpy as np

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk
from nltk import tokenize
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('brown')
from nltk.corpus import brown

import spacy
nlp = spacy.load('en') #spacy PoS tagger
# python -m spacy download en
#Progress bar
from tqdm import tqdm

[nltk_data] Downloading package punkt to /Users/jadekhiev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to /Users/jadekhiev/nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Shared Code

In [2]:
# Import articles
def importData(filename, data_dir="Data"):
    """
    Load an Excel workbook of articles into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the workbook inside ``data_dir``.
    data_dir : str, optional
        Folder holding the workbook. Defaults to "Data", which is resolved
        against the current working directory.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``pd.read_excel`` for that workbook.
    """
    articles_path = os.path.join(data_dir, filename)
    # Read errors (missing file, unreadable workbook) propagate to the caller
    # instead of being hidden.
    return pd.read_excel(articles_path)

In [36]:
# Load the test article workbook into a DataFrame; later cells add tag columns to this same object
articleDB = importData("test_articles.xlsx")

# Context Extraction (CE) V1

In [37]:
# PoS Tagger and CFG Definitions
# train tagger with browns news corpus
# Tagged sentences from the 'news' category of the Brown corpus; training data for the n-gram taggers below
train = brown.tagged_sents(categories='news')

# custom regex tagging
# Final fallback of the backoff chain: tags a token from its surface form alone.
# The catch-all '.*' -> 'NN' rule at the end means every token receives some tag.
# NOTE(review): rule order matters if the first matching pattern wins -- confirm against the NLTK RegexpTagger docs.
regex_tag = nltk.RegexpTagger([
     #(r'[$][0-9]+\s[MmBbTt]\S+','DV'), #dollar value 
     (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'), # numbers; NOTE(review): the '.' is unescaped, so any single character is accepted as the separator
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.[\/\/]\S+', 'URL'), #URL / useless
     (r'.*', 'NN')
])

# Backoff chain: trigram -> bigram -> unigram -> regex rules
unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# PoS Browns Corpus Tagging: https://en.wikipedia.org/wiki/Brown_Corpus
# custom defined Context Free Grammar (CFG) by vipul
# Keys are '<tag of token i>+<tag of token i+1>'; the value is the tag given to the merged term.
# get_info looks adjacent pairs up here and keeps only 'NNP', 'NNI' and 'VP' terms at the end.
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNP+NNI'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['NNI+NNI'] = 'NNI'
cfg['NNI+NNP'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
cfg['CD+CD'] = 'CD'
cfg['NPI+NNP'] = 'NNP' # this is specific for collecting terms with the word deal ('NPI' is assigned in get_info to "deal" + an IN-tagged word)
cfg['NNI+RP'] = 'NNI' # collects terms like "heats up" -- RP = adverb particle
cfg['RB+NN'] = 'NNP'# combination for monetary movement e.g. quarterly[RB] profit[NN] fell [VBD] -- RB = adverb
cfg['NNP+VBD'] = 'VPI' # VPI = incomplete verb phrase
cfg['MD+VB'] = 'VPI' # collects terms like "will lose" (verb phrase incomplete)
cfg['MD+NN'] = 'VPI' # collects terms like "will soar" (verb phrase incomplete)
cfg['VPI+NN'] = 'VP' # collects terms like "will lose ground"
cfg['NNI+VP'] = 'VP' # collects terms like "index will soar"
cfg['NN+VPI'] = 'VP' # collects terms like "index will soar"
cfg['NNP+VPI'] = 'VP' # collects terms like "index will soar"
cfg['VPI+TO'] = 'VPI' # collect past participle verbs with to e.g. pledged to
cfg['VBN+TO'] = 'VBN' # collect past participle verbs with to e.g. pledged to
cfg['VBN+NN'] = 'VP' # collects terms like "pledged to adapt"

# Utility functions for context extraction
def getWords(sentence):
    """
    Tokenize text and drop one-character tokens and stopwords.

    Parameters
    ----------
    sentence : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    list of str
        Tokens in their original casing and order. A token is kept when it is
        longer than one character and its lowercase form is not a stopword.
    """
    # A set gives constant-time membership tests; every entry is separated by
    # an explicit comma so adjacent literals can never be concatenated
    # ("cbc" and "me" were previously fused into "cbcme").
    stopwords = {
        # dates/times
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october",
        "november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        "today", "pm", "am", "daily",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast", "msnbc", "ap",
        "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url", "cbc",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
        "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
        "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "theyve", "theyre", "theres", "heres", "didnt", "wouldn",
        "couldn", "didn", "are", "is", "was", "will", "have", "be", "such",
    }
    return [
        word for word in word_tokenize(sentence)
        if len(word) > 1 and word.lower() not in stopwords
    ]

def countWords(wordList):
    """Tally how often each item occurs in ``wordList`` and return a plain dict."""
    tally = Counter()
    tally.update(wordList)
    return dict(tally)

def get_info(content):
    """
    Extract key terms from text by merging adjacent PoS-tagged tokens.

    The text is tokenized with ``getWords``, tagged with ``trigram_tag`` and
    normalised with ``re_tag``. Adjacent pairs are then merged into one term
    whenever ``cfg`` has an entry for '<tag1>+<tag2>', or when the word
    'deal' is followed by a token tagged 'IN' (merged term tagged 'NPI').
    Passes over the list repeat until a whole pass produces no merge.

    Every adjacent pair is examined, including the one that ends the list.
    The previous bound check stopped one pair early, so the final token could
    never join a term and two-token input was never merged.

    Parameters
    ----------
    content : str
        Raw text to analyse.

    Returns
    -------
    list of str
        Terms whose final tag is 'NNP', 'NNI' or 'VP', in document order.
    """
    words = getWords(content)
    tags = re_tag(trigram_tag.tag(words))

    merged_any = True
    while merged_any:
        merged_any = False
        i = 0
        # len(tags) is re-read on every step because merges shrink the list.
        while i < len(tags) - 1:
            first, second = tags[i], tags[i + 1]

            # when word = deal and next word is tagged IN (with, for, etc.)
            if first[0] == 'deal' and second[1] == 'IN':
                new_pos = 'NPI'
            else:
                new_pos = cfg.get(first[1] + '+' + second[1])

            if new_pos:
                # replace the two tokens with the single merged term
                tags[i:i + 2] = [(first[0] + ' ' + second[0], new_pos)]
                merged_any = True
            # Always advance: a freshly merged term is only compared with its
            # right-hand neighbour on the next pass.
            i += 1

    return [tag[0] for tag in tags if tag[1] in ('NNP', 'NNI', 'VP')]


def re_tag(tagged):
    """
    Normalise the tags in a list of (word, tag) pairs.

    'NP' and 'NP-TL' become 'NNP'; any other tag ending in '-TL' loses that
    suffix; any remaining tag ending in 'S' loses the final 'S'; everything
    else is passed through unchanged. Returns a new list of (word, tag) tuples.
    """
    def _simplify(pos):
        if pos in ('NP', 'NP-TL'):
            return 'NNP'
        if pos[-3:] == '-TL':
            return pos[:-3]
        if pos[-1:] == 'S':
            return pos[:-1]
        return pos

    return [(pair[0], _simplify(pair[1])) for pair in tagged]

# extract all unigrams based on all words pulled from context extraction
def unigramBreakdown(fullContext):
    """
    Break extracted terms into single lowercase words.

    Each term is split on whitespace. A word is kept when it is longer than
    two characters and its lowercase form is not a stopword; kept words are
    lowercased and have ASCII punctuation removed. Returns the words as a
    flat list in document order.
    """
    stopwords = [
        "myself", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
        "him", "his", "himself", "she", "her", "hers", "herself", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "are", "was", "were", "been", "being",
        "have", "has", "had", "having", "does", "did", "doing", "the", "and", "but", "if", "or",
        "because", "until", "while", "for", "with", "about", "into", "through", "during",
        "before", "after", "from", "down", "out", "off", "over", "under", "again", "further",
        "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both",
        "each", "few", "more", "most", "other", "some", "such", "nor", "not", "only", "own",
        "same", "than", "too", "very", "can", "will", "just", "don", "should", "now", "past",
        "year", "month", "day",
    ]
    strip_punct = str.maketrans('', '', string.punctuation)

    # flatten every term into its individual words first
    pieces = [piece for phrase in fullContext for piece in phrase.split()]

    unigrams = []
    for piece in pieces:
        lowered = piece.lower()
        # the stopword and length checks run on the word before punctuation is stripped
        if len(piece) > 2 and lowered not in stopwords:
            unigrams.append(lowered.translate(strip_punct))
    return unigrams

# extracts unigrams AND bigrams pulled by context extraction
def bigramBreakdown(fullContext):
    """
    Keep only the terms made of one or two words.

    Terms with three or more whitespace-separated words are dropped; the rest
    are lowercased and have ASCII punctuation removed. Returns a new list.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    kept = []
    for phrase in fullContext:
        if len(phrase.split()) >= 3:
            continue  # longer than a bigram
        kept.append(phrase.lower().translate(strip_punct))
    return kept

# did this because I couldn't find a good way to write the switcher to switch to a non-function
def ngramDummy(fullContext):
    """
    Lowercase the extracted terms and drop those that are stopwords.

    A term is compared as a whole (after lowercasing) against the stopword
    list; words inside a longer term are not checked individually. Returns
    the surviving lowercase terms in their original order.
    """
    stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october",
        "november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "oct.", "nov.", "dec.",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        "today", "pm", "am", "daily", "day", "year",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "inc", "corp", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast", "msnbc", "ap",
        "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url", "more information", "cbc",
        "business insider", "new york times", "wall street journal",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
        "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
        "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "theyve", "theyre", "theres", "heres", "didnt", "wouldn",
        "couldn", "didn", "are", "is", "was", "will", "have", "be", "were",
        "company", "people", "president", "others", "times", "percent", "number", "companies", "business",
        "world", "state", "order", "talk", "team", "brands", "program",
        "family", "everyone", "per", "house", "case", "someone", "something", "anyone", "person",
        "co.", "co", "inc.", "inc", ".com", "com", "report", "things", "thing", "job", "member", "members",
        "staying", "possibility", "part", "none", "showing", "one",
        "us", "u.s.", "united states", "america", "united states of america", "usa", "states",
    ]
    ngrams = []
    for phrase in fullContext:
        lowered = phrase.lower()
        if lowered in stopwords:
            continue
        ngrams.append(lowered)
    return ngrams
# PMI For Tag Ranking
# return binary representation of article in terms of all keyphrases pulled
def dfTransform(df, term_column):
    """
    Build a binary article-by-term matrix from a comma-separated term column.

    Parameters
    ----------
    df : pandas.DataFrame
        Article frame. Must contain ``term_column`` (one comma-separated
        string of key terms per row) and a 'prediction' column.
    term_column : str
        Name of the column holding the key terms.

    Returns
    -------
    pandas.DataFrame
        One row per article, sharing ``df``'s index. The first column is
        'mkt_moving' (the 'prediction' column renamed, because "prediction"
        can itself be a key term); the remaining columns are the distinct
        terms in sorted order, holding 1 when the article has the term and
        0 otherwise.
    """
    # Split each article's terms once; the sets serve both the vocabulary and
    # the per-row membership test.
    terms_per_article = [
        {word.lstrip() for word in article.split(',')}
        for article in df[term_column].values
    ]
    # Sorted so the column order is reproducible instead of following set
    # iteration order.
    keyterms = sorted(set().union(*terms_per_article))

    # for each article and each keyword: give 1 if keyword in article and 0 if not
    encodedArticle = [
        [1 if word in articleTerms else 0 for word in keyterms]
        for articleTerms in tqdm(terms_per_article)
    ]

    # Reuse the article index so the join below aligns rows even when df does
    # not carry a default 0..n-1 index.
    binEncDf = pd.DataFrame(encodedArticle, columns=keyterms, index=df.index)
    labels = df.rename(columns={'prediction': 'mkt_moving'})
    return labels[['mkt_moving']].join(binEncDf)

# Simple example of getting pairwise mutual information of a term
def pmiCal(df, x, label_column='mkt_moving'):
    """
    Tabulate the PMI between a label column and a term column for every 0/1 combination.

    Returns a DataFrame with columns ['x', 'y', 'px', 'py', 'pxy', 'pmi'],
    one row per (label value, term value) pair in the order
    (0,0), (0,1), (1,0), (1,1). The denominator is always smoothed by 0.0001;
    the numerator is smoothed only when the joint probability is zero.
    """
    total = len(df)
    combos = [(label_val, term_val) for label_val in (0, 1) for term_val in (0, 1)]

    rows = []
    for label_val, term_val in combos:
        label_mask = df[label_column] == label_val
        term_mask = df[x] == term_val
        px = sum(label_mask) / total
        py = sum(term_mask) / total
        pxy = len(df[label_mask & term_mask]) / total
        # log(0) is undefined, so a zero joint probability gets the same small offset
        numerator = pxy + 0.0001 if pxy == 0 else pxy
        pmi = math.log(numerator / (px * py + 0.0001))
        rows.append([label_val, term_val, px, py, pxy, pmi])

    return pd.DataFrame(rows, columns=['x', 'y', 'px', 'py', 'pxy', 'pmi'])

def pmiIndivCal(df, x, gt, label_column='mkt_moving'):
    """
    PMI between "label equals ``gt``" and "term ``x`` is present (== 1)".

    Probabilities are row fractions of ``df``. When the two never co-occur,
    both numerator and denominator are offset by 0.0001 so the logarithm is
    defined; otherwise the plain ratio pxy / (px * py) is used.
    Returns the natural-log PMI as a float.
    """
    total = len(df)
    has_label = df[label_column] == gt
    has_term = df[x] == 1

    px = sum(has_label) / total
    py = sum(has_term) / total
    pxy = len(df[has_label & has_term]) / total

    if pxy == 0:  # Log 0 cannot happen
        return math.log((pxy + 0.0001) / (px * py + 0.0001))
    return math.log(pxy / (px * py))

# calculate the PMI for all tags across all articles and store the top 10 positive and negative tags for each article in df
def pmiForAllCal(artDf, binaryEncDf, term_column, label_column='mkt_moving'):
    """
    Rank each article's terms by PMI and store the top ten per article.

    Parameters
    ----------
    artDf : pandas.DataFrame
        Article frame with a comma-separated term string in ``term_column``.
        Modified in place: columns 'tags_posPMI_10' and 'tags_negPMI_10' are
        written for every row.
    binaryEncDf : pandas.DataFrame
        Binary article-by-term matrix containing ``label_column`` and one
        column per term (as produced by ``dfTransform``).
    term_column : str
        Column of ``artDf`` that holds the terms.
    label_column : str, optional
        Label column of ``binaryEncDf``. Defaults to 'mkt_moving'.

    Returns
    -------
    pandas.DataFrame
        ``artDf`` itself. 'tags_posPMI_10' holds the ten terms with the
        highest PMI against label 1; 'tags_negPMI_10' holds the ten terms
        with the lowest PMI against label 0. Both are comma-joined strings.
    """
    # A term's PMI depends only on the corpus-wide encoding, not on the
    # article being processed, so each term is scored once and reused.
    pmi_cache = {}

    for i in tqdm(artDf.index): # for all articles
        # i is an index label, so look the row up by label rather than position
        terms = {word.lstrip() for word in artDf.at[i, term_column].split(',')}

        rows = []
        for word in terms:
            if word not in pmi_cache:
                pmi_cache[word] = (
                    pmiIndivCal(binaryEncDf, word, 1, label_column),
                    pmiIndivCal(binaryEncDf, word, 0, label_column),
                )
            pos_pmi, neg_pmi = pmi_cache[word]
            rows.append((word, pos_pmi, neg_pmi))

        pmiDf = pd.DataFrame(rows, columns=['word', 'pmi_pos', 'pmi_neg'])
        top_pos = pmiDf.sort_values(by='pmi_pos', ascending=False).head(10)['word']
        top_neg = pmiDf.sort_values(by='pmi_neg', ascending=True).head(10)['word']
        artDf.at[i, 'tags_posPMI_10'] = ','.join(top_pos)
        artDf.at[i, 'tags_negPMI_10'] = ','.join(top_neg)
    return artDf

# Functions to run extraction and rank tags

# Tag ranking using PMI
def calculatePMI(artDf, termType):
    """
    Encode the articles as a binary term matrix and rank their tags by PMI.

    ``termType`` names the column of ``artDf`` that holds the comma-separated
    terms. Returns a pair: the article frame with the PMI tag columns added
    by ``pmiForAllCal``, and the binary encoding built by ``dfTransform``.
    """
    encoded = dfTransform(artDf, termType)
    ranked = pmiForAllCal(artDf, encoded, termType)
    return ranked, encoded

# find most popular keyterms mentioned in news
def frequencyCounter(binEncDf):
    """
    Count how many articles mention each term.

    Takes the binary article-by-term matrix, ignores its 'mkt_moving' column,
    and returns a DataFrame with columns ['word', 'freq_articles'] sorted by
    descending count. The input frame is not modified.
    """
    term_matrix = binEncDf.drop(['mkt_moving'], axis=1)
    # column-wise sum of the 0/1 flags = number of articles containing the term
    article_counts = term_matrix.sum(axis=0, skipna=True)
    ranked = article_counts.sort_values(ascending=False)

    freqDf = ranked.to_frame().reset_index()
    freqDf.columns = ['word', 'freq_articles']
    return freqDf

## Run Code

In [38]:
# Retrieve context
def retrieveContext_V1(articleDB, termType='ngrams'):
    """
    Extract key terms for every article, rank them by PMI and count term popularity.

    Parameters
    ----------
    articleDB : pandas.DataFrame
        Article frame with a 'contentWithStops' text column. It is modified
        in place: a 'tags' column is written here and the PMI tag columns
        are added by ``calculatePMI``.
    termType : {'ngrams', 'bigrams', 'unigrams'}, optional
        How the extracted terms are broken down before being stored.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The article frame with tag columns, and the frame returned by
        ``frequencyCounter`` for the binary encoding of the tags.

    Raises
    ------
    KeyError
        If ``termType`` is not one of the supported values.
    """
    breakdown = {
        'ngrams': ngramDummy, # store n-grams pulled from context extraction
        'bigrams': bigramBreakdown, # store bigrams and unigrams captured by context extraction
        'unigrams': unigramBreakdown # store unigrams captured by separating all terms pulled by context extraction
        }
    # Resolve the breakdown once, so an unsupported termType fails before any
    # article is processed.
    to_terms = breakdown[termType]

    # Same object as the argument, not a copy.
    articleDf = articleDB

    for i in articleDf.index:
        # i is an index label, so read and write by label rather than position
        keyterms = get_info(articleDf.at[i, 'contentWithStops'])
        articleDf.at[i, 'tags'] = ', '.join(to_terms(keyterms))

    # returns article Df with new column for top tags
    articleDf, binaryEncDf = calculatePMI(articleDf, 'tags')

    # returns most popular terms mentioned across all articles
    trendingTermsDf = frequencyCounter(binaryEncDf)

    return articleDf, trendingTermsDf

In [39]:
# Run V1 context extraction and PMI tag ranking on the loaded articles (default termType='ngrams')
articleDf, trendingTermsDf = retrieveContext_V1(articleDB)

100%|██████████| 321/321 [00:09<00:00, 34.51it/s]
100%|██████████| 321/321 [01:53<00:00,  2.69it/s]


In [48]:
# Preview the first rows only instead of rendering the whole article frame
articleDf.head(10)

Unnamed: 0,nonRel,Rel,url,prediction,title,description,source,date,content,origContent,contentWithStops,tags,tags_posPMI_10,tags_negPMI_10,context,tags_top_5
0,0.225382,0.774618,https://www.cbc.ca/news/canada/manitoba/young-...,1,"'We must be saints,' young Catholic says about...",Being Catholic is one of the most unpopular th...,cbc-news,2019-03-03T12:00:00Z,abuse summit took place rome address global p...,"In February 2019, an abuse summit took place i...","In February 2019, an abuse summit took place i...","abuse summit, rome, global problem, abuse scan...","great saints,catholicism,entirely objective,wi...","great saints,entirely objective,winnipeg offer...","abuse summit, Rome, global problem, abuse scan...","sexual assault crisis line,own seminarian brot..."
1,0.234248,0.765752,https://www.washingtonpost.com/news/powerpost/...,1,The Daily 202: Michael Cohen hearing showcased...,Trump criticizes timing of ‘fake hearing’ afte...,the-washington-post,2019-02-28T14:06:39Z,big idea cutting short summit with north kore...,THE BIG IDEA: After cutting short his summit w...,THE BIG IDEA: After cutting short his summit w...,"big idea, short summit, long journey, washingt...","private-school scholarship programs,final outc...","never peaceful,politico,percent growth,cohens,...","BIG IDEA, short summit, long journey, Washingt...","first instinct,trump presidency,oversight comm..."
2,0.283077,0.716923,https://www.usatoday.com/story/tech/talkingtec...,1,What YouTube needs to do to clean up its thorn...,Google owned video network lost advertisers an...,usa-today,2019-03-02T14:15:29Z,network receives hours new content every minu...,But a network that receives 500 hours of new c...,But a network that receives 500 hours of new c...,"new content, non stop game, whack mole, tiktok...","need,e-mail addresses photos,ftc sprint,usa el...","need,accuracy rate,ftc sprint,usa eli blumenth...","new content, non stop game, whack mole, TikTok...","friendly discussions,app musical.ly,accuracy r..."
3,0.323753,0.676247,https://www.foxnews.com/us/trump-kim-summit-en...,1,Trump-Kim summit ends abruptly with no deal: '...,"Plus, 'impressed' Trump slams 'fake' Michael C...",fox-news,2019-02-28T10:06:42Z,trump summit fell north demanded full removal ...,Trump said the summit fell through after the N...,Trump said the summit fell through after the N...,"trump, north, full removal, international sanc...","democratic emails,lanny davis,animal food,illi...","clinton,trump presidential campaign,democratic...","Trump, North, full removal, international sanc...","oversight committee,first lady,mike pompeo,dem..."
4,0.358581,0.641419,https://www.ft.com/content/3befa2d8-1824-11e9-...,1,Business schools join forces to develop online...,Business schools join forces to develop online...,financial-times,2019-03-03T20:01:00Z,teamwork core element learning mba courses bus...,Teamwork is a core element of learning on MBA ...,Teamwork is a core element of learning on MBA ...,"teamwork, mba courses, business schools, new w...",challenging business schools flagship programm...,"mba,challenging business schools flagship prog...","Teamwork, MBA courses, business schools, new w...","possible traditional bricks,management educati..."
5,0.393527,0.606473,https://business.financialpost.com/investing/t...,1,Bill Gross says this secret condition made him...,Even after one of the most storied careers in ...,financial-post,2019-03-01T19:46:01Z,even one storied careers financial markets bil...,Even after one of the most storied careers in ...,Even after one of the most storied careers in ...,"storied careers, financial markets, bill gross...","onetime title,deflationary forces,one-man fami...","onetime title,glass eye,bond-market rivals,mor...","storied careers, financial markets, Bill Gross...","onetime title,janus,deflationary forces,proud ..."
6,0.413834,0.586166,https://www.economist.com/books-and-arts/2019/...,1,What to look for in a usage and grammar guide,"“Dreyer’s English”, a bestselling book, is rar...",the-economist,2019-02-28T15:55:48Z,people buy books english usage obvious answer...,WHY DO PEOPLE buy books on English usage? The ...,WHY DO PEOPLE buy books on English usage? The ...,"english usage, obvious answer, authoritative a...","sense,ms truss,recent edition,grammar book,lan...","sense,recent edition,grammar book,language col...","English usage, obvious answer, authoritative a...","merriam websters dictionary,syntax,dreyers eng..."
7,0.436507,0.563493,https://www.foxnews.com/politics/from-reparati...,1,"From reparations to Green New Deal, liberal li...","From reparations to Green New Deal, liberal li...",fox-news,2019-03-01T19:16:41Z,democrats presidential hopefuls latching onto...,The Democrats' 2020 presidential hopefuls are ...,The Democrats' 2020 presidential hopefuls are ...,"democrats, presidential hopefuls, left ideas, ...","economic theories impoverished nations,recent ...","politico,economic theories impoverished nation...","Democrats, presidential hopefuls, left ideas, ...","munching popcorn,white house bids official,sla..."
8,0.441999,0.558001,https://www.cbc.ca/news/canada/prince-edward-i...,1,Restoring heritage homes 'passion' projects fo...,Heritage award winners on P.E.I. share why the...,cbc-news,2019-03-02T11:00:00Z,interest restoring e s historic buildings rema...,Interest in restoring P.E.I.'s historic buildi...,Interest in restoring P.E.I.'s historic buildi...,"p.e.i., historic buildings, recent heritage aw...","water street,stringent rules,award winners,hil...","water street,p.e.i historic buildings,award wi...","P.E.I., historic buildings, recent heritage aw...","heritage board,walk property,hillsborough,awar..."
9,0.453345,0.546655,https://www.washingtonpost.com/politics/in-ame...,1,"In America, talk turns to something unspoken f...",Heated rhetoric about armed conflict raises a ...,the-washington-post,2019-03-01T12:15:00Z,with report special counsel robert mueller iii...,With the report by special counsel Robert S. M...,With the report by special counsel Robert S. M...,"special counsel robert mueller iii, complete i...","diminish trust,election result,shown extreme,a...","never peaceful,diminish trust,reaffirm loyalty...","special counsel Robert Mueller III, complete i...","trump presidency,first instinct,presidential n..."


# Context Extraction (CE) V2

In [189]:
#Script to extract important topics from content
# based on code written by: vipul-sharma20
# modifications made by: jadekhiev

# imports
import os
import sys
from pathlib import Path

# imports required utility functions
import string
from collections import Counter

# Data packages
import math
import pandas as pd
import numpy as np

#Operation
import operator

#Natural Language Processing Packages
import re
import spacy
# python -m spacy download en
try:
    nlp = spacy.load('en') #spacy PoS tagger
except OSError:
    # spacy.load raises OSError when the 'en' model/shortcut cannot be found;
    # fall back to the small English model package. Any other failure is a
    # real error and is allowed to surface.
    import en_core_web_sm
    nlp = en_core_web_sm.load()

#Progress bar
from tqdm import tqdm

# Utility functions for context extraction
def tagWords(article):
    """
    Run the spaCy pipeline over an article and keep the informative tokens.

    Tokens whose lowercase text is a stopword are dropped. Of the rest,
    tokens longer than two characters are always kept; shorter ones are kept
    only when their dependency label is 'nummod', 'npadvmod' or 'compound'
    (numbers and symbols such as percents and dollar signs).

    Returns a list of (text, pos_, dep_) tuples in document order.
    """
    stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october",
        "november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        "today", "pm", "am", "daily",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "inc", "corp", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast", "msnbc", "ap",
        "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url", "cbc",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
        "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
        "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "theyve", "theyre", "theres", "heres", "didnt", "wouldn",
        "couldn", "didn", "are", "is", "was", "will", "have", "be", "such", "did", "put",
        "mr", "mr.", "ms", "ms.", "mrs", "mrs.",
    ]
    short_token_deps = ('nummod', 'npadvmod', 'compound')

    taggedTerm = []
    # this is our spacy tagger
    for token in nlp(article):
        text = token.text
        if text.lower() in stopwords:
            continue
        # short tokens survive only as numbers/symbols or compound parts
        if len(text) > 2 or token.dep_ in short_token_deps:
            taggedTerm.append((text, token.pos_, token.dep_))
    return taggedTerm

def countWords(wordList):
    """Return a plain dict mapping each item of ``wordList`` to its number of occurrences."""
    counts = Counter()
    counts.update(wordList)
    return dict(counts)

def getContextTags(content):
    """
    Chunk the tokens tagged by ``tagWords`` into multi-word terms and select
    the ones to surface.

    Adjacent (text, pos, dep) triples are merged repeatedly until a whole
    pass produces no merge. A merge fires for:
      * a compound NOUN/PROPN followed by any non-punctuation token,
      * an adjective followed by a NOUN/PROPN,
      * a VERB with dependency 'ccomp' followed by a NOUN,
      * a modifier followed by "-" and one more token (joined without spaces),
      * a NUM/SYM/NVAL token with dependency 'nmod'/'nummod' followed by the
        next token (merged term tagged 'NVAL').

    Parameters
    ----------
    content : str
        Article text; passed straight to ``tagWords``.

    Returns
    -------
    tuple of (list of str, list of str)
        ``highlight_text`` -- the selected noun phrases plus every 'NVAL' term.
        ``noun_phrases``   -- the selected noun phrases only.
    """
    taggedTerm = tagWords(content)
    normalized = True
    while normalized:
        normalized = False
        for i in range(0, len(taggedTerm) - 1):
            token_1 = taggedTerm[i]
            # Merges shrink the list during the pass, so the bound is checked
            # again here. Stopping one short of the end also guarantees that
            # taggedTerm[i+2] exists for the hyphen rule below.
            if i+1 >= len(taggedTerm) - 1:
                break
            token_2 = taggedTerm[i+1]
            # chunk nouns
            if token_1[1] in ('NOUN','PROPN') and token_1[2]=='compound' and token_2[1]!='PUNCT':
                newTerm = taggedTerm[i][0]+" "+taggedTerm[i+1][0]
                pos = taggedTerm[i+1][1]
                dep = taggedTerm[i+1][2]
                taggedTerm.insert(i+2, (newTerm, pos, dep))
                taggedTerm.pop(i) # remove word 1
                taggedTerm.pop(i) # remove word 2
                normalized = True

            # chunk nouns with their adjectives
            elif token_1[1]=='ADJ' and token_2[1] in ('NOUN','PROPN'):
                newTerm = taggedTerm[i][0]+" "+taggedTerm[i+1][0]
                pos = taggedTerm[i+1][1]
                dep = taggedTerm[i+1][2]
                taggedTerm.insert(i+2, (newTerm, pos, dep))
                taggedTerm.pop(i) # remove word 1
                taggedTerm.pop(i) # remove word 2
                normalized = True

            # capture nouns that are composed of verb + noun (e.g. share price)
            # Compared with == : "in ('ccomp')" tested for a substring of the
            # string 'ccomp', so labels such as 'cc' or '' also matched.
            elif token_1[1]=='VERB' and token_1[2]=='ccomp' and token_2[1]=='NOUN':
                # ends the current pass when the noun is the second-to-last token
                if i+2 >= len(taggedTerm) - 1:
                    break
                newTerm = taggedTerm[i][0]+" "+taggedTerm[i+1][0]
                pos = taggedTerm[i+1][1]
                dep = taggedTerm[i+1][2]
                taggedTerm.insert(i+2, (newTerm, pos, dep))
                taggedTerm.pop(i) # remove word 1
                taggedTerm.pop(i) # remove word 2
                normalized = True

            # chunk hyphenated words
            elif token_1[2] in ('compound','npadvmod','amod','advmod','nmod','intj') and token_2[0]=='-':
                newTerm = taggedTerm[i][0]+taggedTerm[i+1][0]+taggedTerm[i+2][0]
                pos = 'ADJ'
                dep = 'amod'
                taggedTerm.insert(i+3, (newTerm, pos, dep))
                taggedTerm.pop(i) # remove word 1
                taggedTerm.pop(i) # remove word 2
                taggedTerm.pop(i) # remove word 3
                normalized = True

            # chunk numeric terms like money and percents
            elif token_1[1] in ('NUM','SYM','NVAL') and token_1[2] in ('nmod','nummod'):
                if token_1[1]=='NUM' and token_2[1]=='NOUN' and not token_2[0]=='%':
                    newTerm = taggedTerm[i][0]+" "+taggedTerm[i+1][0]
                else:
                    newTerm = taggedTerm[i][0]+taggedTerm[i+1][0]
                pos = 'NVAL' # number val
                dep = taggedTerm[i+1][2]
                taggedTerm.insert(i+2, (newTerm, pos, dep))
                taggedTerm.pop(i) # remove word 1
                taggedTerm.pop(i) # remove word 2
                normalized = True

    highlight_text = []
    noun_phrases = []
    for token in taggedTerm:
        term = token[0]
        pos = token[1]
        dep = token[2]
        if pos in ('NOUN', 'PROPN') and dep not in ('npadvmod','amod','advmod','attr'):
            # single-word common nouns are skipped; proper nouns and multi-word terms are kept
            if not(pos == 'NOUN' and len(term.split())<2):
                highlight_text.append(term)
                noun_phrases.append(term)
        elif pos in ('NOUN', 'PROPN') and dep == 'attr' and len(term.split()) > 2:
            highlight_text.append(term)
            noun_phrases.append(term)
        # Compared with == : "in ('NVAL')" was a substring test on a plain
        # string, so an empty tag also counted as a number value.
        elif pos == 'NVAL': # highlight number values
            highlight_text.append(term)

    return highlight_text, noun_phrases

# extract all unigrams based on all words pulled from context extraction
def unigramBreakdown(fullContext):
    """
    Break extracted terms into single lowercase words.

    Each term is split on whitespace. A word is kept when it is longer than
    two characters and its lowercase form is not a stopword; kept words are
    lowercased and have ASCII punctuation removed. Returns the words as a
    flat list in document order.
    """
    stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june", "july", "august", "september", "october",
        "november", "december", "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "oct.", "nov.", "dec.",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday", "morning", "evening",
        "today", "pm", "am", "daily", "day", "year",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "inc", "corp", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast", "msnbc", "ap",
        "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url", "more information", "cbc",
        "business insider", "new york times", "wall street journal",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
        "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its",
        "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom",
        "this", "that", "these", "those", "theyve", "theyre", "theres", "heres", "didnt", "wouldn",
        "couldn", "didn", "are", "is", "was", "will", "have", "be", "were",
        "company", "people", "president", "others", "times", "percent", "number", "companies", "business",
        "world", "state", "order", "talk", "team", "brands", "program",
        "family", "everyone", "per", "house", "case", "someone", "something", "anyone", "person",
        "co.", "co", "inc.", "inc", ".com", "com", "report", "things", "thing", "job", "member", "members",
        "staying", "possibility", "part", "none", "showing", "one",
        "us", "u.s", "u.s.", "united states", "america", "united states of america", "usa", "states",
    ]
    strip_punct = str.maketrans('', '', string.punctuation)

    # flatten every term into its individual words first
    pieces = [piece for phrase in fullContext for piece in phrase.split()]

    unigrams = []
    for piece in pieces:
        lowered = piece.lower()
        # the stopword and length checks run on the word before punctuation is stripped
        if len(piece) > 2 and lowered not in stopwords:
            unigrams.append(lowered.translate(strip_punct))
    return unigrams

# extracts unigrams AND bigrams pulled by context extraction
def bigramBreakdown(fullContext):
    """
    Keep the unigram and bigram terms produced by context extraction.

    A term is kept when it has fewer than three whitespace-separated words and
    its lowercased form is not in the stop list below. Kept terms are returned
    lowercased, in input order, duplicates included. Punctuation is NOT
    stripped, so dotted stop-list entries such as "u.s." can still match.

    Parameters
    ----------
    fullContext : iterable of str
        Terms pulled by context extraction.

    Returns
    -------
    list of str
        Lowercased unigrams/bigrams that survived the stop list.
    """
    stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
        "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "oct.", "nov.", "dec.",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
        "morning", "evening", "today", "pm", "am", "daily", "day", "year",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "inc", "corp", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast",
        "msnbc", "ap", "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url",
        "more information", "cbc", "business insider", "new york times", "wall street journal",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves",
        "you", "your", "yours", "yourself", "yourselves",
        "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
        "what", "which", "who", "whom", "this", "that", "these", "those",
        "theyve", "theyre", "theres", "heres", "didnt", "wouldn", "couldn", "didn",
        "are", "is", "was", "will", "have", "be", "were",
        "company", "people", "president", "others", "times", "percent", "number",
        "companies", "business", "world", "state", "order", "talk", "team", "brands", "program",
        "family", "everyone", "per", "house", "case", "someone", "something", "anyone", "person",
        "co.", "co", "inc.", "inc", ".com", "com", "report", "things", "thing",
        "job", "member", "members", "staying", "possibility", "part", "none", "showing", "one",
        "us", "u.s.", "united states", "america", "united states of america", "usa", "states",
    ]

    keptTerms = []
    for term in fullContext:
        # anything longer than a bigram is out of scope for this breakdown
        if len(term.split()) >= 3:
            continue
        lowered = term.lower()
        if lowered in stopwords:
            continue
        keptTerms.append(lowered)
    return keptTerms

# did this because I couldn't find a good way to make the switcher dispatch to a non-function
def ngramBreakdown(keyterms):
    """
    Lowercase the extracted key terms and drop the ones on the stop list.

    Terms of any length are accepted; only an exact (lowercased) match against
    the stop list below removes a term. Punctuation is left as-is, and input
    order and duplicates are preserved.

    Parameters
    ----------
    keyterms : iterable of str
        Key terms pulled by context extraction.

    Returns
    -------
    list of str
        Lowercased n-grams that are not stop words.
    """
    stopwords = [
        # dates/times
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
        "jan", "feb", "mar", "apr", "jun", "jul", "aug", "oct", "nov", "dec",
        "jan.", "feb.", "mar.", "apr.", "jun.", "jul.", "aug.", "oct.", "nov.", "dec.",
        "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
        "morning", "evening", "today", "pm", "am", "daily", "day", "year",
        # specific article terms that are useless
        "read", "file", "'s", "'t", "photo", "inc", "corp", "group", "inc", "corp", "source",
        "bloomberg", "cnbc", "cnbcs", "cnn", "reuters", "bbc", "published", "broadcast",
        "msnbc", "ap", "said", "nbcuniversal", "newsletterupgrade", "nbc", "news", "url",
        "more information", "cbc", "business insider", "new york times", "wall street journal",
        # other useless terms
        "me", "my", "myself", "we", "our", "ours", "ourselves",
        "you", "your", "yours", "yourself", "yourselves",
        "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
        "what", "which", "who", "whom", "this", "that", "these", "those",
        "theyve", "theyre", "theres", "heres", "didnt", "wouldn", "couldn", "didn",
        "are", "is", "was", "will", "have", "be", "were",
        "company", "people", "president", "others", "times", "percent", "number",
        "companies", "business", "world", "state", "order", "talk", "team", "brands", "program",
        "family", "everyone", "per", "house", "case", "someone", "something", "anyone", "person",
        "co.", "co", "inc.", "inc", ".com", "com", "report", "things", "thing",
        "job", "member", "members", "staying", "possibility", "part", "none", "showing", "one",
        "us", "u.s.", "united states", "america", "united states of america", "usa", "states",
    ]

    # lowercase lazily, then keep whatever the stop list does not reject
    loweredTerms = (term.lower() for term in keyterms)
    return [term for term in loweredTerms if term not in stopwords]

# PMI For Tag Ranking
# return binary representation of article in terms of all keyphrases pulled
def dfTransform(df, term_column):
    """
    Binary-encode every article against the full set of key terms.

    Each cell of ``df[term_column]`` is a comma-separated string of terms.
    The result has one row per article (same index as ``df``), a leading
    ``mkt_moving`` column (the article label, taken from ``prediction``) and
    one 0/1 column per distinct term telling whether the article has it.

    Parameters
    ----------
    df : pandas.DataFrame
        Article frame; must contain ``term_column`` and a ``prediction``
        (or already-renamed ``mkt_moving``) column. It is not modified.
    term_column : str
        Name of the column holding the comma-separated terms.

    Returns
    -------
    pandas.DataFrame
        ``mkt_moving`` followed by one indicator column per term, in order of
        first appearance.

    Raises
    ------
    KeyError
        If neither ``prediction`` nor ``mkt_moving`` exists in ``df``.
    """
    # Parse each article once. Iterating the values (rather than feeding index
    # labels to .iloc) keeps this correct for any index, not only 0..n-1.
    termsPerArticle = [
        [word.lstrip() for word in article.split(',')]
        for article in df[term_column].values
    ]

    # dict.fromkeys dedupes while keeping first-seen order, so the column
    # order is reproducible from run to run (a set's order is not).
    keyterms = list(dict.fromkeys(
        word for articleTerms in termsPerArticle for word in articleTerms
    ))

    # for each article and each keyword: give 1 if keyword in article and 0 if not
    encodedArticle = []
    for articleTerms in tqdm(termsPerArticle):
        present = set(articleTerms)  # constant-time membership per keyword
        encodedArticle.append([1 if word in present else 0 for word in keyterms])

    # Reuse the article index so the join below aligns row for row.
    binEncDf = pd.DataFrame(encodedArticle, columns=keyterms, index=df.index)

    # 'prediction' is renamed because it can also occur as a key term, which
    # would collide with the label column.
    df = df.rename(columns={'prediction': 'mkt_moving'})
    # join prediction with encoding
    binEncDf = df[['mkt_moving']].join(binEncDf)

    return binEncDf

# Simple example of computing the pointwise mutual information (PMI) of a term
def pmiCal(df, x, label_column='mkt_moving'):
    """
    Tabulate the PMI between a label column and one term column.

    For every combination of label value (0/1) and term value (0/1) the
    marginal probabilities, the joint probability and the smoothed PMI are
    computed. A 0.0001 constant is always added to the denominator, and to
    the numerator only when the joint probability is zero (to avoid log 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Binary-encoded articles containing ``label_column`` and ``x``.
    x : str
        Name of the term column.
    label_column : str, default 'mkt_moving'
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        Four rows with columns ``x`` (label value), ``y`` (term value),
        ``px``, ``py``, ``pxy`` and ``pmi``.
    """
    smoothing = 0.0001
    total = len(df)

    rows = []
    for labelValue in (0, 1):
        labelMask = df[label_column] == labelValue
        px = sum(labelMask) / total
        for termValue in (0, 1):
            termMask = df[x] == termValue
            py = sum(termMask) / total
            pxy = len(df[labelMask & termMask]) / total
            # log 0 cannot happen: nudge an empty joint count off zero
            numerator = pxy + smoothing if pxy == 0 else pxy
            pmi = math.log(numerator / (px * py + smoothing))
            rows.append([labelValue, termValue, px, py, pxy, pmi])

    return pd.DataFrame(rows, columns=['x', 'y', 'px', 'py', 'pxy', 'pmi'])

def pmiIndivCal(df,x,gt, label_column='mkt_moving'):
    """
    PMI between "label equals ``gt``" and "term ``x`` is present".

    Parameters
    ----------
    df : pandas.DataFrame
        Binary-encoded articles containing ``label_column`` and ``x``.
    x : str
        Name of the 0/1 term column.
    gt : int
        Label value to score the term against.
    label_column : str, default 'mkt_moving'
        Name of the label column.

    Returns
    -------
    float
        ``log(pxy / (px * py))``; when the joint probability is zero, 0.0001
        is added to numerator and denominator so the log stays defined.
    """
    total = len(df)

    labelHit = df[label_column] == gt
    px = sum(labelHit) / total
    termHit = df[x] == 1
    py = sum(termHit) / total
    pxy = len(df[labelHit & termHit]) / total

    if pxy != 0:
        return math.log(pxy / (px * py))
    # log 0 cannot happen: smooth both sides of the ratio
    return math.log((pxy + 0.0001) / (px * py + 0.0001))

# calculate all the pmi for all tags across all articles and store top 5 tags for each article in df
def pmiForAllCal(artDf, binaryEncDf, term_column, label_column='mkt_moving'):
    """
    Rank each article's terms by PMI and store its top five tags.

    Every distinct term of an article is scored with
    ``pmiIndivCal(binaryEncDf, term, 0, label_column)``, i.e. its PMI with
    label 0. The five terms with the LOWEST score are joined with ',' and
    written to a ``tags_top_5`` column. Ties keep the order in which the terms
    first appear in the article.

    Parameters
    ----------
    artDf : pandas.DataFrame
        Article frame whose ``term_column`` holds comma-separated terms.
        It is modified in place (``tags_top_5`` is added or overwritten).
    binaryEncDf : pandas.DataFrame
        Binary encoding of the articles, with a column for every term.
    term_column : str
        Name of the column holding the comma-separated terms.
    label_column : str, default 'mkt_moving'
        Label column of ``binaryEncDf``.

    Returns
    -------
    pandas.DataFrame
        The same ``artDf`` object, with ``tags_top_5`` filled in.
    """
    # A term's PMI depends only on binaryEncDf, so score each distinct term
    # once instead of rescanning the whole frame for every occurrence.
    pmiByTerm = {}
    topTags = []

    # Iterate the cell values directly: feeding index labels to .iloc breaks
    # as soon as the index is not the default 0..n-1.
    for article in tqdm(artDf[term_column].values):  # for all articles
        terms = list(dict.fromkeys(word.lstrip() for word in article.split(',')))
        for word in terms:
            if word not in pmiByTerm:
                pmiByTerm[word] = pmiIndivCal(binaryEncDf, word, 0, label_column)

        # sorted() is stable, which makes the tie order deterministic
        ranked = sorted(terms, key=pmiByTerm.__getitem__)
        topTags.append(','.join(ranked[:5]))

    # Single positional assignment, independent of the index labels.
    artDf['tags_top_5'] = topTags
    return artDf

# Functions to run extraction and rank tags

# Tag ranking using PMI
def calculatePMI(artDf, termType):
    """
    Binary-encode the articles, then rank each article's tags by PMI.

    Parameters
    ----------
    artDf : pandas.DataFrame
        Article frame holding the terms to rank.
    termType : str
        Name of the column in ``artDf`` that holds the comma-separated terms.

    Returns
    -------
    tuple
        ``(ranked article frame from pmiForAllCal, binary encoding from dfTransform)``.
    """
    # one 0/1 column per term, one row per article
    encoded = dfTransform(artDf, termType)
    return pmiForAllCal(artDf, encoded, termType), encoded

# find most popular keyterms mentioned in news
def frequencyCounter(binEncDf):
    """
    Count, for every key term, how many articles mention it.

    Parameters
    ----------
    binEncDf : pandas.DataFrame
        Binary-encoded articles with a ``mkt_moving`` label column plus one
        0/1 column per term. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``freq_articles``, most frequent term first.
    """
    # the label is not a term, so leave it out of the tally
    termsOnly = binEncDf.drop(columns=['mkt_moving'])
    # column sums of a 0/1 matrix = number of articles mentioning each term
    articleCounts = termsOnly.sum(axis=0, skipna=True)
    ranked = articleCounts.sort_values(ascending=False)
    return ranked.rename_axis('word').reset_index(name='freq_articles')

## Run Code

In [121]:
# Retrieve context
def retrieveContext_V2(articleDB, termType='ngrams'):
    """
    Extract context terms for every article, rank them, and find trending terms.

    For each article, ``getContextTags`` is run on its ``contentWithStops``
    text. The full context is stored in a ``context`` column and the key terms,
    filtered by the breakdown selected through ``termType``, in a ``tags``
    column. The tags are then ranked with ``calculatePMI`` and counted with
    ``frequencyCounter``.

    Parameters
    ----------
    articleDB : pandas.DataFrame
        Classified articles with a ``contentWithStops`` column. The frame is
        used as-is (no copy), so the new columns are added to it in place.
    termType : {'ngrams', 'bigrams', 'unigrams'}, default 'ngrams'
        Which breakdown function filters the key terms into tags.

    Returns
    -------
    tuple
        ``(article frame returned by calculatePMI, trending-terms frame)``.

    Raises
    ------
    KeyError
        If ``termType`` is not one of the supported breakdowns.
    """
    # import classified articles
    articleDf = articleDB

    breakdown = {
        'ngrams': ngramBreakdown, # store n-grams pulled from context extraction
        'bigrams': bigramBreakdown, # store bigrams and unigrams captured by context extraction
        'unigrams': unigramBreakdown # store unigrams captured by separating all terms pulled by context extraction
        }
    # Resolve the breakdown once, up front, so an unknown termType fails
    # before any (slow) extraction work is done.
    breakdownFn = breakdown[termType]

    contexts = []
    tags = []
    # Iterate the cell values directly: feeding index labels to .iloc breaks
    # as soon as the index is not the default 0..n-1.
    for content in articleDf['contentWithStops'].values:
        # get context for articles
        fullContext, keyTerms = getContextTags(content)
        contexts.append(', '.join(fullContext)) # highlight these terms within article
        tags.append(', '.join(breakdownFn(keyTerms))) # use these as tags as they are limited to noun/noun phrases

    articleDf['context'] = contexts
    articleDf['tags'] = tags

    # returns article Df with new column for top tags
    articleDf, binaryEncDf = calculatePMI(articleDf, 'tags')

    # returns most popular terms mentioned across all articles
    trendingTermsDf = frequencyCounter(binaryEncDf)

    return articleDf, trendingTermsDf

In [122]:
# Run the full extraction + ranking pipeline on the imported articles.
# retrieveContext_V2 works on articleDB itself (no copy), so articleDB also
# ends up with the columns the pipeline adds.
articleDf, trendingTermsDf = retrieveContext_V2(articleDB)

100%|██████████| 321/321 [00:08<00:00, 34.81it/s]
100%|██████████| 321/321 [01:10<00:00,  4.31it/s]


In [123]:
# Preview the first rows rather than rendering the entire article frame.
articleDf.head(10)

Unnamed: 0,nonRel,Rel,url,prediction,title,description,source,date,content,origContent,contentWithStops,tags,tags_posPMI_10,tags_negPMI_10,context,tags_top_5
0,0.225382,0.774618,https://www.cbc.ca/news/canada/manitoba/young-...,1,"'We must be saints,' young Catholic says about...",Being Catholic is one of the most unpopular th...,cbc-news,2019-03-03T12:00:00Z,abuse summit took place rome address global p...,"In February 2019, an abuse summit took place i...","In February 2019, an abuse summit took place i...","abuse summit, rome, global problem, abuse scan...","great saints,catholicism,entirely objective,wi...","great saints,entirely objective,winnipeg offer...","2019,, abuse summit, Rome, global problem, abu...","sexual assault crisis line,catholic church,dif..."
1,0.234248,0.765752,https://www.washingtonpost.com/news/powerpost/...,1,The Daily 202: Michael Cohen hearing showcased...,Trump criticizes timing of ‘fake hearing’ afte...,the-washington-post,2019-02-28T14:06:39Z,big idea cutting short summit with north kore...,THE BIG IDEA: After cutting short his summit w...,THE BIG IDEA: After cutting short his summit w...,"big idea, short summit, north korean leader ki...","private-school scholarship programs,final outc...","never peaceful,politico,percent growth,cohens,...","BIG IDEA, short summit, North Korean leader Ki...","first instinct,trump presidency,mike pompeo,bl..."
2,0.283077,0.716923,https://www.usatoday.com/story/tech/talkingtec...,1,What YouTube needs to do to clean up its thorn...,Google owned video network lost advertisers an...,usa-today,2019-03-02T14:15:29Z,network receives hours new content every minu...,But a network that receives 500 hours of new c...,But a network that receives 500 hours of new c...,"new content, non-stop game, tiktok, tiktok, fe...","need,e-mail addresses photos,ftc sprint,usa el...","need,accuracy rate,ftc sprint,usa eli blumenth...","500 hours, new content, non-stop game, TikTok,...","friendly discussions,smart home products,new l..."
3,0.323753,0.676247,https://www.foxnews.com/us/trump-kim-summit-en...,1,Trump-Kim summit ends abruptly with no deal: '...,"Plus, 'impressed' Trump slams 'fake' Michael C...",fox-news,2019-02-28T10:06:42Z,trump summit fell north demanded full removal ...,Trump said the summit fell through after the N...,Trump said the summit fell through after the N...,"trump, north, full removal, international sanc...","democratic emails,lanny davis,animal food,illi...","clinton,trump presidential campaign,democratic...","Trump, North, full removal, international sanc...","democratic emails,oversight committee,mike pom..."
4,0.358581,0.641419,https://www.ft.com/content/3befa2d8-1824-11e9-...,1,Business schools join forces to develop online...,Business schools join forces to develop online...,financial-times,2019-03-03T20:01:00Z,teamwork core element learning mba courses bus...,Teamwork is a core element of learning on MBA ...,Teamwork is a core element of learning on MBA ...,"teamwork, mba courses, business schools, new w...",challenging business schools flagship programm...,"mba,challenging business schools flagship prog...","Teamwork, MBA courses, business schools, new w...","more value,vandenbosch,online degree platform,..."
5,0.393527,0.606473,https://business.financialpost.com/investing/t...,1,Bill Gross says this secret condition made him...,Even after one of the most storied careers in ...,financial-post,2019-03-01T19:46:01Z,even one storied careers financial markets bil...,Even after one of the most storied careers in ...,Even after one of the most storied careers in ...,"storied careers, financial markets, bill gross...","onetime title,deflationary forces,one-man fami...","onetime title,glass eye,bond-market rivals,mor...","storied careers, financial markets, Bill Gross...","onetime title,post-crisis stimulus,top margina..."
6,0.413834,0.586166,https://www.economist.com/books-and-arts/2019/...,1,What to look for in a usage and grammar guide,"“Dreyer’s English”, a bestselling book, is rar...",the-economist,2019-02-28T15:55:48Z,people buy books english usage obvious answer...,WHY DO PEOPLE buy books on English usage? The ...,WHY DO PEOPLE buy books on English usage? The ...,"english usage, obvious answer, authoritative a...","sense,ms truss,recent edition,grammar book,lan...","sense,recent edition,grammar book,language col...","English usage, obvious answer, authoritative a...","bryan garner,well-researched tomes,syntax,drey..."
7,0.436507,0.563493,https://www.foxnews.com/politics/from-reparati...,1,"From reparations to Green New Deal, liberal li...","From reparations to Green New Deal, liberal li...",fox-news,2019-03-01T19:16:41Z,democrats presidential hopefuls latching onto...,The Democrats' 2020 presidential hopefuls are ...,The Democrats' 2020 presidential hopefuls are ...,"democrats, pace months, republicans, limited a...","economic theories impoverished nations,recent ...","politico,economic theories impoverished nation...","Democrats, 2020presidential, pace months, Repu...","munching popcorn,slavery reparations,native am..."
8,0.441999,0.558001,https://www.cbc.ca/news/canada/prince-edward-i...,1,Restoring heritage homes 'passion' projects fo...,Heritage award winners on P.E.I. share why the...,cbc-news,2019-03-02T11:00:00Z,interest restoring e s historic buildings rema...,Interest in restoring P.E.I.'s historic buildi...,Interest in restoring P.E.I.'s historic buildi...,"p.e.i., historic buildings, recent heritage aw...","water street,stringent rules,award winners,hil...","water street,p.e.i historic buildings,award wi...","P.E.I., historic buildings, recent heritage aw...","steve,heritage board,award winners,canadian re..."
9,0.453345,0.546655,https://www.washingtonpost.com/politics/in-ame...,1,"In America, talk turns to something unspoken f...",Heated rhetoric about armed conflict raises a ...,the-washington-post,2019-03-01T12:15:00Z,with report special counsel robert mueller iii...,With the report by special counsel Robert S. M...,With the report by special counsel Robert S. M...,"special counsel robert s. mueller iii, impeach...","diminish trust,election result,shown extreme,a...","never peaceful,diminish trust,reaffirm loyalty...","special counsel Robert S. Mueller III, impeach...","first instinct,trump presidency,diminish trust..."


In [62]:
# Utility functions for context extraction
def tagTest(article):
    """
    Tag a text with the module-level spaCy pipeline ``nlp``.

    Parameters
    ----------
    article : str
        Text to run through ``nlp``.

    Returns
    -------
    list of tuple
        One ``(text, pos_, dep_)`` triple per token, in document order.
    """
    # every token is kept; nothing is filtered out here
    return [(token.text, token.pos_, token.dep_) for token in nlp(article)]

In [95]:
artSamp = 'Things have gotten so bad at Victorias Secret that its parent company is tightening the purse strings, including shuttering dozens of underperforming locations. L Brands Inc., owner of the lingerie chain, announced Wednesday plans to close about 53 Victorias Secrets in North America this year, more than three times the 15 its historically closed down in an average year. Victorias Secret square footage in North America will drop by about 3%, it said, even as it continues to grow the footprint of its more successful Bath & Body Works chain. L Brands shares fell as much as 6.9% in early trading on Thursday. The stock had gained 6.7% this year through Wednesdays close. The rough patch isnt new: Victorias Secret has been under scrutiny for years for failing to keep up with shifting consumer demands, especially involving themes of female empowerment and diversity. But its reticence to change has been made even more pronounced with the emergence of competitors like Rihannas lingerie company Savage X Fenty, American Eagle Outfitters Inc.s Aerie and ThirdLove, which aim to be more inclusive of women of different shapes, sizes, and backgrounds. The road only gets harder from here: Major retailer Target(tgt) announced earlier this week plans to launch three new private-label brands specializing in low-cost underwear and sleepwear. On the back of the tough quarter at Victorias Secret, L Brands @(lb) said it sees total company profit, excluding some items, in a range of $2.20 to $2.60 this year. Thats well below the average estimate from analysts. Late last year, the company announced it would slash its dividend in half, a move Wall Street called surprising but prudent. The outlook illustrates that the lingerie-sales struggle will persist until the retailer finds a way to reconnect with shoppers on price, product and image, Bloomberg Intelligence analysts Poonam Goyal and Morgan Tarrant wrote in a note. 
Investments in VS e-commerce site and employees crimp margin but are vital to success. To try to breathe new life into the aging brand, the company has cycled through a series of leaders. Sharen Jester Turney, who led the chain for a decade, abruptly retired in 2016. Jan Singer took over later that year, exiting after about two years. John Mehas, formerly president of luxury label Tory Burch, was put in charge of Victorias Secret Lingerie earlier this year. It also hired a new head of the PINK brand last year. One of Mehass first big moves was partnering with a French luxury label to sell high-end lingerie as its new leader looks to expand beyond discounted bras and underwear. The main focus for Mehas and PINK chief Amy Hauk will be on getting closer to our customers and improving our merchandise assortments, the company said, noting that it expects to see gradual improvement as the year progresses. There were some bright spots: Digital was a strong suit in the latest quarter, with comparable sales online rising 8%. And the companywhich discontinued swimwear in 2016is bringing it back online in March. As mentioned earlier, our new CEOs in Lingerie and PINK are most focused on our product assortments, where we believe we have clear opportunities to improve, the company said.'
# Parse one sample sentence directly with spaCy; `doc` is what the displacy cell renders.
doc = nlp('There were some bright spots: Digital was a strong suit in the latest quarter, with comparable sales online rising 8%.')
# Tag the same sentence (note the trailing space) with tagTest to get (text, pos, dep) tuples.
taggedTerms = tagTest('There were some bright spots: Digital was a strong suit in the latest quarter, with comparable sales online rising 8%. ')

In [197]:
# Tag the sample sentence with tagWords (not defined in this section of the notebook)
# so its output can be compared with tagTest's.
tagged2 = tagWords('There were some bright spots: Digital was a strong suit in the latest quarter, with comparable sales online rising 8%. ')

In [201]:
# Display the tuples returned by tagWords.
tagged2

[('There', 'ADV', 'expl'),
 ('were', 'VERB', 'ROOT'),
 ('some', 'DET', 'det'),
 ('bright', 'ADJ', 'amod'),
 ('spots', 'NOUN', 'attr'),
 ('Digital', 'PROPN', 'nsubj'),
 ('strong', 'ADJ', 'amod'),
 ('suit', 'NOUN', 'attr'),
 ('the', 'DET', 'det'),
 ('latest', 'ADJ', 'amod'),
 ('quarter', 'NOUN', 'pobj'),
 ('with', 'ADP', 'prep'),
 ('comparable', 'ADJ', 'amod'),
 ('sales', 'NOUN', 'nsubj'),
 ('online', 'ADP', 'advmod'),
 ('rising', 'VERB', 'pcomp'),
 ('8', 'NUM', 'nummod'),
 ('%', 'NOUN', 'npadvmod')]

In [199]:
# Do not unpack into `np`: that name is the numpy alias imported in the first
# cell, and rebinding it here would break every later numpy call in the kernel.
hl, nounPhrases = getContextTags('There were some bright spots: Digital was a strong suit in the latest quarter, with comparable sales online rising 8% higher.')

In [200]:
# Display the first value returned by getContextTags for the sample sentence.
hl

['Digital', 'latest quarter', 'comparable sales', '8%']

In [193]:
# spacy itself is imported in the notebook's first cell; only displacy is pulled in here.
from spacy import displacy
# Render the dependency parse of the sample sentence held in `doc`.
displacy.render(doc, style='dep', jupyter=True)
# Last expression of the cell: show the (text, pos, dep) tuples produced by tagTest.
taggedTerms

[('There', 'ADV', 'expl'),
 ('were', 'VERB', 'ROOT'),
 ('some', 'DET', 'det'),
 ('bright', 'ADJ', 'amod'),
 ('spots', 'NOUN', 'attr'),
 (':', 'PUNCT', 'punct'),
 ('Digital', 'PROPN', 'nsubj'),
 ('was', 'VERB', 'acl'),
 ('a', 'DET', 'det'),
 ('strong', 'ADJ', 'amod'),
 ('suit', 'NOUN', 'attr'),
 ('in', 'ADP', 'prep'),
 ('the', 'DET', 'det'),
 ('latest', 'ADJ', 'amod'),
 ('quarter', 'NOUN', 'pobj'),
 (',', 'PUNCT', 'punct'),
 ('with', 'ADP', 'prep'),
 ('comparable', 'ADJ', 'amod'),
 ('sales', 'NOUN', 'nsubj'),
 ('online', 'ADP', 'advmod'),
 ('rising', 'VERB', 'pcomp'),
 ('8', 'NUM', 'nummod'),
 ('%', 'NOUN', 'npadvmod'),
 ('.', 'PUNCT', 'punct')]