# Content Extraction

The purpose of this code is to highlight key terms for articles that are determined to be "impactful". 

This step would be done after the article has been determined "impactful".

Resources:
http://vipulsharma20.blogspot.com/2017/03/sharingan-newspaper-text-and-context.html
https://github.com/vipul-sharma20/sharingan/blob/master/sharingan/summrizer/context.py
http://nltk.sourceforge.net/doc/en/ch03.html

In [1]:
import os
import sys
from pathlib import Path

# Data packages
import math
import pandas as pd
import numpy as np

#Progress bar
from tqdm import tqdm

#Counter
from collections import Counter

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk

## Download Resources
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find

## Machine Learning
import sklearn
import sklearn.metrics as metrics
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import datasets

from collections import OrderedDict
import pprint as pp

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def importData():
    #Import Labelled Data
    DATA_DIR = "Data"
    thispath = Path().absolute()
    #dtype = {"index": str, "title": str, "description": str, "url": str, "date": str, "Retail Relevance": str, "Economy Relevant": str, "Market moving": str}
    RET_ARTICLES = os.path.join(DATA_DIR, "retailarticles-18-11-06.xlsx")

    
    df = pd.read_excel(RET_ARTICLES)

    try:
        df.head()
    except:
        pass
    return df

In [3]:
#def SelectFeaturesNP():
articleDf = importData()
# creating a new column with a cleaned up date so that it is possible to filter easily
articleDf['dateCleaned'] = pd.to_datetime(articleDf['date'].str[0:10])
print(articleDf.shape)

(2421, 10)


In [4]:
articleDf.head()

Unnamed: 0,index,title,description,url,date,content,Retail Relevance,Economy Relevant,Market moving,dateCleaned
0,1,Rival Retailers Try to Catch a Ride on Amazon ...,Bloomberg Rival Retailers Try to Catch a Ride ...,https://www.bloomberg.com/news/articles/2018-0...,2018-07-16T10:15:17Z,"Three years ago, Amazon.com Inc. created a sum...",1,0,0,2018-07-16
1,2,Walgreens and AmerisourceBergen deal talks hav...,Deal talks between Walgreens Boots Alliance an...,https://www.cnbc.com/2018/02/27/walgreens-and-...,2018-02-27T18:52:10Z,"The drugstore chain's quarterly profit fell, b...",1,0,1,2018-02-27
2,3,Walmart Whistle-Blower Claims Cheating in Race...,In its race to catch Amazon.com Inc. in online...,https://www.bloomberg.com/news/articles/2018-0...,2018-03-15T19:48:32Z,In its race to catch Amazon.com Inc. in online...,1,0,0,2018-03-15
3,4,Barnes & Noble cuts staff after dismal holiday...,Workers showed up Monday morning at various Ba...,https://www.cnbc.com/2018/02/12/barnes-noble-c...,2018-02-12T21:42:57Z,Black Friday shopping has given way to Thanksg...,1,0,1,2018-02-12
4,5,Bed Bath & Beyond Tumbles on Signs of Holiday-...,"For Bed Bath & Beyond Inc., Christmas may have...",https://www.bloomberg.com/news/articles/2017-1...,2017-12-21T17:10:58Z,"For Bed Bath &amp; Beyond Inc., Christmas may ...",1,0,1,2017-12-21


In [5]:
# Part of Speech Tagging
# Google: https://en.wikipedia.org/wiki/Part-of-speech_tagging
tagger = PerceptronTagger()
pos_tag = tagger.tag

In [4]:
# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""

In [None]:
# Create phrase tree
chunker = nltk.RegexpParser(grammar)

In [36]:
# Noun Phrase Extraction Support Functions
#from nltk.corpus import stopwords
#stopwords = stopwords.words('english')
stopwords = [ 
    # months
    "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "Decemeber",
    # uninformative pronouns
    "myself", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "him", "his", "himself", "she", "her", "hers", "herself", "its", "itself", "they", "them", "their", "theirs", "themselves", 
    # other useless stop words -- decided to keep words that are implicit of future (e.g. can, should, will etc.)
    "what", "which", "who", "whom", "this", "that", "these", "those", "are", "was", "were", "been", "being", "have", "has", "had", "having", "does", "did", "doing", "the", "and", "but", "because", "while", "for", "with", "about", "into", "through", "during", "before", "after", "from", "down", "out", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "not", "only", "own", "same", "than", "too", "very", "just", "don", "now"]
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# generator, generate leaves one by one
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP' or t.label()=='JJ' or t.label()=='RB'):
        yield subtree.leaves()

# stemming, lematizing, lower case... 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

# stop-words and length control
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

# generator, create item once a time
def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w,t in leaf if acceptable_word(w) ]
        # Phrase only
        if len(term)>1:
            yield term
            
# Flatten phrase lists to get tokens for analysis
def flatten(npTokenList):
    finalList =[]
    for phrase in npTokenList:
        token = ''
        for word in phrase:
            token += word + ' '
        finalList.append(token.rstrip())
    return finalList

In [129]:
"""
Utility functions for filtering content
originally written by: vipul-sharma20
modifications made by: jadekhiev
"""
from nltk import tokenize
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

stopwords = [
    # months
    "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "decemeber",
    # symbols that don't separate a sentence
    '$','“','”','’','—','‘','•','\'','n\'t','[',']','(',')','{', '}','@', '#'
    # specific article terms that are useless
    "read", "share", "file", "'s","i", "photo", "percent","s", "t", "inc.", "corp", "group", "inc", "corp.", "source", "bloomberg", "CNBC",
    # useless pronouns
    "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "co.", "inc.",
    # etc
    "the", "a", "of", "have", "has", "had", "having", "because"
    #"am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "while", "of", "at", "by", "for", "about", "into", "through", "during", "before", "after", "to", "from", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "just", "don", "now"
    ]


def getWords(sentence):
    """
    Extracts words/tokens from a sentence
    :param sentence: (str) sentence
    :returns: list of tokens
    """
    words = word_tokenize(sentence)
    words = ([word for word in words if word.lower() not in stopwords])
    #print(words)
    return words


def getParagraphs(content):
    """
    Exctracts paragraphs from the the text content
    :param content: (str) text content
    :returns: list of paragraphs
    """
    paraList = content.split('\n\n')
    return paraList


def getSentences(paragraph):
    """
    Extracts sentences from a paragraph
    :param paragraph: (str) paragraph text
    :returns: list of sentences
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed

In [130]:
# -*- coding: utf-8 -*-

"""
Script to extract important topics from content
originally written by: vipul-sharma20
modifications made by: jadekhiev
"""

import nltk
#nltk.download('brown')
from nltk.corpus import brown

train = brown.tagged_sents(categories='news')

# backoff regex tagging
regex_tag = nltk.RegexpTagger([
     #(r'[$][0-9]+\s[MmBbTt]\S+','DV'), #dollar value 
     (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.[\/\/]\S+', 'URL'), #URL / useless
     (r'.*', 'NN')
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# custom defined Context Free Grammar (CFG) by vipul
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNP+NNI'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['NNI+NNI'] = 'NNI'
cfg['NNI+NNP'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
cfg['CD+CD'] = 'CD'
cfg['NPI+NNP'] = 'NNP' # this is specific for collecting terms with the word deal
cfg['NNI+RP'] = 'NNI' # collects terms like "heats up"
cfg['RB+NN'] = 'NNP'# combination for monetary movement e.g. quarterly[RB] profit[NN] fell [VBD]
cfg['NNP+VBD'] = 'VPI' #VBP = a verb phrase
cfg['MD+VB'] = 'VPI' # collects terms like "will lose" (verb phrase incomplete)
cfg['MD+NN'] = 'VPI' # collects terms like "will soar" (verb phrase incomplete)
cfg['VPI+NN'] = 'VP' # collects terms like "will lose ground"
cfg['NNI+VP'] = 'VP' # collects terms like "index will soar"
cfg['NN+VPI'] = 'VP' # collects terms like "index will soar"
cfg['NNP+VPI'] = 'VP' # collects terms like "index will soar"
cfg['VPI+TO'] = 'VPI' # collect past participle verbs with to e.g. pledged to
cfg['VBN+TO'] = 'VBN' # collect past participle verbs with to e.g. pledged to
cfg['VBN+NN'] = 'VP' # collects terms like "pledged to adapt"

def get_info(content):
    words = getWords(content)
    temp_tags = trigram_tag.tag(words)
    tags = re_tag(temp_tags)
    normalized = True
    while normalized:
        normalized = False
        #print("len tag: ", len(tags))
        #pp.pprint(DictGroupBy(tags))
        for i in range(0, len(tags) - 1):
            #print("i: ", i)
            tagged1 = tags[i]
            if i+1 >= len(tags) - 1:
                break
            tagged2 = tags[i+1]
            
            # when word = deal and next word is tagged IN (with, for, etc.) 
            if tagged1[0]=='deal' and tagged2[1]=='IN':
                tags.pop(i)
                tags.pop(i)
                re_tagged = tagged1[0] + ' ' + tagged2[0]
                pos='NPI'
                tags.insert(i, (re_tagged, pos))
                normalized = True
            
            else: 
                key = tagged1[1] + '+' + tagged2[1]
                pos = cfg.get(key)       
                if pos:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0]
                    tags.insert(i, (re_tagged, pos))
                    normalized = True

    final_context = []
    for tag in tags:
        if tag[1] == 'NNP' or tag[1] == 'NNI' or tag[1] == 'VP':
            final_context.append(tag[0])
    return final_context


def re_tag(tagged):
    new_tagged = []
    for tag in tagged:
        if tag[1] == 'NP' or tag[1] == 'NP-TL':
            new_tagged.append((tag[0], 'NNP'))
        elif tag[1][-3:] == '-TL':
            new_tagged.append((tag[0], tag[1][:-3]))
        elif tag[1][-1:] == 'S':
            new_tagged.append((tag[0], tag[1][:-1]))
        else:
            new_tagged.append((tag[0], tag[1]))
    return new_tagged

In [131]:
#artNum = 2
#content = articleDf['content'].iloc[artNum]
dateFilteredDf = articleDf[articleDf['dateCleaned'].isin(pd.date_range('2017-11-15', '2017-11-15'))].reset_index()
dateFilteredDf.head()

Unnamed: 0,level_0,index,title,description,url,date,content,Retail Relevance,Economy Relevant,Market moving,dateCleaned
0,28,29,Target's Bid to Fight Wal-Mart on Prices Squee...,Target Corp.’s price war with Wal-Mart Stores ...,https://www.bloomberg.com/news/articles/2017-1...,2017-11-15T15:03:39Z,Target Corp. ’s price war with Wal-Mart Stores...,1,0,1,2017-11-15
1,229,230,Target beats 3Q forecasts,MINNEAPOLIS _ Target Corp. on Wednesday report...,https://www.cnbc.com/2017/11/15/the-associated...,2017-11-15T11:37:00Z,MINNEAPOLIS (AP) _ Target Corp. (TGT) on Wedne...,1,0,1,2017-11-15
2,290,291,Target Offers Bleak Holiday Season Profit Fore...,Shares flopped 5% in premarket trading.,http://fortune.com/2017/11/15/target-results-h...,2017-11-15T13:16:46Z,Target’s (tgt) return to growth is proving to ...,1,0,1,2017-11-15
3,325,326,AMAZON Cashierless Store Ready for Prime Time...,The Amazon Go team is said to have worked out ...,https://www.bloomberg.com/news/articles/2017-1...,2017-11-15T12:04:37Z,"For the past year, Amazon employees have been ...",1,0,1,2017-11-15
4,415,416,Amazon Cuts Prices Again at Whole Foods Ahead ...,"Meanwhile, shares of rival U.S. grocers Costco...",http://fortune.com/2017/11/15/amazon-whole-foo...,2017-11-15T15:06:48Z,Amazon.com on Wednesday unveiled more discount...,1,0,1,2017-11-15


In [132]:
#content = []
#for index, row in dateFilteredDf.iterrows():
#    content.append(row['content'])

#extract context and insert into new column of df
for i in dateFilteredDf.index:
     dateFilteredDf.at[i, 'context'] = ', '.join(get_info(dateFilteredDf['content'].iloc[i]))

In [133]:
dateFilteredDf

Unnamed: 0,level_0,index,title,description,url,date,content,Retail Relevance,Economy Relevant,Market moving,dateCleaned,context
0,28,29,Target's Bid to Fight Wal-Mart on Prices Squee...,Target Corp.’s price war with Wal-Mart Stores ...,https://www.bloomberg.com/news/articles/2017-1...,2017-11-15T15:03:39Z,Target Corp. ’s price war with Wal-Mart Stores...,1,0,1,2017-11-15,"Target price war, Wal-Mart Stores, Target, dis..."
1,229,230,Target beats 3Q forecasts,MINNEAPOLIS _ Target Corp. on Wednesday report...,https://www.cnbc.com/2017/11/15/the-associated...,2017-11-15T11:37:00Z,MINNEAPOLIS (AP) _ Target Corp. (TGT) on Wedne...,1,0,1,2017-11-15,"MINNEAPOLIS, Target TGT, fiscal third-quarter ..."
2,290,291,Target Offers Bleak Holiday Season Profit Fore...,Shares flopped 5% in premarket trading.,http://fortune.com/2017/11/15/target-results-h...,2017-11-15T13:16:46Z,Target’s (tgt) return to growth is proving to ...,1,0,1,2017-11-15,"Target tgt return, holiday quarter profit, dis..."
3,325,326,AMAZON Cashierless Store Ready for Prime Time...,The Amazon Go team is said to have worked out ...,https://www.bloomberg.com/news/articles/2017-1...,2017-11-15T12:04:37Z,"For the past year, Amazon employees have been ...",1,0,1,2017-11-15,"past year, Amazon, Amazon, experimental conven..."
4,415,416,Amazon Cuts Prices Again at Whole Foods Ahead ...,"Meanwhile, shares of rival U.S. grocers Costco...",http://fortune.com/2017/11/15/amazon-whole-foo...,2017-11-15T15:06:48Z,Amazon.com on Wednesday unveiled more discount...,1,0,1,2017-11-15,"Amazon.com, grocery products, Whole, Market, T..."
5,519,520,Target CEO: 'Stores are still important' for r...,More than half of Target's digital sales growt...,https://www.cnbc.com/2017/11/15/target-ceo-sto...,2017-11-15T15:26:00Z,Physical stores are still a vital component to...,1,0,1,2017-11-15,"Physical, vital component, retailers businesse..."
6,1053,1054,"US consumer prices rise marginally, core CPI f...",U.S. consumer prices barely rose in October as...,https://www.cnbc.com/2017/11/15/us-consumer-pr...,2017-11-15T13:31:00Z,The so-called core CPI gained 0.1 percent in S...,1,0,1,2017-11-15,"so-called core, year-on-year increase, CPI, ye..."
7,1136,1137,"Target Profits Are Hurting, But the Alternativ...",Paying now will help the retailer survive in t...,https://www.bloomberg.com/gadfly/articles/2017...,2017-11-15T15:39:53Z,There was much to cheer in Target Corp.'s thir...,1,0,1,2017-11-15,"Target third-quarter earnings results, compara..."
8,1158,1159,Amazon set for 'record-breaking' holiday seaso...,Almost half of all online shopping will be on ...,https://www.cnbc.com/2017/11/15/amazon-set-for...,2017-11-15T17:48:00Z,Almost half of all online shopping will be on ...,1,0,1,2017-11-15,"Amazon.com holiday season, Jeff Bezos commerci..."
9,1205,1206,"Target's turnaround is slow moving, but progre...",Target's third-quarter earnings beat was cloud...,https://www.cnbc.com/2017/11/15/targets-turnar...,2017-11-15T19:37:00Z,"For Target, its turnaround plan isn't yet hitt...",1,0,1,2017-11-15,"Target, turnaround plan, big-box retailer thir..."


In [14]:
def countWords(wordList):
    return dict(Counter(wordList))

In [158]:
def DictGroupBy(input):
    res = OrderedDict()
    for v, k in input.items():
        if k in res: res[k].append(v)
        else: res[k] = [v]
    return res

In [145]:
import string
stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "into", "through", "during", "before", "after", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
fullContext=[]
for i in dateFilteredDf.index:
    fullContext.append(dateFilteredDf['context'].iloc[i])    
articleUnigrams = []
# separates each word for each article => list of list
articleUnigrams.extend([term.split() for term in fullContext])
unigrams = []
# create 1 cohesive list for all unigrams
for article in articleUnigrams:
    unigrams.extend(article)
# remove stop words and punctuation
# This uses the 3-argument version of str.maketrans with arguments (x, y, z) where 'x' and 'y' must be equal-length strings and characters in 'x' are replaced by characters in 'y'. 'z' is a string (string.punctuation here) where each character in the string is mapped to None
translator = str.maketrans('', '', string.punctuation)
unigrams = [term.lower().translate(translator) for term in unigrams if term.lower() not in stopwords and len(term)>2]
# count frequency of terms
unigrams = countWords(unigrams)

In [146]:
fullContext

['Target price war, Wal-Mart Stores, Target, disappointing outlook, part investments, new brands, Executive Officer Brian Cornell, college campuses, multiyear plan, cheap-chic retailer, Speed, Wal-Mart, Taylor, results reflect impact short-term investments, long-term benefit, Moody, Charlie, O Shea said, Profit, Target tactical price investments, persistent market battles, Amazon, Wal-Mart, multiple categories., full-year profit, trailed analysts, average projection, Target, York, mid-June, Wal-Mart will give, reports third-quarter results, Gross, margins will continue, online orders, Financial, Officer Cathy Smith said, Target, now offering same-day delivery, York City stores, Restock, everyday household goods spans, Target, exclusive brands, New Day, Goodfellow, JoyLab, private labels, national brands, Brands, painful transition, signs payoff, same-store sales, average prediction, Consensus Metrix, online sales, Target, new apparel, home decor, back shoppers, recent years, U.S., Targ

In [147]:
articleUnigrams

[['Target',
  'price',
  'war,',
  'Wal-Mart',
  'Stores,',
  'Target,',
  'disappointing',
  'outlook,',
  'part',
  'investments,',
  'new',
  'brands,',
  'Executive',
  'Officer',
  'Brian',
  'Cornell,',
  'college',
  'campuses,',
  'multiyear',
  'plan,',
  'cheap-chic',
  'retailer,',
  'Speed,',
  'Wal-Mart,',
  'Taylor,',
  'results',
  'reflect',
  'impact',
  'short-term',
  'investments,',
  'long-term',
  'benefit,',
  'Moody,',
  'Charlie,',
  'O',
  'Shea',
  'said,',
  'Profit,',
  'Target',
  'tactical',
  'price',
  'investments,',
  'persistent',
  'market',
  'battles,',
  'Amazon,',
  'Wal-Mart,',
  'multiple',
  'categories.,',
  'full-year',
  'profit,',
  'trailed',
  'analysts,',
  'average',
  'projection,',
  'Target,',
  'York,',
  'mid-June,',
  'Wal-Mart',
  'will',
  'give,',
  'reports',
  'third-quarter',
  'results,',
  'Gross,',
  'margins',
  'will',
  'continue,',
  'online',
  'orders,',
  'Financial,',
  'Officer',
  'Cathy',
  'Smith',
  'said,'

In [148]:
unigrams

{'target': 68,
 'price': 20,
 'war': 3,
 'walmart': 17,
 'stores': 15,
 'disappointing': 1,
 'outlook': 3,
 'part': 3,
 'investments': 5,
 'new': 14,
 'brands': 12,
 'executive': 7,
 'officer': 5,
 'brian': 4,
 'cornell': 4,
 'college': 1,
 'campuses': 1,
 'multiyear': 1,
 'plan': 5,
 'cheapchic': 2,
 'retailer': 7,
 'speed': 1,
 'taylor': 2,
 'results': 3,
 'reflect': 3,
 'impact': 2,
 'shortterm': 3,
 'longterm': 2,
 'benefit': 2,
 'moody': 1,
 'charlie': 1,
 'shea': 1,
 'said': 7,
 'profit': 8,
 'tactical': 1,
 'persistent': 1,
 'market': 16,
 'battles': 1,
 'amazon': 88,
 'multiple': 2,
 'categories': 4,
 'fullyear': 3,
 'trailed': 1,
 'analysts': 4,
 'average': 6,
 'projection': 1,
 'york': 4,
 'midjune': 1,
 'give': 1,
 'reports': 2,
 'thirdquarter': 6,
 'gross': 2,
 'margins': 5,
 'continue': 1,
 'online': 12,
 'orders': 3,
 'financial': 7,
 'cathy': 3,
 'smith': 3,
 'offering': 2,
 'sameday': 2,
 'delivery': 5,
 'city': 1,
 'restock': 2,
 'everyday': 3,
 'household': 3,
 'goods

In [159]:
DictGroupBy(unigrams)

OrderedDict([(68, ['target']),
             (20, ['price', 'holiday']),
             (3,
              ['war',
               'outlook',
               'part',
               'results',
               'reflect',
               'shortterm',
               'fullyear',
               'orders',
               'cathy',
               'smith',
               'everyday',
               'household',
               'goods',
               'apparel',
               'success',
               'kids',
               'cat',
               'jack',
               'tgt',
               'estimate',
               'current',
               'adjusted',
               'stock',
               'front',
               'physical',
               'ceo',
               'christmas',
               'net',
               'past',
               'convenience',
               'fool',
               'declined',
               'comment',
               'cameras',
               'charge',
               'instore',
      

In [24]:
# partial stop words list used
context = [term for term in context]# if not (''in term ==True) and len(term.split()) > 1]
wordCount = countWords(context)
print("title: " + articleDf['title'].iloc[artNum])
print("description: " + articleDf['description'].iloc[artNum])
print("url: " + articleDf['url'].iloc[artNum])
print("content: " + content)
print("context:")
print([term for term, count in wordCount.items()])

title: Walmart Whistle-Blower Claims Cheating in Race with Amazon
description: In its race to catch Amazon.com Inc. in online retailing, Walmart Inc. issued misleading e-commerce results and fired an executive who complained the company was breaking the law, according to a whistle-blower lawsuit. Walmart shares fell as much as 2 percent.
url: https://www.bloomberg.com/news/articles/2018-03-15/walmart-whistle-blower-claims-retailer-cheated-to-catch-amazon
context:


In [200]:
wordCount

{'Dick Sporting Goods': 1,
 'seriously obligation': 1,
 'responsible seller firearms': 1,
 'background check': 1,
 'assault rifles': 1,
 'Florida high school massacre': 1,
 'slow sales': 1,
 'firearms industry': 1,
 'Lines parted ways': 1,
 'American voters': 1,
 'scope nation gun violence crisis': 1,
 'Gabrielle Giffords': 1,
 'gun control advocate': 1,
 'dangerous hands': 1,
 'mass shooting': 1}

# Consider:

## useful things:
* things that happened = past tense verbs (VBD)
* things currently happening = VBG
* things that could potentially happen = modal auxiliary (can, should, will) (MD)
* prepositions such as with (IN)

## useless things:
* names of writers, news sources, photographers
* URLs