# Content Extraction

The purpose of this code is to highlight key terms for articles that are determined to be "impactful". 

This step would be done after the article has been determined "impactful".

Resources:
http://vipulsharma20.blogspot.com/2017/03/sharingan-newspaper-text-and-context.html
https://github.com/vipul-sharma20/sharingan/blob/master/sharingan/summrizer/context.py
http://nltk.sourceforge.net/doc/en/ch03.html

In [6]:
import os
import sys
from pathlib import Path

# Data packages
import math
import pandas as pd
import numpy as np

#Progress bar
from tqdm import tqdm

#Counter
from collections import Counter

#Operation
import operator

#Natural Language Processing Packages
import re
import nltk

## Download Resources
nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.tag import PerceptronTagger
from nltk.data import find

## Machine Learning
import sklearn
import sklearn.metrics as metrics
from sklearn.feature_selection import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import datasets

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jadekhiev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
def importData():
    #Import Labelled Data
    DATA_DIR = "Data"
    thispath = Path().absolute()
    #dtype = {"index": str, "title": str, "description": str, "url": str, "date": str, "Retail Relevance": str, "Economy Relevant": str, "Market moving": str}
    RET_ARTICLES = os.path.join(DATA_DIR, "retailarticles-18-11-06.xlsx")

    
    df = pd.read_excel(RET_ARTICLES)

    try:
        df.head()
    except:
        pass
    return df

In [8]:
#def SelectFeaturesNP():
articleDf = importData()

In [None]:
# Part of Speech Tagging
# Google: https://en.wikipedia.org/wiki/Part-of-speech_tagging
tagger = PerceptronTagger()
pos_tag = tagger.tag

In [None]:
# This grammar is described in the paper by S. N. Kim,
# T. Baldwin, and M.-Y. Kan.
# Evaluating n-gram based evaluation metrics for automatic
# keyphrase extraction.
# Technical report, University of Melbourne, Melbourne 2010.
grammar = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
        
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""

In [None]:
# Create phrase tree
chunker = nltk.RegexpParser(grammar)

In [None]:
# Noun Phrase Extraction Support Functions
#from nltk.corpus import stopwords
#stopwords = stopwords.words('english')
stopwords = ["myself", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "him", "his", "himself", "she", "her", "hers", "herself", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "are", "was", "were", "been", "being", "have", "has", "had", "having", "does", "did", "doing", "the", "and", "but", "because", "until", "while", "for", "with", "about", "into", "through", "during", "before", "after", "from", "down", "out", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "not", "only", "own", "same", "than", "too", "very", "can", "will", "just", "don", "should", "now"]
lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

# generator, generate leaves one by one
def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label()=='NP' or t.label()=='JJ' or t.label()=='RB'):
        yield subtree.leaves()

# stemming, lematizing, lower case... 
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    word = word.lower()
    word = stemmer.stem(word)
    word = lemmatizer.lemmatize(word)
    return word

# stop-words and length control
def acceptable_word(word):
    """Checks conditions for acceptable word: length, stopword."""
    accepted = bool(2 <= len(word) <= 40
        and word.lower() not in stopwords)
    return accepted

# generator, create item once a time
def get_terms(tree):
    for leaf in leaves(tree):
        term = [normalise(w) for w,t in leaf if acceptable_word(w) ]
        # Phrase only
        if len(term)>1:
            yield term
            
# Flatten phrase lists to get tokens for analysis
def flatten(npTokenList):
    finalList =[]
    for phrase in npTokenList:
        token = ''
        for word in phrase:
            token += word + ' '
        finalList.append(token.rstrip())
    return finalList

In [77]:
"""
Utility functions for filtering content
originally written by: vipul-sharma20
modifications made by: jadekhiev
"""
from nltk import tokenize
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

stopwords = ["'s","i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "into", "through", "during", "before", "after", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]


def getWords(sentence):
    """
    Extracts words/tokens from a sentence
    :param sentence: (str) sentence
    :returns: list of tokens
    """
    words = word_tokenize(sentence)
    words = ([word for word in words if len(word) > 1 and word.lower() not in stopwords])
    print(words)
    return words


def getParagraphs(content):
    """
    Exctracts paragraphs from the the text content
    :param content: (str) text content
    :returns: list of paragraphs
    """
    paraList = content.split('\n\n')
    return paraList


def getSentences(paragraph):
    """
    Extracts sentences from a paragraph
    :param paragraph: (str) paragraph text
    :returns: list of sentences
    """
    indexed = {}
    sentenceList = tokenize.sent_tokenize(paragraph)
    for i, s in enumerate(sentenceList):
        indexed[i] = s
    return sentenceList, indexed

[nltk_data] Downloading package punkt to /Users/jadekhiev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
# -*- coding: utf-8 -*-

"""
Script to extract important topics from content
originally written by: vipul-sharma20
modifications made by: jadekhiev
"""

import nltk
#nltk.download('brown')
from nltk.corpus import brown

train = brown.tagged_sents(categories='news')

# backoff regex tagging
regex_tag = nltk.RegexpTagger([
     (r'^[-\:]?[0-9]+(.[0-9]+)?$', 'CD'),
     (r'.*able$', 'JJ'),
     (r'^[A-Z].*$', 'NNP'),
     (r'.*ly$', 'RB'),
     (r'.*s$', 'NNS'),
     (r'.*ing$', 'VBG'),
     (r'.*ed$', 'VBD'),
     (r'.[\'s]', 'UL'), #possessive/useless
     (r'[$][0-9]+\s[MmBbTt]\S+','DV'), #dollar value
     (r'.*', 'NN')
])

unigram_tag = nltk.UnigramTagger(train, backoff=regex_tag)
bigram_tag = nltk.BigramTagger(train, backoff=unigram_tag)
trigram_tag = nltk.TrigramTagger(train, backoff=bigram_tag)

# custom defined CFG by vipul
cfg = dict()
cfg['NNP+NNP'] = 'NNP'
cfg['NN+NN'] = 'NNI'
cfg['NNI+NN'] = 'NNI'
cfg['JJ+JJ'] = 'JJ'
cfg['JJ+NN'] = 'NNI'
# combination for monetary movement e.g. quarterly profit fell
cfg['RB+NN+VBD'] = 'NNP'

def get_info(content):
    sentences = getSentences(content)
    for sentence in sentences:
        words = getWords(content)
        temp_tags = trigram_tag.tag(words)
        tags = re_tag(temp_tags)
        #print(tags)
        normalized = True
        while normalized:
            normalized = False
            for i in range(0, len(tags) - 1):
                tagged1 = tags[i]
                if i+1 >= len(tags):
                    break
                tagged2 = tags[i+1]
                key = tagged1[1] + '+' + tagged2[1]
                #print(key)
                pos = cfg.get(key)
                
                if i+2 >= len(tags):
                    break
                tagged3 = tags[i+2]
                key2 = key + '+' + tagged3[1]
                #print(key2)
                pos2 = cfg.get(key)
                
                if pos2:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0] + ' ' + tagged3[0]
                    tags.insert(i, (re_tagged, pos2))
                    normalized = True
                
                elif pos:
                    tags.pop(i)
                    tags.pop(i)
                    re_tagged = tagged1[0] + ' ' + tagged2[0]
                    tags.insert(i, (re_tagged, pos))
                    normalized = True
                    
        #print(tags)
        final_context = []
        for tag in tags:
            if tag[1] == 'NNP' or tag[1] == 'NNI' or tag[1] == 'MON':
                final_context.append(tag[0])
        return final_context

def re_tag(tagged):
    new_tagged = []
    for tag in tagged:
        if tag[1] == 'NP' or tag[1] == 'NP-TL':
            new_tagged.append((tag[0], 'NNP'))
        elif tag[1][-3:] == '-TL':
            new_tagged.append((tag[0], tag[1][:-3]))
        elif tag[1][-1:] == 'S':
            new_tagged.append((tag[0], tag[1][:-1]))
        else:
            new_tagged.append((tag[0], tag[1]))
    return new_tagged

In [24]:
content = articleDf['content'].iloc[1]

In [35]:
content

"The drugstore chain's quarterly profit fell, but it raised the lower end of its profit forecast. Fred Katayama reports. Video provided by Reuters Newslook FILE - This June 4, 2014, file photo, shows a Walgreens retail store in Boston. Shares of AmerisourceBergen are soaring before the opening bell, Tuesday, Feb. 13, 2018, on reports that Walgreens is pursuing a complete takeover of the huge drug distributor. The Wall Street Journal is reporting that Walgreens CEO Stefano Pessina reached out to AmerisourceBergen Corp. with the potential deal. Walgreens already owns about 26 percent of the company. (AP Photo/Charles Krupa, File) ORG XMIT: NY109 (Photo: Charles Krupa, AP) Deal talks between Walgreens Boots Alliance and AmerisourceBergen about an acquisition of the wholesale drug distributor have ended without an agreement, according to people familiar with the matter. Walgreens Chief Executive Stefano Pessina and Amerisource Chief Executive Steven Collis met to discuss a potential tie-up

In [78]:
context = get_info(content)

['drugstore', 'chain', 'quarterly', 'profit', 'fell', 'raised', 'lower', 'end', 'profit', 'forecast', 'Fred', 'Katayama', 'reports', 'Video', 'provided', 'Reuters', 'Newslook', 'FILE', 'June', '2014', 'file', 'photo', 'shows', 'Walgreens', 'retail', 'store', 'Boston', 'Shares', 'AmerisourceBergen', 'soaring', 'opening', 'bell', 'Tuesday', 'Feb.', '13', '2018', 'reports', 'Walgreens', 'pursuing', 'complete', 'takeover', 'huge', 'drug', 'distributor', 'Wall', 'Street', 'Journal', 'reporting', 'Walgreens', 'CEO', 'Stefano', 'Pessina', 'reached', 'AmerisourceBergen', 'Corp.', 'potential', 'deal', 'Walgreens', 'already', 'owns', '26', 'percent', 'company', 'AP', 'Photo/Charles', 'Krupa', 'File', 'ORG', 'XMIT', 'NY109', 'Photo', 'Charles', 'Krupa', 'AP', 'Deal', 'talks', 'between', 'Walgreens', 'Boots', 'Alliance', 'AmerisourceBergen', 'acquisition', 'wholesale', 'drug', 'distributor', 'ended', 'without', 'agreement', 'according', 'people', 'familiar', 'matter', 'Walgreens', 'Chief', 'Execut

['drugstore chain quarterly',
 'end profit forecast',
 'Fred Katayama reports',
 'Video',
 'Reuters Newslook FILE FILE June 2014 2014',
 'file photo shows shows Walgreens',
 'Walgreens',
 'retail store Boston',
 'Boston Shares AmerisourceBergen AmerisourceBergen soaring',
 'Feb.',
 'Walgreens',
 'complete takeover huge',
 'huge drug distributor distributor Wall',
 'Wall',
 'Street Journal reporting',
 'Walgreens CEO Stefano Stefano Pessina reached reached',
 'AmerisourceBergen',
 'Corp. potential deal',
 'Walgreens',
 'percent company AP',
 'Photo/Charles Krupa File File ORG XMIT XMIT NY109 Photo XMIT NY109 Photo Photo Charles Krupa Krupa Krupa Krupa AP',
 'Deal',
 'Walgreens Boots Alliance',
 'AmerisourceBergen',
 'wholesale drug distributor distributor ended',
 'familiar matter Walgreens',
 'Walgreens',
 'Stefano Pessina Amerisource Amerisource Chief',
 'Steven Collis met',
 'potential tie-up early',
 'stage explorations ended',
 'agreement sources said',
 'Amerisource',
 'market cap

["drugstore chain 's 's quarterly",
 'Fred Katayama reports',
 'Video',
 'Reuters Newslook FILE FILE -',
 'June',
 'file photo ,',
 'Walgreens',
 'retail store in',
 'Boston',
 'Shares',
 'AmerisourceBergen',
 'opening bell ,',
 'Feb.',
 'Walgreens',
 'complete takeover of',
 'huge drug distributor distributor .',
 'Wall Street Journal Journal is',
 'Walgreens CEO Stefano Stefano Pessina reached reached',
 'AmerisourceBergen',
 'Walgreens',
 'AP Photo/Charles Krupa Krupa ,',
 'File',
 'ORG XMIT :',
 'NY109',
 'Photo',
 'Charles Krupa ,',
 'Deal',
 'Walgreens Boots Alliance',
 'AmerisourceBergen',
 'wholesale drug distributor distributor have',
 'Walgreens',
 'Stefano Pessina and',
 'Amerisource',
 'Steven Collis met',
 'potential tie-up ,',
 'early stage explorations explorations ended',
 'Amerisource',
 'market capitalization around',
 'Aetna',
 'corporate executives to',
 'new competitor .',
 'Wall Street Journal Journal reported',
 'Walgreens',
 'takeover approach to',
 'Amerisource