In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import re
from nltk.util import ngrams
import nltk
import csv
import string
import rake

In [2]:
# Network Parameters
def create_architecture(n_input, layers):
    """Build the TensorFlow graph of a ReLU multilayer perceptron with one sigmoid output.

    Variables for four hidden layers and four candidate output heads are always
    created; ``layers`` selects which hidden layer feeds the returned logit.

    Args:
        n_input: Width of the input vector. Must be a key of the hidden-size
            table below (10 or 1000).
        layers: Number of hidden layers to use (1, 2, 3 or 4).

    Returns:
        Tuple ``(x, y, cost, optimizer, predict_op)``:
        the input placeholder ``[None, n_input]``, the label placeholder
        ``[None, 1]``, the summed sigmoid cross-entropy, the Adam training op,
        and the sigmoid of the selected logit.

    Raises:
        KeyError: If ``n_input`` or ``layers`` is not one of the supported values.
    """
    # Hidden layer widths for each supported input width.
    hidden_features = {
        10: [7, 5, 3, 2],
        1000: [666, 444, 300, 200]
    }

    # Number of features in the 1st, 2nd, 3rd and 4th hidden layer.
    n_hidden_1, n_hidden_2, n_hidden_3, n_hidden_4 = hidden_features[n_input]
    n_classes = 1  # total classes (binary)

    def scaled_normal(shape, fan_total, op_name):
        """Create a variable drawn from N(0, 6 / (fan_total + 1))."""
        # The whole fan count belongs in the denominator; float literal keeps
        # the division exact on Python 2 as well.
        stddev = np.sqrt(6.0 / (fan_total + 1))
        # NOTE(review): `name` is given to the initializer op, not to
        # tf.Variable, so variables keep TensorFlow's default names. The
        # checkpoint restored elsewhere in this notebook was presumably saved
        # with those names -- do not move `name` or reorder creation.
        return tf.Variable(tf.random_normal(shape,
                                            mean=0,
                                            stddev=stddev,
                                            name=op_name))

    # Create model
    def multilayer_perceptron(layers, x, weights, biases):
        """Stack the four ReLU layers and return the logit of head ``layers``."""
        depths = (1, 2, 3, 4)
        activations = {}
        previous = x
        for depth in depths:
            # Hidden layer with RELU activation
            linear = tf.add(tf.matmul(previous, weights['h%d' % depth]),
                            biases['b%d' % depth])
            previous = tf.nn.relu(linear)
            activations[depth] = previous

        # Output layer with linear activation (one head per depth, shared bias)
        out_layer = dict(
            (depth,
             tf.matmul(activations[depth], weights['out%d' % depth]) + biases['out'])
            for depth in depths
        )
        return out_layer[layers]

    # Store layers weight & bias. Lists (not dicts) drive creation so the
    # order is h1..h4, out1..out4, b1..b4, out on every Python version.
    weight_specs = [
        ('h1', n_input, n_hidden_1, "h1"),
        ('h2', n_hidden_1, n_hidden_2, "h2"),
        ('h3', n_hidden_2, n_hidden_3, "h3"),
        ('h4', n_hidden_3, n_hidden_4, "h4"),
        ('out1', n_hidden_1, n_classes, "out"),
        ('out2', n_hidden_2, n_classes, "out"),
        ('out3', n_hidden_3, n_classes, "out"),
        ('out4', n_hidden_4, n_classes, "out"),
    ]
    weights = {}
    for key, fan_in, fan_out, op_name in weight_specs:
        weights[key] = scaled_normal([fan_in, fan_out], fan_in + fan_out, op_name)

    bias_specs = [
        ('b1', n_hidden_1, "b1"),
        ('b2', n_hidden_2, "b2"),
        ('b3', n_hidden_3, "b3"),
        ('b4', n_hidden_4, "b4"),
        ('out', n_classes, "biasout"),
    ]
    biases = {}
    for key, width, op_name in bias_specs:
        biases[key] = scaled_normal([width], width, op_name)

    # Parameters
    learning_rate = 0.001 # the alpha

    # tf Graph input
    x = tf.placeholder("float", [None, n_input])
    y = tf.placeholder("float", [None, n_classes])

    # Construct model
    pred = multilayer_perceptron(layers, x, weights, biases)

    # Define loss and optimizer
    cost = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(pred, y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    predict_op = tf.nn.sigmoid(pred)

    return x, y, cost, optimizer, predict_op


In [3]:
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
# Make sure the NLTK resources used below are available locally.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='punkt')
# Stop words = NLTK English list + the project list. The first line of
# stopwordlist.txt is skipped (presumably a header -- confirm). The file is
# opened in a `with` block so the handle is closed deterministically.
with open('stopwordlist.txt') as stopword_file:
    sw = set(stopwords.words("english")) | set(i.strip() for i in list(stopword_file)[1:])
# Vocabulary of individual words (longer than one character, not stop words)
# appearing in the 'term' column of the terms file.
terms_df = pd.read_csv('finalterms_grouped_clean2.csv')
single_terms = set(b for a in terms_df['term'] for b in nltk.word_tokenize(a) if (b not in sw and len(b)> 1))
# Drop the names that are no longer needed once the sets are built.
del stopwords
del terms_df
# Shared tokenizers / character sets used by the functions defined below.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
puncs = set(string.punctuation)
tokenizer = WhitespaceTokenizer()
def return_words_from_text(x):
    """Split text into sentences and tokenize each one on whitespace.

    The text is stripped first; all character offsets refer to the stripped
    text. For every sentence found by ``sent_detector`` the result holds a
    pair ``(tokens, spans)``:

    * ``tokens`` -- lower-cased whitespace tokens with every digit run
      replaced by ``SOMENUM`` and surrounding punctuation stripped;
    * ``spans`` -- ``(start, end)`` offsets of the whitespace tokens of the
      untouched sentence, shifted to positions in the stripped text.

    Returns:
        A tuple with one ``(tokens, spans)`` pair per sentence.
    """
    stripped = x.strip()

    def tokens_with_spans(begin, finish):
        sentence = stripped[begin:finish]
        normalized = re.sub(r'\d+', 'SOMENUM', sentence.lower())
        words = tuple(
            word.strip(string.punctuation)
            for word in tokenizer.tokenize(normalized)
        )
        offsets = tuple(
            (piece[0] + begin, piece[1] + begin)
            for piece in tokenizer.span_tokenize(sentence)
        )
        return (words, offsets)

    sentences = []
    for bounds in sent_detector.span_tokenize(stripped):
        sentences.append(tokens_with_spans(bounds[0], bounds[1]))
    return tuple(sentences)

def get_ngrams(x, ngram_start, ngram_end):
    """Collect the n-grams of every sentence together with their character spans.

    Args:
        x: Iterable of sentences, each a ``(tokens, spans)`` pair of equal
            length sequences (the shape produced by ``return_words_from_text``).
        ngram_start: Smallest n-gram size, inclusive.
        ngram_end: Largest n-gram size, inclusive.

    Returns:
        ``defaultdict(list)`` mapping each n-gram (a tuple of tokens with
        punctuation-only tokens removed) to the list of ``(start, end)`` spans
        where it occurs. N-grams that end up with no tokens, or that consist
        solely of ``SOMENUM`` placeholders, are skipped.
    """
    def get_ngram(sent, i):
        # Pair each token n-gram with (start of first token, end of last token).
        the_ngrams = list(ngrams(sent[0], i))
        span = [(s[0][0], s[-1][-1]) for s in ngrams(sent[1], i)]
        return list(zip(the_ngrams, span))

    def remove_punc(ngram):
        # Drop tokens made up only of punctuation (this includes empty tokens).
        kept = tuple(gram for gram in ngram[0] if not all(j in puncs for j in gram))
        return (kept, ngram[1])

    def is_num_only(tokens):
        # Compare against the placeholder token itself, not its characters.
        return set(tokens) == {'SOMENUM'}

    ngram_list = defaultdict(list)
    for sent in x:
        for i in range(ngram_start, ngram_end + 1):
            for tokens, span in map(remove_punc, get_ngram(sent, i)):
                # `tokens` (not the (tokens, span) pair) must be non-empty,
                # and a plain boolean `not` is needed: `~` on a bool is
                # always truthy.
                if tokens and not is_num_only(tokens):
                    ngram_list[tokens].append(span)
    return ngram_list

from gensim.models import Word2Vec
# Word2Vec embeddings loaded from a local file; get_vector treats every word
# vector as 100-dimensional.
model = Word2Vec.load("space_100features_5minwords_10context_300karticles.bin")
# Vocabulary as a set so get_vector can test membership cheaply.
allwords = set(model.index2word)
def get_vector(term):
    """Turn a sequence of words into one flat feature vector.

    Each word is lower-cased and looked up in the Word2Vec ``model``; words
    outside the vocabulary contribute 100 zeros. The per-word vectors are
    padded with zero blocks up to 10 words and concatenated, so terms of at
    most 10 words give a vector of 1000 values.
    """
    blocks = []
    for unigram in term:
        key = unigram.lower()
        if key in allwords:
            blocks.append(model[key])
        else:
            blocks.append(np.zeros(100))
    missing = 10 - len(blocks)
    blocks.extend(np.zeros(100) for _ in range(missing))
    return np.concatenate(blocks, axis=0)

# RAKE keyword extractor built from the project stop-word file; its results are
# shown next to the model's highlights in my_form_post.
myrake = rake.Rake('stopwordlist.txt')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from flask import Flask
from flask import request
from flask import render_template
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
    
# Flask application object; the route handlers below are registered on it.
app = Flask(__name__)

@app.route('/')
def my_form():
    """Serve the input form with an empty result area."""
    empty_result = ""
    page = render_template("my_form.html", result=empty_result)
    return page

def _escape_html(text):
    """Return ``text`` with HTML special characters (&, <, >, quotes) escaped."""
    try:
        from html import escape  # Python 3
    except ImportError:
        from cgi import escape  # Python 2 fallback
    return escape(text, quote=True)


def _highlight_html(text, mask):
    """Render ``text`` as HTML, wrapping each maximal run where ``mask`` is 1 in a dark-red <b>."""
    pieces = []
    total = len(text)
    run_start = 0
    while run_start < total:
        flagged = mask[run_start] == 1
        run_end = run_start + 1
        while run_end < total and (mask[run_end] == 1) == flagged:
            run_end += 1
        # Escape the user's text before it is placed inside markup.
        chunk = _escape_html(text[run_start:run_end])
        if flagged:
            chunk = '<b style="color:DarkRed">{}</b>'.format(chunk)
        pieces.append(chunk)
        run_start = run_end
    return "".join(pieces)


@app.route('/', methods=['POST'])
def my_form_post():
    """Handle a submitted text: highlight model-selected n-grams and list RAKE keywords.

    Reads the ``text`` form field, runs RAKE on it, classifies the text's
    n-grams with the restored network, and renders ``my_form.html`` with two
    HTML fragments: ``result`` (the text with positive n-grams in dark red)
    and ``rakeresult`` (a list of RAKE phrases whose second element exceeds 1).

    The graph is rebuilt and the checkpoint restored on every request.
    User-derived text is HTML-escaped before being placed in the fragments
    (the template presumably renders them without auto-escaping -- confirm).
    """
    # Only n-grams with fewer tokens than this are ever highlighted.
    max_highlight_tokens = 5

    new_text = request.form['text'].strip()

    # Baseline keywords from RAKE.
    rake_kw = [kw[0] for kw in myrake.run(new_text) if kw[1] > 1]

    ngram_of_article = get_ngrams(return_words_from_text(new_text), 1, 10)

    tf.reset_default_graph()
    x, y, cost, optimizer, predict_op = create_architecture(1000, 3)
    saver = tf.train.Saver()

    # One flag per character of the text; 1 marks characters to highlight.
    bold_range = np.zeros(len(new_text))
    with tf.Session() as sess:
        saver.restore(sess, "bestmodel_10_new.ckpt")
        print("Model restored.")

        for gram, spans in ngram_of_article.items():
            # Longer n-grams are never highlighted, so skip the network call.
            if len(gram) >= max_highlight_tokens:
                continue
            features = get_vector(gram)
            # Exact zeros are replaced by a tiny constant before prediction.
            features = np.where(features == 0, 0.000001, features)
            score = sess.run(predict_op, feed_dict={x: [features]})[0][0]
            if score >= 0.5:
                for span in spans:
                    bold_range[span[0]:span[1]] = 1

    html_string = (
        "<br><h4>Business Variable from proposed algorithm highlighted in dark red:</h4>\n"
        "    <br>"
    ) + _highlight_html(new_text, bold_range)

    rake_items = "".join("<li>{}</li>".format(_escape_html(kw)) for kw in rake_kw)
    rake_list_html = (
        "<h4>Results from conventional keyword extraction algorithm - "
        "[Rapid Automatic Keyword Extraction, RAKE] (Rose et. al. 2010):</h4>\n"
        "    <br><ul>{}</ul>"
    ).format(rake_items)

    return render_template("my_form.html", result=html_string, rakeresult=rake_list_html)

# Start the Flask server only when this code runs as the main module.
# NOTE(review): host '0.0.0.0' with port 9999 -- the captured log below shows a
# request from an external address; confirm this exposure is intended.
if __name__ == '__main__':
    app.run('0.0.0.0',port=9999)

 * Running on http://0.0.0.0:9999/ (Press CTRL+C to quit)
183.171.170.220 - - [31/Oct/2016 03:23:30] "GET / HTTP/1.1" 200 -


Model restored.


183.171.170.220 - - [31/Oct/2016 03:24:08] "POST / HTTP/1.1" 200 -


Model restored.


183.171.170.220 - - [31/Oct/2016 03:25:23] "POST / HTTP/1.1" 200 -


In [None]:
# NOTE(review): in the visible cells `html_string` is only assigned inside
# my_form_post, so this bare expression presumably raises NameError on a fresh
# kernel -- confirm or remove this cell.
html_string

In [18]:
# Run the RAKE extractor on a sample news article; the result is filtered in the next cell.
output = myrake.run("""
Officials from OPEC and non-member oil producing countries met on Saturday aiming to build support for an OPEC plan to reduce output one day after OPEC members were unable to agreed on how to implement the deal.

Arriving for the meeting with OPEC's High Level Committee of exporters, only the representative of non-OPEC Azerbaijan made comments supportive of the need for producer action to help prop up prices.

"Today we will discuss the recognized positions of countries, first of all the OPEC countries," Azerbaijan's energy minister Natig Aliyev told reporters outside OPEC's headquarters.

"Just one week ago we met with the president of Venezuela," he added, in reference to the south American OPEC member which has been pushing for measures to support prices.

"Venezuela and Azerbaijan agree that some measures will be taken to stabilize the market. We agreed the price of oil can be around $60 per barrel."

Oil LCOc1 is trading closer to $50 a barrel, less than half its price of mid-2014, weighed down by persistent oversupply and squeezing the incomes of exporting nations.

Other non-OPEC officials did not mention joint producer action.

The deputy minister for Kazakhstan, asked what he hoped the meeting would achieve, said: "We just hope the price will react and it will increase."

Brazil's representative said his country was attending only as an observer.

"Brazilian production will increase in the next few years," said Brazilian official Marcio Felix.

Russia, which is one of the world's top producers and has been supporting action with OPEC to prop up prices, is also attending the meeting, so far without making public comment in Vienna.

Two OPEC sources said Russian energy officials told the gathering that Moscow was still willing to freeze its output levels if OPEC agreed to cap its production.

"Russia is ready but they want to see in detail figures agreed for yesterday," one of the sources said. Another source said Russia would freeze if OPEC agreed to reduce output.""")

In [19]:
# Show the first element of each RAKE result whose second element (presumably
# the RAKE score) is above 1 -- the same filter my_form_post applies.
[kw[0] for kw in output if kw[1] >1]

['energy minister natig aliyev told reporters',
 'non-opec azerbaijan made comments supportive',
 'non-member oil producing countries met',
 'russian energy officials told',
 'brazilian official marcio felix',
 'south american opec member',
 'mention joint producer action',
 'high level committee',
 'making public comment',
 'detail figures agreed',
 'deputy minister',
 'non-opec officials',
 'producer action',
 'azerbaijan agree',
 'oil lcoc1',
 'supporting action',
 'brazilian production',
 'opec countries',
 'build support',
 'saturday aiming',
 'exporting nations',
 'reduce output',
 'trading closer',
 'top producers',
 'recognized positions',
 'output levels',
 'persistent oversupply',
 'week ago',
 'opec plan',
 'opec members',
 'opec agreed',
 'support prices',
 'opec sources',
 'met',
 'countries',
 'azerbaijan',
 'oil',
 'officials',
 'opec',
 'agreed',
 'sources',
 'production',
 'prices']